From: michaelt Date: Tue, 4 Feb 2014 23:29:15 +0000 (-0500) Subject: finished draft of codec lens, ascii and iso8859 are not codecs, so they are given... X-Git-Url: https://git.immae.eu/?a=commitdiff_plain;h=9018941435a48aa5437981dfdb1377aa14b13159;hp=4cbc92cc93073d1a5b99a03ad802f710d0205994;p=github%2Ffretlink%2Ftext-pipes.git finished draft of codec lens, ascii and iso8859 are not codecs, so they are given special functions --- diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 9ed0d78..18ec8ec 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs @@ -123,8 +123,23 @@ module Pipes.Text ( , group , word , line + + -- * Decoding Lenses , decodeUtf8 - , decode + , codec + + -- * Codecs + , utf8 + , utf16_le + , utf16_be + , utf32_le + , utf32_be + + -- * Other Decoding/Encoding Functions + , decodeIso8859_1 + , decodeAscii + , encodeIso8859_1 + , encodeAscii -- * FreeT Splitters , chunksOf @@ -170,6 +185,7 @@ import Data.Text.Lazy.Internal (foldrChunks, defaultChunkSize) import Data.ByteString.Unsafe (unsafeTake, unsafeDrop) import Data.ByteString (ByteString) import qualified Data.ByteString as B +import qualified Data.ByteString.Char8 as B8 import Data.Char (ord, isSpace) import Data.Functor.Constant (Constant(Constant, getConstant)) import Data.Functor.Identity (Identity) @@ -181,7 +197,7 @@ import qualified GHC.IO.Exception as G import Pipes import qualified Pipes.ByteString as PB import qualified Pipes.Text.Internal as PE -import Pipes.Text.Codec (Codec(..)) +import Pipes.Text.Codec import Pipes.Core (respond, Server') import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) import qualified Pipes.Group as PG @@ -1073,43 +1089,114 @@ unwords = intercalate (yield $ T.singleton ' ') @Pipes.Parse@ re-exports 'input', 'concat', 'FreeT' (the type) and the 'Parse' synonym. -} - - -decode :: Monad m => PE.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) --- decode codec = go B.empty where --- go extra p0 = --- do x <- lift (next p0) --- case x of Right (chunk, p) -> --- do let (text, stuff) = codecDecode codec (B.append extra chunk) --- yield text --- case stuff of Right extra' -> go extra' p --- Left (exc,bs) -> do yield text --- return (do yield bs --- p) --- Left r -> return (do yield extra --- return r) - -decode d p0 = case d of - PE.Other txt bad -> do yield txt - return (do yield bad - p0) - PE.Some txt extra dec -> do yield txt - x <- lift (next p0) - case x of Left r -> return (do yield extra - return r) - Right (chunk,p1) -> decode (dec chunk) p1 - --- go !carry dec0 p = do --- x <- lift (next p) --- case x of Left r -> if B.null carry --- then return (return r) -- all bytestrinput was consumed --- else return (do yield carry -- a potentially valid fragment remains --- return r) --- --- Right (chunk, p') -> case dec0 chunk of --- PE.Some text carry2 dec -> do yield text --- go carry2 dec p' --- PE.Other text bs -> do yield text --- return (do yield bs -- an invalid blob remains --- p') --- {-# INLINABLE decodeUtf8 #-} \ No newline at end of file +codec :: Monad m => Codec -> Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) +codec (Codec _ enc dec) k p0 = fmap (\p -> join (for p (yield . fst . enc))) + (k (decoder (dec B.empty) p0) ) where + decoder :: Monad m => PE.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) + decoder !d p0 = case d of + PE.Other txt bad -> do yield txt + return (do yield bad + p0) + PE.Some txt extra dec -> do yield txt + x <- lift (next p0) + case x of Left r -> return (do yield extra + return r) + Right (chunk,p1) -> decoder (dec chunk) p1 + +-- decodeUtf8 k p0 = fmap (\p -> join (for p (yield . TE.encodeUtf8))) +-- (k (go B.empty PE.streamDecodeUtf8 p0)) where + +encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) +encodeAscii = go where + go p = do echunk <- lift (next p) + case echunk of + Left r -> return (return r) + Right (chunk, p') -> + if T.null chunk + then go p' + else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk + in do yield (B8.pack (T.unpack safe)) + if T.null unsafe + then go p' + else return $ do yield unsafe + p' + +encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) +encodeIso8859_1 = go where + go p = do etxt <- lift (next p) + case etxt of + Left r -> return (return r) + Right (txt, p') -> + if T.null txt + then go p' + else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt + in do yield (B8.pack (T.unpack safe)) + if T.null unsafe + then go p' + else return $ do yield unsafe + p' + +decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeAscii = go where + go p = do echunk <- lift (next p) + case echunk of + Left r -> return (return r) + Right (chunk, p') -> + if B.null chunk + then go p' + else let (safe, unsafe) = B.span (<= 0x7F) chunk + in do yield (T.pack (B8.unpack safe)) + if B.null unsafe + then go p' + else return $ do yield unsafe + p' + + +decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeIso8859_1 = go where + go p = do echunk <- lift (next p) + case echunk of + Left r -> return (return r) + Right (chunk, p') -> + if B.null chunk + then go p' + else let (safe, unsafe) = B.span (<= 0xFF) chunk + in do yield (T.pack (B8.unpack safe)) + if B.null unsafe + then go p' + else return $ do yield unsafe + p' + + + +{- + ascii :: Codec + ascii = Codec name enc (toDecoding dec) where + name = T.pack "ASCII" + enc text = (bytes, extra) where + (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text + bytes = B8.pack (T.unpack safe) + extra = if T.null unsafe + then Nothing + else Just (EncodeException ascii (T.head unsafe), unsafe) + + dec bytes = (text, extra) where + (safe, unsafe) = B.span (<= 0x7F) bytes + text = T.pack (B8.unpack safe) + extra = if B.null unsafe + then Right B.empty + else Left (DecodeException ascii (B.head unsafe), unsafe) + + iso8859_1 :: Codec + iso8859_1 = Codec name enc (toDecoding dec) where + name = T.pack "ISO-8859-1" + enc text = (bytes, extra) where + (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text + bytes = B8.pack (T.unpack safe) + extra = if T.null unsafe + then Nothing + else Just (EncodeException iso8859_1 (T.head unsafe), unsafe) + + dec bytes = (T.pack (B8.unpack bytes), Right B.empty) +-} + \ No newline at end of file diff --git a/Pipes/Text/Codec.hs b/Pipes/Text/Codec.hs index e4357b9..070b0d9 100644 --- a/Pipes/Text/Codec.hs +++ b/Pipes/Text/Codec.hs @@ -1,11 +1,9 @@ -{-# LANGUAGE DeriveDataTypeable, RankNTypes #-} +{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-} -- | -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin -- License: MIT -- --- Handle streams of text. --- -- Parts of this code were taken from enumerator and conduits, and adapted for pipes. module Pipes.Text.Codec @@ -15,6 +13,10 @@ module Pipes.Text.Codec , Codec(..) , TextException(..) , utf8 + , utf16_le + , utf16_be + , utf32_le + , utf32_be ) where import Data.Bits ((.&.)) @@ -37,7 +39,7 @@ import Data.Typeable import Control.Arrow (first) import Data.Maybe (catMaybes) import Pipes.Text.Internal - +import Pipes -- | A specific character encoding. -- -- Since 0.3.0 @@ -62,10 +64,12 @@ instance Exc.Exception TextException toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) -> (ByteString -> Decoding) toDecoding op = loop B.empty where - loop extra bs0 = case op (B.append extra bs0) of - (txt, Right bs) -> Some txt bs (loop bs) - (txt, Left (_,bs)) -> Other txt bs - + loop !extra bs0 = case op (B.append extra bs0) of + (txt, Right bs) -> Some txt bs (loop bs) + (txt, Left (_,bs)) -> Other txt bs +-- To do: toDecoding should be inlined in each of the 'Codec' definitions +-- or else Codec changed to the conduit/enumerator definition. We have +-- altered it to use 'streamDecodeUtf8' splitSlowly :: (ByteString -> Text) -> ByteString @@ -87,8 +91,7 @@ utf8 :: Codec utf8 = Codec name enc (toDecoding dec) where name = T.pack "UTF-8" enc text = (TE.encodeUtf8 text, Nothing) - dec bytes = case decodeSomeUtf8 bytes of - (t,b) -> (t, Right b) + dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b) -- -- Whether the given byte is a continuation byte. -- isContinuation byte = byte .&. 0xC0 == 0x80 @@ -202,34 +205,6 @@ utf32SplitBytes dec bytes = split where then (bytes, B.empty) else B.splitAt lenToDecode bytes -ascii :: Codec -ascii = Codec name enc (toDecoding dec) where - name = T.pack "ASCII" - enc text = (bytes, extra) where - (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text - bytes = B8.pack (T.unpack safe) - extra = if T.null unsafe - then Nothing - else Just (EncodeException ascii (T.head unsafe), unsafe) - - dec bytes = (text, extra) where - (safe, unsafe) = B.span (<= 0x7F) bytes - text = T.pack (B8.unpack safe) - extra = if B.null unsafe - then Right B.empty - else Left (DecodeException ascii (B.head unsafe), unsafe) - -iso8859_1 :: Codec -iso8859_1 = Codec name enc (toDecoding dec) where - name = T.pack "ISO-8859-1" - enc text = (bytes, extra) where - (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text - bytes = B8.pack (T.unpack safe) - extra = if T.null unsafe - then Nothing - else Just (EncodeException iso8859_1 (T.head unsafe), unsafe) - - dec bytes = (T.pack (B8.unpack bytes), Right B.empty) tryEvaluate :: a -> Either Exc.SomeException a tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate