X-Git-Url: https://git.immae.eu/?a=blobdiff_plain;ds=sidebyside;f=Pipes%2FText%2FEncoding.hs;h=e24241132b3ae13f43a608922a461e8287386617;hb=bdc47ebc7bd24c7b123867072c825d42d26ca536;hp=e6fc6bfdabcf53fd077dc58262ac662afd965e75;hpb=02f89dfe9b4787fbad5f3740ed1626203c474a2b;p=github%2Ffretlink%2Ftext-pipes.git diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs index e6fc6bf..e242411 100644 --- a/Pipes/Text/Encoding.hs +++ b/Pipes/Text/Encoding.hs @@ -1,17 +1,25 @@ {-# LANGUAGE RankNTypes, BangPatterns #-} --- | This module uses the stream decoding functions from Michael Snoyman's new --- --- package to define decoding functions and lenses. +-- | This module uses the stream decoding functions from +-- +-- package to define decoding functions and lenses. The exported names +-- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@ module Pipes.Text.Encoding ( - -- * The Lens or Codec type + -- * Decoding ByteStrings and Encoding Texts + -- ** Simple usage + -- $usage + + -- ** Lens usage -- $lenses + + + -- * Basic lens operations Codec - -- * Viewing the Text in a ByteString - -- $codecs , decode + , eof + -- * Decoding lenses , utf8 , utf8Pure , utf16LE @@ -45,48 +53,237 @@ module Pipes.Text.Encoding import Data.Functor.Constant (Constant(..)) import Data.Char (ord) import Data.ByteString as B -import Data.ByteString (ByteString) import Data.ByteString.Char8 as B8 import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Encoding as TE -import Data.Text.StreamDecoding -import Control.Monad (join) -import Data.Word (Word8) +import qualified Data.Streaming.Text as Stream +import Data.Streaming.Text (DecodeResult(..)) +import Control.Monad (join, liftM) import Pipes -type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) -{- $lenses - The 'Codec' type is a simple specializion of - the @Lens'@ type synonymn used by the standard lens libraries, - and - . That type, +{- $usage + Encoding is of course simple. Given + +> text :: Producer Text IO () + + we can encode it with @Data.Text.Encoding.encodeUtf8@ + +> TE.encodeUtf8 :: Text -> ByteString + + and ordinary pipe operations: + +> text >-> P.map TE.encodeUtf8 :: Producer.ByteString IO () + + or, equivalently + +> for text (yield . TE.encodeUtf8) + + But, using this module, we might use + +> encodeUtf8 :: Text -> Producer ByteString m () + + to write + +> for text encodeUtf8 :: Producer.ByteString IO () + + All of the above come to the same. + + + Given + +> bytes :: Producer ByteString IO () + + we can apply a decoding function from this module: + +> decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ()) + + The Text producer ends wherever decoding first fails. The un-decoded + material is returned. If we are confident it is of no interest, we can + write: + +> void $ decodeUtf8 bytes :: Producer Text IO () + + Thus we can re-encode + as uft8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g. + +> for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ()) + The bytestring producer that is returned begins with where utf16BE decoding + failed; if it didn't fail the producer is empty. + +-} + +{- $lenses + We get a bit more flexibility, particularly in the use of pipes-style "parsers", + if we use a lens like @utf8@ or @utf16BE@ + that focusses on the text in an appropriately encoded byte stream. + > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) - is just an alias for an ordinary Prelude type. Thus you use any codec with - the @view@ / @(^.)@ and @zoom@ functions from those libraries. + is just an alias for a Prelude type. We abbreviate this further, for our use case, as +> type Codec +> = forall m r . Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) + + and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can + re-encode what they have decoded. Thus you use any particular codec with + the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries; + , + , + , or one of the + and packages will all work + the same, since we already have access to the types they require. + + Each decoding lens looks into a byte stream that is supposed to contain text. + The particular lenses are named in accordance with the expected + encoding, 'utf8', 'utf16LE' etc. To turn a such a lens or @Codec@ + into an ordinary function, use @view@ / @(^.)@ -- here also called 'decode': + +> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) +> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) +> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) + + Of course, we could always do this with the specialized decoding functions, e.g. + +> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) +> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) + + As with these functions, the stream of text that a @Codec@ \'sees\' + in the stream of bytes begins at its head. + At any point of decoding failure, the stream of text ends and reverts to (returns) + the original byte stream. Thus if the first bytes are already + un-decodable, the whole ByteString producer will be returned, i.e. + +> view utf8 bad_bytestream + + will just come to the same as + +> return bad_bytestream + + Where there is no decoding failure, the return value of the text stream will be + an empty byte stream followed by its own return value. In all cases you must + deal with the fact that it is a /ByteString producer/ that is returned, even if + it can be thrown away with @Control.Monad.void@ + +> void (Bytes.stdin ^. utf8) :: Producer Text IO () + + The @eof@ lens permits you to pattern match: if there is a Right value, + it is the leftover bytestring producer, if there is a Right value, it + is the return value of the original bytestring producer: + +> Bytes.stdin ^. utf8 . eof :: Producer Text IO (Either (Producer ByteString IO IO) ()) + + Thus for the stream of un-decodable bytes mentioned above, + +> view (utf8 . eof) bad_bytestream + + will be the same as + +> return (Left bad_bytestream) + + @zoom utf8@ converts a Text parser into a ByteString parser: + +> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) + + or, using the type synonymn from @Pipes.Parse@: + +> zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char) + + Thus we can define a ByteString parser (in the pipes-parse sense) like this: + +> charPlusByte :: Parser ByteString m (Maybe Char, Maybe Word8))) +> charPlusByte = do char_ <- zoom utf8 Text.drawChar +> byte_ <- Bytes.peekByte +> return (char_, byte_) + + Though @charPlusByte@ is partly defined with a Text parser 'drawChar'; + but it is a ByteString parser; it will return the first valid utf8-encoded + Char in a ByteString, /whatever its byte-length/, + and the first byte following, if both exist. Because + we \'draw\' one and \'peek\' at the other, the parser as a whole only + advances one Char's length along the bytestring, whatever that length may be. + See the slightly more complex example \'decode.hs\' in the + + discussion of this type of byte stream parsing. -} +type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) + type Codec = forall m r . Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) - -{- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries; - exported here for convience -> decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf + +{- | @decode@ is just the ordinary @view@ or @(^.)@ of the lens libraries; + exported here under a name appropriate to the material. Thus + +> decode utf8 bytes :: Producer Text IO (Producer ByteString IO ()) + + All of these are thus the same: + +> decode utf8 bytes = view utf8 bytes = bytes ^. utf8 = decodeUtf8 bytes + -} decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b decode codec a = getConstant (codec Constant a) +{- | @eof@ tells you explicitly when decoding stops due to bad bytes or + instead reaches end-of-file happily. (Without it one just makes an explicit + test for emptiness of the resulting bytestring production using next) Thus + +> decode (utf8 . eof) bytes :: Producer T.Text IO (Either (Producer B.ByteString IO ()) ()) + + If we hit undecodable bytes, the remaining bytestring producer will be + returned as a Left value; in the happy case, a Right value is returned + with the anticipated return value for the original bytestring producer. + + Again, all of these are the same + +> decode (utf8 . eof) bytes = view (utf8 . eof) p = p^.utf8.eof + +-} + +eof :: (Monad m, Monad (t m), MonadTrans t) => Lens' (t m (Producer ByteString m r)) + (t m (Either (Producer ByteString m r) r)) +eof k p0 = fmap fromEither (k (toEither p0)) where + + fromEither = liftM (either id return) + + toEither pp = do p <- pp + check p + + check p = do e <- lift (next p) + case e of + Left r -> return (Right r) + Right (bs,pb) -> if B.null bs + then check pb + else return (Left (do yield bs + pb)) + +utf8 :: Codec +utf8 = mkCodec decodeUtf8 TE.encodeUtf8 + +utf8Pure :: Codec +utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 + +utf16LE :: Codec +utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE + +utf16BE :: Codec +utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE + +utf32LE :: Codec +utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE + +utf32BE :: Codec +utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE decodeStream :: Monad m => (B.ByteString -> DecodeResult) @@ -94,15 +291,17 @@ decodeStream :: Monad m decodeStream = loop where loop dec0 p = do x <- lift (next p) - case x of Left r -> return (return r) - Right (chunk, p') -> case dec0 chunk of - DecodeResultSuccess text dec -> do yield text - loop dec p' - DecodeResultFailure text bs -> do yield text - return (do yield bs - p') + case x of + Left r -> return (return r) + Right (chunk, p') -> case dec0 chunk of + DecodeResultSuccess text dec -> do yield text + loop dec p' + DecodeResultFailure text bs -> do yield text + return (do yield bs + p') {-# INLINABLE decodeStream#-} + {- $decoders These are functions with the simple type: @@ -120,27 +319,27 @@ decodeStream = loop where decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeUtf8 = decodeStream streamUtf8 +decodeUtf8 = decodeStream Stream.decodeUtf8 {-# INLINE decodeUtf8 #-} decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeUtf8Pure = decodeStream streamUtf8Pure +decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure {-# INLINE decodeUtf8Pure #-} decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeUtf16LE = decodeStream streamUtf16LE +decodeUtf16LE = decodeStream Stream.decodeUtf16LE {-# INLINE decodeUtf16LE #-} decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeUtf16BE = decodeStream streamUtf16BE +decodeUtf16BE = decodeStream Stream.decodeUtf16BE {-# INLINE decodeUtf16BE #-} decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeUtf32LE = decodeStream streamUtf32LE +decodeUtf32LE = decodeStream Stream.decodeUtf32LE {-# INLINE decodeUtf32LE #-} decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeUtf32BE = decodeStream streamUtf32BE +decodeUtf32BE = decodeStream Stream.decodeUtf32BE {-# INLINE decodeUtf32BE #-} @@ -160,15 +359,15 @@ decodeUtf32BE = decodeStream streamUtf32BE using the encoding functions from Data.Text.Encoding -} -encodeUtf8 :: Monad m => Text -> Producer ByteString m () +encodeUtf8 :: Monad m => Text -> Producer' ByteString m () encodeUtf8 = yield . TE.encodeUtf8 -encodeUtf16LE :: Monad m => Text -> Producer ByteString m () +encodeUtf16LE :: Monad m => Text -> Producer' ByteString m () encodeUtf16LE = yield . TE.encodeUtf16LE -encodeUtf16BE :: Monad m => Text -> Producer ByteString m () +encodeUtf16BE :: Monad m => Text -> Producer' ByteString m () encodeUtf16BE = yield . TE.encodeUtf16BE -encodeUtf32LE :: Monad m => Text -> Producer ByteString m () +encodeUtf32LE :: Monad m => Text -> Producer' ByteString m () encodeUtf32LE = yield . TE.encodeUtf32LE -encodeUtf32BE :: Monad m => Text -> Producer ByteString m () +encodeUtf32BE :: Monad m => Text -> Producer' ByteString m () encodeUtf32BE = yield . TE.encodeUtf32BE mkCodec :: (forall r m . Monad m => @@ -178,75 +377,6 @@ mkCodec :: (forall r m . Monad m => mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) -{- $codecs - - Each codec/lens looks into a byte stream that is supposed to contain text. - The particular \'Codec\' lenses are named in accordance with the expected - encoding, 'utf8', 'utf16LE' etc. @view@ / @(^.)@ -- here also called 'decode' -- - turns a Codec into a function: - -> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) -> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) -> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) - - Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized - decoding functions exported here, e.g. - -> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) -> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) - - The stream of text a @Codec@ \'sees\' in the stream of bytes begins at its head. - At any point of decoding failure, the stream of text ends and reverts to (returns) - the original byte stream. Thus if the first bytes are already - un-decodable, the whole ByteString producer will be returned, i.e. - -> view utf8 bytestream - - will just come to the same as - -> return bytestream - - Where there is no decoding failure, the return value of the text stream will be - an empty byte stream followed by its own return value. In all cases you must - deal with the fact that it is a ByteString producer that is returned, even if - it can be thrown away with @Control.Monad.void@ - -> void (Bytes.stdin ^. utf8) :: Producer Text IO () - - @zoom@ converts a Text parser into a ByteString parser: - -> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) -> -> withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8))) -> withNextByte = do char_ <- zoom utf8 Text.drawChar -> byte_ <- Bytes.peekByte -> return (char_, byte_) - - @withNextByte@ will return the first valid Char in a ByteString, - and the first byte of the next character, if they exists. Because - we \'draw\' one and \'peek\' at the other, the parser as a whole only - advances one Char's length along the bytestring. - - -} - -utf8 :: Codec -utf8 = mkCodec decodeUtf8 TE.encodeUtf8 - -utf8Pure :: Codec -utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 - -utf16LE :: Codec -utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE - -utf16BE :: Codec -utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE - -utf32LE :: Codec -utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE - -utf32BE :: Codec -utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE - {- $ascii ascii and latin encodings only use a small number of the characters 'Text'