1 {-# LANGUAGE RankNTypes, BangPatterns #-}
4 -- This module uses the stream decoding functions from the text-stream-decoding package
5 -- to define decoding functions and lenses.
7 module Pipes.Text.Encoding
12 -- * Standard lenses for viewing Text in ByteString
20 -- * Non-lens decoding functions
27 -- * Functions for latin and ascii text
36 import Data.Char (ord)
37 import Data.ByteString as B
38 import Data.ByteString (ByteString)
39 import Data.ByteString.Char8 as B8
40 import Data.Text (Text)
41 import qualified Data.Text as T
42 import qualified Data.Text.Encoding as TE
43 import Data.Text.StreamDecoding
44 import Control.Monad (join)
45 import Data.Word (Word8)
49 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
52 The 'Codec' type is just an aliased standard Prelude type. It just specializes
53 the @Lens'@ type synonymn used by the standard lens libraries, @lens@ and
54 @lens-families@ . You use them with
55 the @view@ or @(^.)@ and @zoom@ functions from those libraries.
57 Each codec lens looks into a byte stream that is understood to contain text.
58 The stream of text it 'sees' in the stream of bytes begins at its head; it ends
59 by reverting to (returning) the original byte stream
60 beginning at the point of decoding failure. Where there is no decoding failure,
61 it returns an empty byte stream with its return value.
67 => Lens' (Producer ByteString m r)
68 (Producer Text m (Producer ByteString m r))
70 decodeStream :: Monad m
71 => (B.ByteString -> DecodeResult)
72 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
73 decodeStream = loop where
76 case x of Left r -> return (return r)
77 Right (chunk, p') -> case dec0 chunk of
78 DecodeResultSuccess text dec -> do yield text
80 DecodeResultFailure text bs -> do yield text
83 {-# INLINABLE decodeStream#-}
88 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
89 decodeUtf8 = decodeStream streamUtf8
90 {-# INLINE decodeUtf8 #-}
92 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
93 decodeUtf8Pure = decodeStream streamUtf8Pure
94 {-# INLINE decodeUtf8Pure #-}
96 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
97 decodeUtf16LE = decodeStream streamUtf16LE
98 {-# INLINE decodeUtf16LE #-}
100 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
101 decodeUtf16BE = decodeStream streamUtf16BE
102 {-# INLINE decodeUtf16BE #-}
104 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
105 decodeUtf32LE = decodeStream streamUtf32LE
106 {-# INLINE decodeUtf32LE #-}
108 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
109 decodeUtf32BE = decodeStream streamUtf32BE
110 {-# INLINE decodeUtf32BE #-}
112 mkCodec :: (forall r m . Monad m =>
113 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
114 -> (Text -> ByteString)
116 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
121 The particular \'Codec\' lenses are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc.
123 > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
124 > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
126 @zoom@ converts a Text parser into a ByteString parser:
128 > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
130 > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
131 > withNextByte = do char_ <- zoom utf8 Text.drawChar
132 > byte_ <- Bytes.peekByte
133 > return (char_, byte_)
135 @withNextByte@ will return the first valid Char in a ByteString,
136 and the first byte of the next character, if they exists. Because
137 we \'draw\' one and \'peek\' at the other, the parser as a whole only
138 advances one Char's length along the bytestring.
143 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
146 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
149 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
152 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
155 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
158 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
162 ascii and latin encodings only use a small number of the characters 'Text'
163 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
164 Rather we simply define functions each way.
168 -- 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
169 -- returning the rest of the 'Text' at the first non-ascii 'Char'
171 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
172 encodeAscii = go where
173 go p = do e <- lift (next p)
175 Left r -> return (return r)
179 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
180 in do yield (B8.pack (T.unpack safe))
183 else return $ do yield unsafe
186 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
187 returning the rest of the 'Text' upon hitting any non-latin 'Char'
189 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
190 encodeIso8859_1 = go where
191 go p = do e <- lift (next p)
193 Left r -> return (return r)
197 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
198 in do yield (B8.pack (T.unpack safe))
201 else return $ do yield unsafe
204 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
205 unused 'ByteString' upon hitting an un-ascii byte.
207 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
208 decodeAscii = go where
209 go p = do e <- lift (next p)
211 Left r -> return (return r)
215 else let (safe, unsafe) = B.span (<= 0x7F) chunk
216 in do yield (T.pack (B8.unpack safe))
219 else return (do yield unsafe
222 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
223 unused 'ByteString' upon hitting the rare un-latinizable byte.
225 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
226 decodeIso8859_1 = go where
227 go p = do e <- lift (next p)
229 Left r -> return (return r)
233 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
234 yield (T.pack (B8.unpack safe))
237 else return (do yield unsafe