1 {-# LANGUAGE RankNTypes, BangPatterns #-}
3 -- | This module uses the stream decoding functions from Michael Snoyman's new
4 -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
5 -- package to define decoding functions and lenses. The exported names
6 -- conflict with names in @Data.Text.Encoding@ but the module can otherwise be
7 -- imported unqualified.
9 module Pipes.Text.Encoding
11 -- * The Lens or Codec type
16 -- * \'Viewing\' the Text in a byte stream
24 -- * Non-lens decoding functions
32 -- * Re-encoding functions
39 -- * Functions for latin and ascii text
48 import Data.Functor.Constant (Constant(..))
49 import Data.Char (ord)
50 import Data.ByteString as B
51 import Data.ByteString (ByteString)
52 import Data.ByteString.Char8 as B8
53 import Data.Text (Text)
54 import qualified Data.Text as T
55 import qualified Data.Text.Encoding as TE
56 import qualified Data.Streaming.Text as Stream
57 import Data.Streaming.Text (DecodeResult(..))
58 import Control.Monad (join)
59 import Data.Word (Word8)
62 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
65 The 'Codec' type is a simple specializion of
66 the @Lens'@ type synonymn used by the standard lens libraries,
67 <http://hackage.haskell.org/package/lens lens> and
68 <http://hackage.haskell.org/package/lens-family lens-family>. That type,
70 > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
72 is just an alias for a Prelude type. Thus you use any particular codec with
73 the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
74 we presuppose neither since we already have access to the types they require.
81 => Lens' (Producer ByteString m r)
82 (Producer Text m (Producer ByteString m r))
84 {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
85 exported here under a name appropriate to the material. All of these are
88 > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
93 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
94 decode codec a = getConstant (codec Constant a)
96 {- | 'eof' tells you explicitly when decoding stops due to bad bytes or instead
97 reaches end-of-file happily. (Without it one just makes an explicit test
98 for emptiness of the resulting bytestring production using 'next')
101 > decode (utf8 . eof) p = view (utf8 . eof) p = p^.utf8.eof
103 will be a text producer. If we hit undecodable bytes, the remaining
104 bytestring producer will be returned as a 'Left' value;
105 in the happy case, a 'Right' value is returned with the anticipated
106 return value for the original bytestring producer.
111 eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
112 (Producer Text m (Either (Producer ByteString m r) r))
113 eof k p = fmap fromEither (k (toEither p)) where
115 fromEither = liftM (either id return)
117 toEither pp = do p <- pp
120 check p = do e <- lift (next p)
122 Left r -> return (Right r)
123 Right (bs,pb) -> if B.null bs
125 else return (Left (do yield bs
131 Each Codec-lens looks into a byte stream that is supposed to contain text.
132 The particular \'Codec\' lenses are named in accordance with the expected
133 encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
134 use @view@ / @(^.)@ -- here also called 'decode':
136 > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
137 > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
138 > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
140 Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
141 decoding functions exported here, e.g.
143 > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
144 > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
146 The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
147 At any point of decoding failure, the stream of text ends and reverts to (returns)
148 the original byte stream. Thus if the first bytes are already
149 un-decodable, the whole ByteString producer will be returned, i.e.
151 > view utf8 bytestream
153 will just come to the same as
157 Where there is no decoding failure, the return value of the text stream will be
158 an empty byte stream followed by its own return value. In all cases you must
159 deal with the fact that it is a /ByteString producer/ that is returned, even if
160 it can be thrown away with @Control.Monad.void@
162 > void (Bytes.stdin ^. utf8) :: Producer Text IO ()
164 @zoom@ converts a Text parser into a ByteString parser:
166 > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
168 or, using the type synonymn from @Pipes.Parse@:
170 > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
172 Thus we can define a ByteString parser like this:
174 > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
175 > withNextByte = do char_ <- zoom utf8 Text.drawChar
176 > byte_ <- Bytes.peekByte
177 > return (char_, byte_)
179 Though @withNextByte@ is partly defined with a Text parser 'drawChar';
180 but it is a ByteString parser; it will return the first valid utf8-encoded
181 Char in a ByteString, whatever its length,
182 and the first byte of the next character, if they exist. Because
183 we \'draw\' one and \'peek\' at the other, the parser as a whole only
184 advances one Char's length along the bytestring, whatever that length may be.
185 See the slightly more complex example \'decode.hs\' in the
186 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
187 discussion of this type of byte stream parsing.
191 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
194 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
197 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
200 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
203 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
206 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
208 decodeStream :: Monad m
209 => (B.ByteString -> DecodeResult)
210 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
211 decodeStream = loop where
213 do x <- lift (next p)
214 case x of Left r -> return (return r)
215 Right (chunk, p') -> case dec0 chunk of
216 DecodeResultSuccess text dec -> do yield text
218 DecodeResultFailure text bs -> do yield text
221 {-# INLINABLE decodeStream#-}
225 These are functions with the simple type:
227 > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
231 > decodeUtf8 = view utf8
232 > decodeUtf16LE = view utf16LE
234 and so forth, but these forms
235 may be more convenient (and give better type errors!) where lenses are
240 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
241 decodeUtf8 = decodeStream Stream.decodeUtf8
242 {-# INLINE decodeUtf8 #-}
244 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
245 decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
246 {-# INLINE decodeUtf8Pure #-}
248 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
249 decodeUtf16LE = decodeStream Stream.decodeUtf16LE
250 {-# INLINE decodeUtf16LE #-}
252 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
253 decodeUtf16BE = decodeStream Stream.decodeUtf16BE
254 {-# INLINE decodeUtf16BE #-}
256 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
257 decodeUtf32LE = decodeStream Stream.decodeUtf32LE
258 {-# INLINE decodeUtf32LE #-}
260 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
261 decodeUtf32BE = decodeStream Stream.decodeUtf32BE
262 {-# INLINE decodeUtf32BE #-}
266 These are simply defined
268 > encodeUtf8 = yield . TE.encodeUtf8
270 They are intended for use with 'for'
272 > for Text.stdin encodeUtf8 :: Producer ByteString IO ()
274 which would have the effect of
276 > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
278 using the encoding functions from Data.Text.Encoding
281 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
282 encodeUtf8 = yield . TE.encodeUtf8
283 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
284 encodeUtf16LE = yield . TE.encodeUtf16LE
285 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
286 encodeUtf16BE = yield . TE.encodeUtf16BE
287 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
288 encodeUtf32LE = yield . TE.encodeUtf32LE
289 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
290 encodeUtf32BE = yield . TE.encodeUtf32BE
292 mkCodec :: (forall r m . Monad m =>
293 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
294 -> (Text -> ByteString)
296 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
301 ascii and latin encodings only use a small number of the characters 'Text'
302 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
303 Rather we simply define functions each way.
307 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
308 -- returning the rest of the 'Text' at the first non-ascii 'Char'
310 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
311 encodeAscii = go where
312 go p = do e <- lift (next p)
314 Left r -> return (return r)
318 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
319 in do yield (B8.pack (T.unpack safe))
322 else return $ do yield unsafe
325 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
326 returning the rest of the 'Text' upon hitting any non-latin 'Char'
328 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
329 encodeIso8859_1 = go where
330 go p = do e <- lift (next p)
332 Left r -> return (return r)
336 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
337 in do yield (B8.pack (T.unpack safe))
340 else return $ do yield unsafe
343 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
344 unused 'ByteString' upon hitting an un-ascii byte.
346 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
347 decodeAscii = go where
348 go p = do e <- lift (next p)
350 Left r -> return (return r)
354 else let (safe, unsafe) = B.span (<= 0x7F) chunk
355 in do yield (T.pack (B8.unpack safe))
358 else return (do yield unsafe
361 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
362 unused 'ByteString' upon hitting the rare un-latinizable byte.
364 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
365 decodeIso8859_1 = go where
366 go p = do e <- lift (next p)
368 Left r -> return (return r)
372 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
373 yield (T.pack (B8.unpack safe))
376 else return (do yield unsafe