1 {-# LANGUAGE RankNTypes, BangPatterns #-}
3 -- | This module uses the stream decoding functions from
4 -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
5 -- package to define decoding functions and lenses. The exported names
6 -- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@
8 module Pipes.Text.Encoding
10 -- * The Lens or Codec type
15 -- * \'Viewing\' the Text in a byte stream
23 -- * Non-lens decoding functions
31 -- * Re-encoding functions
38 -- * Functions for latin and ascii text
47 import Data.Functor.Constant (Constant(..))
48 import Data.Char (ord)
49 import Data.ByteString as B
50 import Data.ByteString (ByteString)
51 import Data.ByteString.Char8 as B8
52 import Data.Text (Text)
53 import qualified Data.Text as T
54 import qualified Data.Text.Encoding as TE
55 import qualified Data.Streaming.Text as Stream
56 import Data.Streaming.Text (DecodeResult(..))
57 import Control.Monad (join, liftM)
58 import Data.Word (Word8)
61 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
64 The 'Codec' type is a simple specializion of
65 the @Lens'@ type synonymn used by the standard lens libraries,
66 <http://hackage.haskell.org/package/lens lens> and
67 <http://hackage.haskell.org/package/lens-family lens-family>. That type,
69 > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
71 is just an alias for a Prelude type. Thus you use any particular codec with
72 the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
73 we presuppose neither library since we already have access to the types they require.
79 => Lens' (Producer ByteString m r)
80 (Producer Text m (Producer ByteString m r))
82 {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
83 exported here under a name appropriate to the material. All of these are
86 > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
91 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
92 decode codec a = getConstant (codec Constant a)
94 {- | 'eof' tells you explicitly when decoding stops due to bad bytes or instead
95 reaches end-of-file happily. (Without it one just makes an explicit test
96 for emptiness of the resulting bytestring production using 'next')
99 > decode (utf8 . eof) p = view (utf8 . eof) p = p^.utf8.eof
101 will be a text producer. If we hit undecodable bytes, the remaining
102 bytestring producer will be returned as a 'Left' value;
103 in the happy case, a 'Right' value is returned with the anticipated
104 return value for the original bytestring producer.
109 eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
110 (Producer Text m (Either (Producer ByteString m r) r))
111 eof k p = fmap fromEither (k (toEither p)) where
113 fromEither = liftM (either id return)
115 toEither pp = do p <- pp
118 check p = do e <- lift (next p)
120 Left r -> return (Right r)
121 Right (bs,pb) -> if B.null bs
123 else return (Left (do yield bs
129 Each Codec-lens looks into a byte stream that is supposed to contain text.
130 The particular \'Codec\' lenses are named in accordance with the expected
131 encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
132 use @view@ / @(^.)@ -- here also called 'decode':
134 > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
135 > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
136 > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
138 Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
139 decoding functions exported here, e.g.
141 > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
142 > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
144 The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
145 At any point of decoding failure, the stream of text ends and reverts to (returns)
146 the original byte stream. Thus if the first bytes are already
147 un-decodable, the whole ByteString producer will be returned, i.e.
149 > view utf8 bytestream
151 will just come to the same as
155 Where there is no decoding failure, the return value of the text stream will be
156 an empty byte stream followed by its own return value. In all cases you must
157 deal with the fact that it is a /ByteString producer/ that is returned, even if
158 it can be thrown away with @Control.Monad.void@
160 > void (Bytes.stdin ^. utf8) :: Producer Text IO ()
162 @zoom@ converts a Text parser into a ByteString parser:
164 > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
166 or, using the type synonymn from @Pipes.Parse@:
168 > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
170 Thus we can define a ByteString parser like this:
172 > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
173 > withNextByte = do char_ <- zoom utf8 Text.drawChar
174 > byte_ <- Bytes.peekByte
175 > return (char_, byte_)
177 Though @withNextByte@ is partly defined with a Text parser 'drawChar';
178 but it is a ByteString parser; it will return the first valid utf8-encoded
179 Char in a ByteString, whatever its length,
180 and the first byte of the next character, if they exist. Because
181 we \'draw\' one and \'peek\' at the other, the parser as a whole only
182 advances one Char's length along the bytestring, whatever that length may be.
183 See the slightly more complex example \'decode.hs\' in the
184 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
185 discussion of this type of byte stream parsing.
189 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
192 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
195 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
198 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
201 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
204 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
206 decodeStream :: Monad m
207 => (B.ByteString -> DecodeResult)
208 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
209 decodeStream = loop where
211 do x <- lift (next p)
212 case x of Left r -> return (return r)
213 Right (chunk, p') -> case dec0 chunk of
214 DecodeResultSuccess text dec -> do yield text
216 DecodeResultFailure text bs -> do yield text
219 {-# INLINABLE decodeStream#-}
223 These are functions with the simple type:
225 > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
229 > decodeUtf8 = view utf8
230 > decodeUtf16LE = view utf16LE
232 and so forth, but these forms
233 may be more convenient (and give better type errors!) where lenses are
238 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
239 decodeUtf8 = decodeStream Stream.decodeUtf8
240 {-# INLINE decodeUtf8 #-}
242 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
243 decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
244 {-# INLINE decodeUtf8Pure #-}
246 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
247 decodeUtf16LE = decodeStream Stream.decodeUtf16LE
248 {-# INLINE decodeUtf16LE #-}
250 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
251 decodeUtf16BE = decodeStream Stream.decodeUtf16BE
252 {-# INLINE decodeUtf16BE #-}
254 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
255 decodeUtf32LE = decodeStream Stream.decodeUtf32LE
256 {-# INLINE decodeUtf32LE #-}
258 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
259 decodeUtf32BE = decodeStream Stream.decodeUtf32BE
260 {-# INLINE decodeUtf32BE #-}
264 These are simply defined
266 > encodeUtf8 = yield . TE.encodeUtf8
268 They are intended for use with 'for'
270 > for Text.stdin encodeUtf8 :: Producer ByteString IO ()
272 which would have the effect of
274 > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
276 using the encoding functions from Data.Text.Encoding
279 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
280 encodeUtf8 = yield . TE.encodeUtf8
281 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
282 encodeUtf16LE = yield . TE.encodeUtf16LE
283 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
284 encodeUtf16BE = yield . TE.encodeUtf16BE
285 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
286 encodeUtf32LE = yield . TE.encodeUtf32LE
287 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
288 encodeUtf32BE = yield . TE.encodeUtf32BE
290 mkCodec :: (forall r m . Monad m =>
291 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
292 -> (Text -> ByteString)
294 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
299 ascii and latin encodings only use a small number of the characters 'Text'
300 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
301 Rather we simply define functions each way.
305 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
306 -- returning the rest of the 'Text' at the first non-ascii 'Char'
308 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
309 encodeAscii = go where
310 go p = do e <- lift (next p)
312 Left r -> return (return r)
316 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
317 in do yield (B8.pack (T.unpack safe))
320 else return $ do yield unsafe
323 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
324 returning the rest of the 'Text' upon hitting any non-latin 'Char'
326 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
327 encodeIso8859_1 = go where
328 go p = do e <- lift (next p)
330 Left r -> return (return r)
334 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
335 in do yield (B8.pack (T.unpack safe))
338 else return $ do yield unsafe
341 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
342 unused 'ByteString' upon hitting an un-ascii byte.
344 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
345 decodeAscii = go where
346 go p = do e <- lift (next p)
348 Left r -> return (return r)
352 else let (safe, unsafe) = B.span (<= 0x7F) chunk
353 in do yield (T.pack (B8.unpack safe))
356 else return (do yield unsafe
359 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
360 unused 'ByteString' upon hitting the rare un-latinizable byte.
362 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
363 decodeIso8859_1 = go where
364 go p = do e <- lift (next p)
366 Left r -> return (return r)
370 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
371 yield (T.pack (B8.unpack safe))
374 else return (do yield unsafe