1 {-# LANGUAGE RankNTypes, BangPatterns #-}
3 -- | This module uses the stream decoding functions from Michael Snoyman's new
4 -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
5 -- package to define decoding functions and lenses.
7 module Pipes.Text.Encoding
9 -- * The Lens or Codec type
12 -- * Viewing the Text in a ByteString
21 -- * Non-lens decoding functions
29 -- * Re-encoding functions
36 -- * Functions for latin and ascii text
45 import Data.Functor.Constant (Constant(..))
46 import Data.Char (ord)
47 import Data.ByteString as B
48 import Data.ByteString (ByteString)
49 import Data.ByteString.Char8 as B8
50 import Data.Text (Text)
51 import qualified Data.Text as T
52 import qualified Data.Text.Encoding as TE
53 import Data.Text.StreamDecoding
54 import Control.Monad (join)
55 import Data.Word (Word8)
59 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
62 The 'Codec' type is a simple specializion of
63 the @Lens'@ type synonymn used by the standard lens libraries,
64 <http://hackage.haskell.org/package/lens lens> and
65 <http://hackage.haskell.org/package/lens-family lens-family>. That type,
67 > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
69 is just an alias for an ordinary Prelude type. Thus you use any codec with
70 the @view@ / @(^.)@ and @zoom@ functions from those libraries.
77 => Lens' (Producer ByteString m r)
78 (Producer Text m (Producer ByteString m r))
80 {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
81 exported here for convience
83 > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf
87 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
88 decode codec a = getConstant (codec Constant a)
91 decodeStream :: Monad m
92 => (B.ByteString -> DecodeResult)
93 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
94 decodeStream = loop where
97 case x of Left r -> return (return r)
98 Right (chunk, p') -> case dec0 chunk of
99 DecodeResultSuccess text dec -> do yield text
101 DecodeResultFailure text bs -> do yield text
104 {-# INLINABLE decodeStream#-}
107 These are functions with the simple type:
109 > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
113 > decodeUtf8 = view utf8
114 > decodeUtf16LE = view utf16LE
116 and so forth, but these forms
117 may be more convenient (and give better type errors!) where lenses are
122 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
123 decodeUtf8 = decodeStream streamUtf8
124 {-# INLINE decodeUtf8 #-}
126 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
127 decodeUtf8Pure = decodeStream streamUtf8Pure
128 {-# INLINE decodeUtf8Pure #-}
130 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
131 decodeUtf16LE = decodeStream streamUtf16LE
132 {-# INLINE decodeUtf16LE #-}
134 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
135 decodeUtf16BE = decodeStream streamUtf16BE
136 {-# INLINE decodeUtf16BE #-}
138 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
139 decodeUtf32LE = decodeStream streamUtf32LE
140 {-# INLINE decodeUtf32LE #-}
142 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
143 decodeUtf32BE = decodeStream streamUtf32BE
144 {-# INLINE decodeUtf32BE #-}
148 These are simply defined
150 > encodeUtf8 = yield . TE.encodeUtf8
152 They are intended for use with 'for'
154 > for Text.stdin encodeUtf8 :: Producer ByteString IO ()
156 which would have the effect of
158 > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
160 using the encoding functions from Data.Text.Encoding
163 encodeUtf8 :: Monad m => Text -> Producer ByteString m ()
164 encodeUtf8 = yield . TE.encodeUtf8
165 encodeUtf16LE :: Monad m => Text -> Producer ByteString m ()
166 encodeUtf16LE = yield . TE.encodeUtf16LE
167 encodeUtf16BE :: Monad m => Text -> Producer ByteString m ()
168 encodeUtf16BE = yield . TE.encodeUtf16BE
169 encodeUtf32LE :: Monad m => Text -> Producer ByteString m ()
170 encodeUtf32LE = yield . TE.encodeUtf32LE
171 encodeUtf32BE :: Monad m => Text -> Producer ByteString m ()
172 encodeUtf32BE = yield . TE.encodeUtf32BE
174 mkCodec :: (forall r m . Monad m =>
175 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
176 -> (Text -> ByteString)
178 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
183 Each codec/lens looks into a byte stream that is supposed to contain text.
184 The particular \'Codec\' lenses are named in accordance with the expected
185 encoding, 'utf8', 'utf16LE' etc. @view@ / @(^.)@ -- here also called 'decode' --
186 turns a Codec into a function:
188 > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
189 > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
190 > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
192 Uses of a codec with @view@ / @(^.)@ / 'decode' can always be replaced by the specialized
193 decoding functions exported here, e.g.
195 > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
196 > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
198 The stream of text a @Codec@ \'sees\' in the stream of bytes begins at its head.
199 At any point of decoding failure, the stream of text ends and reverts to (returns)
200 the original byte stream. Thus if the first bytes are already
201 un-decodable, the whole ByteString producer will be returned, i.e.
203 > view utf8 bytestream
205 will just come to the same as
209 Where there is no decoding failure, the return value of the text stream will be
210 an empty byte stream followed by its own return value. In all cases you must
211 deal with the fact that it is a ByteString producer that is returned, even if
212 it can be thrown away with @Control.Monad.void@
214 > void (Bytes.stdin ^. utf8) :: Producer Text IO ()
216 @zoom@ converts a Text parser into a ByteString parser:
218 > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
220 > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
221 > withNextByte = do char_ <- zoom utf8 Text.drawChar
222 > byte_ <- Bytes.peekByte
223 > return (char_, byte_)
225 @withNextByte@ will return the first valid Char in a ByteString,
226 and the first byte of the next character, if they exists. Because
227 we \'draw\' one and \'peek\' at the other, the parser as a whole only
228 advances one Char's length along the bytestring.
233 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
236 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
239 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
242 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
245 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
248 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
252 ascii and latin encodings only use a small number of the characters 'Text'
253 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
254 Rather we simply define functions each way.
258 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
259 -- returning the rest of the 'Text' at the first non-ascii 'Char'
261 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
262 encodeAscii = go where
263 go p = do e <- lift (next p)
265 Left r -> return (return r)
269 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
270 in do yield (B8.pack (T.unpack safe))
273 else return $ do yield unsafe
276 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
277 returning the rest of the 'Text' upon hitting any non-latin 'Char'
279 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
280 encodeIso8859_1 = go where
281 go p = do e <- lift (next p)
283 Left r -> return (return r)
287 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
288 in do yield (B8.pack (T.unpack safe))
291 else return $ do yield unsafe
294 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
295 unused 'ByteString' upon hitting an un-ascii byte.
297 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
298 decodeAscii = go where
299 go p = do e <- lift (next p)
301 Left r -> return (return r)
305 else let (safe, unsafe) = B.span (<= 0x7F) chunk
306 in do yield (T.pack (B8.unpack safe))
309 else return (do yield unsafe
312 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
313 unused 'ByteString' upon hitting the rare un-latinizable byte.
315 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
316 decodeIso8859_1 = go where
317 go p = do e <- lift (next p)
319 Left r -> return (return r)
323 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
324 yield (T.pack (B8.unpack safe))
327 else return (do yield unsafe