1 {-# LANGUAGE RankNTypes, BangPatterns #-}
3 -- | This module uses the stream decoding functions from Michael Snoyman's new
4 -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
5 -- package to define decoding functions and lenses. The exported names
6 -- conflict with names in @Data.Text.Encoding@ but the module can otherwise be
7 -- imported unqualified.
9 module Pipes.Text.Encoding
11 -- * The Lens or Codec type
15 -- * Viewing the Text in a ByteString
23 -- * Non-lens decoding functions
31 -- * Re-encoding functions
38 -- * Functions for latin and ascii text
47 import Data.Functor.Constant (Constant(..))
48 import Data.Char (ord)
49 import Data.ByteString as B
50 import Data.ByteString (ByteString)
51 import Data.ByteString.Char8 as B8
52 import Data.Text (Text)
53 import qualified Data.Text as T
54 import qualified Data.Text.Encoding as TE
55 import Data.Text.StreamDecoding
56 import Control.Monad (join)
57 import Data.Word (Word8)
61 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
64 The 'Codec' type is a simple specializion of
65 the @Lens'@ type synonymn used by the standard lens libraries,
66 <http://hackage.haskell.org/package/lens lens> and
67 <http://hackage.haskell.org/package/lens-family lens-family>. That type,
69 > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
71 is just an alias for a Prelude type. Thus you use any particular codec with
72 the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
73 we presuppose neither since we already have access to the types they require.
80 => Lens' (Producer ByteString m r)
81 (Producer Text m (Producer ByteString m r))
83 {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
84 exported here under a name appropriate to the material. All of these are
87 > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
91 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
92 decode codec a = getConstant (codec Constant a)
97 Each Codec-lens looks into a byte stream that is supposed to contain text.
98 The particular \'Codec\' lenses are named in accordance with the expected
99 encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
100 use @view@ / @(^.)@ -- here also called 'decode':
102 > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
103 > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
104 > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
106 Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
107 decoding functions exported here, e.g.
109 > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
110 > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
112 The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
113 At any point of decoding failure, the stream of text ends and reverts to (returns)
114 the original byte stream. Thus if the first bytes are already
115 un-decodable, the whole ByteString producer will be returned, i.e.
117 > view utf8 bytestream
119 will just come to the same as
123 Where there is no decoding failure, the return value of the text stream will be
124 an empty byte stream followed by its own return value. In all cases you must
125 deal with the fact that it is a /ByteString producer/ that is returned, even if
126 it can be thrown away with @Control.Monad.void@
128 > void (Bytes.stdin ^. utf8) :: Producer Text IO ()
130 @zoom@ converts a Text parser into a ByteString parser:
132 > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
134 or, with the type synonymn of @Pipes.Parse@:
136 > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
138 Thus we can define ByteString like this:
140 > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
141 > withNextByte = do char_ <- zoom utf8 Text.drawChar
142 > byte_ <- Bytes.peekByte
143 > return (char_, byte_)
145 Though @withNextByte@ is partly defined with a Text parser 'drawChar';
146 but it is a ByteString parser; it will return the first valid utf8-encoded
147 Char in a ByteString, whatever its length,
148 and the first byte of the next character, if they exist. Because
149 we \'draw\' one and \'peek\' at the other, the parser as a whole only
150 advances one Char's length along the bytestring, whatever that length may be.
151 See the slightly more complex example \'decode.hs\' in the
152 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
153 discussion of this type of byte stream parsing.
157 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
160 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
163 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
166 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
169 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
172 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
174 decodeStream :: Monad m
175 => (B.ByteString -> DecodeResult)
176 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
177 decodeStream = loop where
179 do x <- lift (next p)
180 case x of Left r -> return (return r)
181 Right (chunk, p') -> case dec0 chunk of
182 DecodeResultSuccess text dec -> do yield text
184 DecodeResultFailure text bs -> do yield text
187 {-# INLINABLE decodeStream#-}
190 These are functions with the simple type:
192 > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
196 > decodeUtf8 = view utf8
197 > decodeUtf16LE = view utf16LE
199 and so forth, but these forms
200 may be more convenient (and give better type errors!) where lenses are
205 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
206 decodeUtf8 = decodeStream streamUtf8
207 {-# INLINE decodeUtf8 #-}
209 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
210 decodeUtf8Pure = decodeStream streamUtf8Pure
211 {-# INLINE decodeUtf8Pure #-}
213 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
214 decodeUtf16LE = decodeStream streamUtf16LE
215 {-# INLINE decodeUtf16LE #-}
217 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
218 decodeUtf16BE = decodeStream streamUtf16BE
219 {-# INLINE decodeUtf16BE #-}
221 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
222 decodeUtf32LE = decodeStream streamUtf32LE
223 {-# INLINE decodeUtf32LE #-}
225 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
226 decodeUtf32BE = decodeStream streamUtf32BE
227 {-# INLINE decodeUtf32BE #-}
231 These are simply defined
233 > encodeUtf8 = yield . TE.encodeUtf8
235 They are intended for use with 'for'
237 > for Text.stdin encodeUtf8 :: Producer ByteString IO ()
239 which would have the effect of
241 > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
243 using the encoding functions from Data.Text.Encoding
246 encodeUtf8 :: Monad m => Text -> Producer ByteString m ()
247 encodeUtf8 = yield . TE.encodeUtf8
248 encodeUtf16LE :: Monad m => Text -> Producer ByteString m ()
249 encodeUtf16LE = yield . TE.encodeUtf16LE
250 encodeUtf16BE :: Monad m => Text -> Producer ByteString m ()
251 encodeUtf16BE = yield . TE.encodeUtf16BE
252 encodeUtf32LE :: Monad m => Text -> Producer ByteString m ()
253 encodeUtf32LE = yield . TE.encodeUtf32LE
254 encodeUtf32BE :: Monad m => Text -> Producer ByteString m ()
255 encodeUtf32BE = yield . TE.encodeUtf32BE
257 mkCodec :: (forall r m . Monad m =>
258 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
259 -> (Text -> ByteString)
261 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
266 ascii and latin encodings only use a small number of the characters 'Text'
267 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
268 Rather we simply define functions each way.
272 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
273 -- returning the rest of the 'Text' at the first non-ascii 'Char'
275 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
276 encodeAscii = go where
277 go p = do e <- lift (next p)
279 Left r -> return (return r)
283 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
284 in do yield (B8.pack (T.unpack safe))
287 else return $ do yield unsafe
290 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
291 returning the rest of the 'Text' upon hitting any non-latin 'Char'
293 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
294 encodeIso8859_1 = go where
295 go p = do e <- lift (next p)
297 Left r -> return (return r)
301 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
302 in do yield (B8.pack (T.unpack safe))
305 else return $ do yield unsafe
308 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
309 unused 'ByteString' upon hitting an un-ascii byte.
311 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
312 decodeAscii = go where
313 go p = do e <- lift (next p)
315 Left r -> return (return r)
319 else let (safe, unsafe) = B.span (<= 0x7F) chunk
320 in do yield (T.pack (B8.unpack safe))
323 else return (do yield unsafe
326 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
327 unused 'ByteString' upon hitting the rare un-latinizable byte.
329 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
330 decodeIso8859_1 = go where
331 go p = do e <- lift (next p)
333 Left r -> return (return r)
337 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
338 yield (T.pack (B8.unpack safe))
341 else return (do yield unsafe