]> git.immae.eu Git - github/fretlink/text-pipes.git/blame - Pipes/Text/Encoding.hs
added eoflens to discriminate whether decoding was completed
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
CommitLineData
bbdfd305 1{-# LANGUAGE RankNTypes, BangPatterns #-}
89d80557 2
0ac0c414 3-- | This module uses the stream decoding functions from Michael Snoyman's new
4-- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
4ea59a8b 5-- package to define decoding functions and lenses. The exported names
6-- conflict with names in @Data.Text.Encoding@ but the module can otherwise be
7-- imported unqualified.
bbdfd305 8
9module Pipes.Text.Encoding
fafcbeb5 10 (
0ac0c414 11 -- * The Lens or Codec type
fafcbeb5 12 -- $lenses
13 Codec
4ea59a8b 14 , decode
6c2fffdc 15 , eof
a4913c42 16 -- * \'Viewing\' the Text in a byte stream
fafcbeb5 17 -- $codecs
bbdfd305 18 , utf8
19 , utf8Pure
20 , utf16LE
21 , utf16BE
22 , utf32LE
23 , utf32BE
fafcbeb5 24 -- * Non-lens decoding functions
0ac0c414 25 -- $decoders
89d80557 26 , decodeUtf8
27 , decodeUtf8Pure
28 , decodeUtf16LE
29 , decodeUtf16BE
30 , decodeUtf32LE
31 , decodeUtf32BE
0ac0c414 32 -- * Re-encoding functions
33 -- $encoders
34 , encodeUtf8
35 , encodeUtf16LE
36 , encodeUtf16BE
37 , encodeUtf32LE
38 , encodeUtf32BE
fafcbeb5 39 -- * Functions for latin and ascii text
40 -- $ascii
bbdfd305 41 , encodeAscii
42 , decodeAscii
43 , encodeIso8859_1
44 , decodeIso8859_1
45 )
46 where
47
0ac0c414 48import Data.Functor.Constant (Constant(..))
bbdfd305 49import Data.Char (ord)
50import Data.ByteString as B
51import Data.ByteString (ByteString)
bbdfd305 52import Data.ByteString.Char8 as B8
53import Data.Text (Text)
54import qualified Data.Text as T
55import qualified Data.Text.Encoding as TE
eae50557 56import qualified Data.Streaming.Text as Stream
57import Data.Streaming.Text (DecodeResult(..))
70125641 58import Control.Monad (join)
89d80557 59import Data.Word (Word8)
bbdfd305 60import Pipes
bbdfd305 61
2f4a83f8 62type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
bbdfd305 63
fafcbeb5 64{- $lenses
0ac0c414 65 The 'Codec' type is a simple specializion of
2f4a83f8 66 the @Lens'@ type synonymn used by the standard lens libraries,
0ac0c414 67 <http://hackage.haskell.org/package/lens lens> and
68 <http://hackage.haskell.org/package/lens-family lens-family>. That type,
fafcbeb5 69
2f4a83f8 70> type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
0ac0c414 71
4ea59a8b 72 is just an alias for a Prelude type. Thus you use any particular codec with
73 the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
74 we presuppose neither since we already have access to the types they require.
0ac0c414 75
fafcbeb5 76 -}
77
21eb409c 78type Codec
d199072b 79 = forall m r
21eb409c 80 . Monad m
2f4a83f8 81 => Lens' (Producer ByteString m r)
d199072b 82 (Producer Text m (Producer ByteString m r))
83
0ac0c414 84{- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
4ea59a8b 85 exported here under a name appropriate to the material. All of these are
86 the same:
0ac0c414 87
4ea59a8b 88> decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
0ac0c414 89
90-}
91
6c2fffdc 92
0ac0c414 93decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
94decode codec a = getConstant (codec Constant a)
95
6c2fffdc 96{- | 'eof' tells you explicitly when decoding stops due to bad bytes or instead
97 reaches end-of-file happily. (Without it one just makes an explicit test
98 for emptiness of the resulting bytestring production using 'next')
99 Thus
100
101> decode (utf8 . eof) p = view (utf8 . eof) p = p^.utf8.eof
102
103 will be a text producer. If we hit undecodable bytes, the remaining
104 bytestring producer will be returned as a 'Left' value;
105 in the happy case, a 'Right' value is returned with the anticipated
106 return value for the original bytestring producer.
107 )
108
109-}
110
111eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
112 (Producer Text m (Either (Producer ByteString m r) r))
113eof k p = fmap fromEither (k (toEither p)) where
114
115 fromEither = liftM (either id return)
116
117 toEither pp = do p <- pp
118 check p
119
120 check p = do e <- lift (next p)
121 case e of
122 Left r -> return (Right r)
123 Right (bs,pb) -> if B.null bs
124 then check pb
125 else return (Left (do yield bs
126 pb))
127
bbdfd305 128
4ea59a8b 129{- $codecs
130
131 Each Codec-lens looks into a byte stream that is supposed to contain text.
132 The particular \'Codec\' lenses are named in accordance with the expected
133 encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
134 use @view@ / @(^.)@ -- here also called 'decode':
135
136> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
137> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
138> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
139
140 Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
141 decoding functions exported here, e.g.
142
143> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
144> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
145
146 The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
147 At any point of decoding failure, the stream of text ends and reverts to (returns)
148 the original byte stream. Thus if the first bytes are already
149 un-decodable, the whole ByteString producer will be returned, i.e.
150
151> view utf8 bytestream
152
153 will just come to the same as
154
155> return bytestream
156
157 Where there is no decoding failure, the return value of the text stream will be
158 an empty byte stream followed by its own return value. In all cases you must
159 deal with the fact that it is a /ByteString producer/ that is returned, even if
160 it can be thrown away with @Control.Monad.void@
161
162> void (Bytes.stdin ^. utf8) :: Producer Text IO ()
163
164 @zoom@ converts a Text parser into a ByteString parser:
165
166> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
167
a4913c42 168 or, using the type synonymn from @Pipes.Parse@:
4ea59a8b 169
170> zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
171
a4913c42 172 Thus we can define a ByteString parser like this:
4ea59a8b 173
174> withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
175> withNextByte = do char_ <- zoom utf8 Text.drawChar
176> byte_ <- Bytes.peekByte
177> return (char_, byte_)
178
179 Though @withNextByte@ is partly defined with a Text parser 'drawChar';
180 but it is a ByteString parser; it will return the first valid utf8-encoded
181 Char in a ByteString, whatever its length,
182 and the first byte of the next character, if they exist. Because
183 we \'draw\' one and \'peek\' at the other, the parser as a whole only
184 advances one Char's length along the bytestring, whatever that length may be.
185 See the slightly more complex example \'decode.hs\' in the
186 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
187 discussion of this type of byte stream parsing.
188 -}
189
190utf8 :: Codec
191utf8 = mkCodec decodeUtf8 TE.encodeUtf8
192
193utf8Pure :: Codec
194utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
195
196utf16LE :: Codec
197utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
198
199utf16BE :: Codec
200utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
201
202utf32LE :: Codec
203utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
204
205utf32BE :: Codec
206utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
207
bbdfd305 208decodeStream :: Monad m
209 => (B.ByteString -> DecodeResult)
210 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
211decodeStream = loop where
212 loop dec0 p =
213 do x <- lift (next p)
214 case x of Left r -> return (return r)
215 Right (chunk, p') -> case dec0 chunk of
216 DecodeResultSuccess text dec -> do yield text
217 loop dec p'
218 DecodeResultFailure text bs -> do yield text
219 return (do yield bs
220 p')
221{-# INLINABLE decodeStream#-}
222
6c2fffdc 223
0ac0c414 224{- $decoders
225 These are functions with the simple type:
226
227> decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
228
229 Thus in general
230
231> decodeUtf8 = view utf8
232> decodeUtf16LE = view utf16LE
fafcbeb5 233
0ac0c414 234 and so forth, but these forms
235 may be more convenient (and give better type errors!) where lenses are
236 not desired.
237-}
fafcbeb5 238
239
bbdfd305 240decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 241decodeUtf8 = decodeStream Stream.decodeUtf8
bbdfd305 242{-# INLINE decodeUtf8 #-}
243
244decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 245decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
bbdfd305 246{-# INLINE decodeUtf8Pure #-}
247
248decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 249decodeUtf16LE = decodeStream Stream.decodeUtf16LE
bbdfd305 250{-# INLINE decodeUtf16LE #-}
251
252decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 253decodeUtf16BE = decodeStream Stream.decodeUtf16BE
bbdfd305 254{-# INLINE decodeUtf16BE #-}
255
256decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 257decodeUtf32LE = decodeStream Stream.decodeUtf32LE
bbdfd305 258{-# INLINE decodeUtf32LE #-}
259
260decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 261decodeUtf32BE = decodeStream Stream.decodeUtf32BE
bbdfd305 262{-# INLINE decodeUtf32BE #-}
263
0ac0c414 264
265{- $encoders
266 These are simply defined
267
268> encodeUtf8 = yield . TE.encodeUtf8
269
270 They are intended for use with 'for'
271
272> for Text.stdin encodeUtf8 :: Producer ByteString IO ()
273
274 which would have the effect of
275
276> Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
277
278 using the encoding functions from Data.Text.Encoding
279-}
280
10cfd90e 281encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 282encodeUtf8 = yield . TE.encodeUtf8
10cfd90e 283encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 284encodeUtf16LE = yield . TE.encodeUtf16LE
10cfd90e 285encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 286encodeUtf16BE = yield . TE.encodeUtf16BE
10cfd90e 287encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 288encodeUtf32LE = yield . TE.encodeUtf32LE
10cfd90e 289encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 290encodeUtf32BE = yield . TE.encodeUtf32BE
291
bbdfd305 292mkCodec :: (forall r m . Monad m =>
293 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
294 -> (Text -> ByteString)
295 -> Codec
296mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
297
298
bbdfd305 299
fafcbeb5 300{- $ascii
301 ascii and latin encodings only use a small number of the characters 'Text'
302 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
bbdfd305 303 Rather we simply define functions each way.
bbdfd305 304-}
305
fafcbeb5 306
0ac0c414 307-- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
fafcbeb5 308-- returning the rest of the 'Text' at the first non-ascii 'Char'
309
bbdfd305 310encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
311encodeAscii = go where
312 go p = do e <- lift (next p)
313 case e of
314 Left r -> return (return r)
315 Right (chunk, p') ->
316 if T.null chunk
317 then go p'
318 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
319 in do yield (B8.pack (T.unpack safe))
320 if T.null unsafe
321 then go p'
322 else return $ do yield unsafe
323 p'
324
325{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
326 returning the rest of the 'Text' upon hitting any non-latin 'Char'
327 -}
328encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
329encodeIso8859_1 = go where
330 go p = do e <- lift (next p)
331 case e of
332 Left r -> return (return r)
333 Right (txt, p') ->
334 if T.null txt
335 then go p'
336 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
337 in do yield (B8.pack (T.unpack safe))
338 if T.null unsafe
339 then go p'
340 else return $ do yield unsafe
341 p'
342
343{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
344 unused 'ByteString' upon hitting an un-ascii byte.
345 -}
346decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
347decodeAscii = go where
348 go p = do e <- lift (next p)
349 case e of
350 Left r -> return (return r)
351 Right (chunk, p') ->
352 if B.null chunk
353 then go p'
354 else let (safe, unsafe) = B.span (<= 0x7F) chunk
355 in do yield (T.pack (B8.unpack safe))
356 if B.null unsafe
357 then go p'
358 else return (do yield unsafe
359 p')
360
361{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
362 unused 'ByteString' upon hitting the rare un-latinizable byte.
363 -}
364decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
365decodeIso8859_1 = go where
366 go p = do e <- lift (next p)
367 case e of
368 Left r -> return (return r)
369 Right (chunk, p') ->
370 if B.null chunk
371 then go p'
372 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
373 yield (T.pack (B8.unpack safe))
374 if B.null unsafe
375 then go p'
376 else return (do yield unsafe
377 p')
378
379
380