]> git.immae.eu Git - github/fretlink/text-pipes.git/blame - Pipes/Text/Encoding.hs
detritus
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
CommitLineData
bbdfd305 1{-# LANGUAGE RankNTypes, BangPatterns #-}
89d80557 2
b091cbeb 3-- | This module uses the stream decoding functions from
edd8726a 4-- <http://hackage.haskell.org/package/streaming-commons streaming-commons>
4ea59a8b 5-- package to define decoding functions and lenses. The exported names
b091cbeb 6-- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@
bbdfd305 7
8module Pipes.Text.Encoding
fafcbeb5 9 (
edd8726a 10 -- * Decoding ByteStrings and Encoding Texts
11 -- ** Simple usage
12 -- $usage
13
14 -- ** Lens usage
fafcbeb5 15 -- $lenses
edd8726a 16
17
18 -- * Basic lens operations
fafcbeb5 19 Codec
4ea59a8b 20 , decode
6c2fffdc 21 , eof
edd8726a 22 -- * Decoding lenses
bbdfd305 23 , utf8
24 , utf8Pure
25 , utf16LE
26 , utf16BE
27 , utf32LE
28 , utf32BE
fafcbeb5 29 -- * Non-lens decoding functions
0ac0c414 30 -- $decoders
89d80557 31 , decodeUtf8
32 , decodeUtf8Pure
33 , decodeUtf16LE
34 , decodeUtf16BE
35 , decodeUtf32LE
36 , decodeUtf32BE
0ac0c414 37 -- * Re-encoding functions
38 -- $encoders
39 , encodeUtf8
40 , encodeUtf16LE
41 , encodeUtf16BE
42 , encodeUtf32LE
43 , encodeUtf32BE
fafcbeb5 44 -- * Functions for latin and ascii text
45 -- $ascii
bbdfd305 46 , encodeAscii
47 , decodeAscii
48 , encodeIso8859_1
49 , decodeIso8859_1
50 )
51 where
52
0ac0c414 53import Data.Functor.Constant (Constant(..))
bbdfd305 54import Data.Char (ord)
55import Data.ByteString as B
bbdfd305 56import Data.ByteString.Char8 as B8
57import Data.Text (Text)
58import qualified Data.Text as T
59import qualified Data.Text.Encoding as TE
eae50557 60import qualified Data.Streaming.Text as Stream
61import Data.Streaming.Text (DecodeResult(..))
b091cbeb 62import Control.Monad (join, liftM)
bbdfd305 63import Pipes
bbdfd305 64
bbdfd305 65
0ac0c414 66
edd8726a 67{- $usage
68 Given
fafcbeb5 69
edd8726a 70> text :: Producer Text IO ()
d199072b 71
edd8726a 72 we can encode it with @Data.Text.Encoding@ and ordinary pipe operations:
0ac0c414 73
edd8726a 74> text >-> P.map TE.encodeUtf8 :: Producer.ByteString IO ()
0ac0c414 75
edd8726a 76 or, using this module, with
0ac0c414 77
edd8726a 78> for text encodeUtf8 :: Producer.ByteString IO ()
6c2fffdc 79
edd8726a 80 Given
81
82> bytes :: Producer ByteString Text IO ()
0ac0c414 83
edd8726a 84 we can apply a decoding function from this module:
6c2fffdc 85
edd8726a 86> decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ())
87
88 The Text producer ends wherever decoding first fails. Thus we can re-encode
89 as uft8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g.
90
91> for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ())
92
93 The bytestring producer that is returned begins with where utf16BE decoding
e8336ba6 94 failed; if it didn't fail the producer is empty.
6c2fffdc 95
96-}
97
edd8726a 98{- $lenses
99 We get a bit more flexibility, though, if we use a lens like @utf8@ or @utf16BE@
100 that looks for text in an appropriately encoded byte stream.
6c2fffdc 101
edd8726a 102> type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
6c2fffdc 103
edd8726a 104 is just an alias for a Prelude type. We abbreviate this further, for our use case, as
6c2fffdc 105
edd8726a 106> type Codec
107> = forall m r . Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r))
6c2fffdc 108
edd8726a 109 and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can
110 re-encode what they have decoded. Thus you use any particular codec with
111 the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries;
112 we presuppose neither <http://hackage.haskell.org/package/lens lens>
113 nor <http://hackage.haskell.org/package/lens-family lens-family>
114 since we already have access to the types they require.
bbdfd305 115
edd8726a 116 Each decoding lens looks into a byte stream that is supposed to contain text.
117 The particular lenses are named in accordance with the expected
118 encoding, 'utf8', 'utf16LE' etc. To turn a such a lens or @Codec@
119 into an ordinary function, use @view@ / @(^.)@ -- here also called 'decode':
4ea59a8b 120
121> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
122> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
123> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
124
edd8726a 125 These simple uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by
126 the specialized decoding functions exported here, e.g.
4ea59a8b 127
128> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
129> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
130
edd8726a 131 As with these functions, the stream of text that a @Codec@ \'sees\'
132 in the stream of bytes begins at its head.
4ea59a8b 133 At any point of decoding failure, the stream of text ends and reverts to (returns)
134 the original byte stream. Thus if the first bytes are already
135 un-decodable, the whole ByteString producer will be returned, i.e.
136
edd8726a 137> view utf8 bad_bytestream
4ea59a8b 138
139 will just come to the same as
140
edd8726a 141> return bad_bytestream
4ea59a8b 142
143 Where there is no decoding failure, the return value of the text stream will be
144 an empty byte stream followed by its own return value. In all cases you must
145 deal with the fact that it is a /ByteString producer/ that is returned, even if
146 it can be thrown away with @Control.Monad.void@
147
148> void (Bytes.stdin ^. utf8) :: Producer Text IO ()
edd8726a 149
150 The @eof@ lens permits you to pattern match: if there is a Right value,
151 it is the leftover bytestring producer, if there is a Right value, it
152 is the return value of the original bytestring producer:
153
154> Bytes.stdin ^. utf8 . eof :: Producer Text IO (Either (Producer ByteString IO IO) ())
4ea59a8b 155
edd8726a 156 Thus for the stream of un-decodable bytes mentioned above,
157
158> view (utf8 . eof) bad_bytestream
159
160 will be the same as
161
162> return (Left bad_bytestream)
163
4ea59a8b 164 @zoom@ converts a Text parser into a ByteString parser:
165
166> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
167
a4913c42 168 or, using the type synonymn from @Pipes.Parse@:
4ea59a8b 169
170> zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
171
edd8726a 172 Thus we can define a ByteString parser (in the pipes-parse sense) like this:
4ea59a8b 173
edd8726a 174> charPlusByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
175> charPlusByte = do char_ <- zoom utf8 Text.drawChar
4ea59a8b 176> byte_ <- Bytes.peekByte
177> return (char_, byte_)
178
edd8726a 179 Though @charPlusByte@ is partly defined with a Text parser 'drawChar';
4ea59a8b 180 but it is a ByteString parser; it will return the first valid utf8-encoded
edd8726a 181 Char in a ByteString, whatever its byte-length,
182 and the first byte following, if both exist. Because
4ea59a8b 183 we \'draw\' one and \'peek\' at the other, the parser as a whole only
184 advances one Char's length along the bytestring, whatever that length may be.
185 See the slightly more complex example \'decode.hs\' in the
edd8726a 186 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall blog>
187 discussion of this type of byte stream parsing.
4ea59a8b 188 -}
189
edd8726a 190type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
191
192type Codec
193 = forall m r
194 . Monad m
195 => Lens' (Producer ByteString m r)
196 (Producer Text m (Producer ByteString m r))
197
198
199{- | @decode@ is just the ordinary @view@ or @(^.)@ of the lens libraries;
200 exported here under a name appropriate to the material. Thus
201
202> decode utf8 bytes :: Producer Text IO (Producer ByteString IO ())
203
204 All of these are thus the same:
205
206> decode utf8 bytes = view utf8 bytes = bytes ^. utf8 = decodeUtf8 bytes
207
208
209-}
210
211decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
212decode codec a = getConstant (codec Constant a)
213
214{- | @eof@ tells you explicitly when decoding stops due to bad bytes or
215 instead reaches end-of-file happily. (Without it one just makes an explicit
216 test for emptiness of the resulting bytestring production using next) Thus
217
218> decode (utf8 . eof) bytes :: Producer T.Text IO (Either (Producer B.ByteString IO ()) ())
219
220 If we hit undecodable bytes, the remaining bytestring producer will be
221 returned as a Left value; in the happy case, a Right value is returned
222 with the anticipated return value for the original bytestring producer.
223
224 Again, all of these are the same
225
226> decode (utf8 . eof) bytes = view (utf8 . eof) p = p^.utf8.eof
227
228-}
229
230eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
231 (Producer Text m (Either (Producer ByteString m r) r))
e8336ba6 232eof k p0 = fmap fromEither (k (toEither p0)) where
edd8726a 233
234 fromEither = liftM (either id return)
235
236 toEither pp = do p <- pp
237 check p
238
239 check p = do e <- lift (next p)
240 case e of
241 Left r -> return (Right r)
242 Right (bs,pb) -> if B.null bs
243 then check pb
244 else return (Left (do yield bs
245 pb))
246
4ea59a8b 247utf8 :: Codec
248utf8 = mkCodec decodeUtf8 TE.encodeUtf8
249
250utf8Pure :: Codec
251utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
252
253utf16LE :: Codec
254utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
255
256utf16BE :: Codec
257utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
258
259utf32LE :: Codec
260utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
261
262utf32BE :: Codec
263utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
264
bbdfd305 265decodeStream :: Monad m
266 => (B.ByteString -> DecodeResult)
267 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
268decodeStream = loop where
269 loop dec0 p =
270 do x <- lift (next p)
e8336ba6 271 case x of
272 Left r -> return (return r)
273 Right (chunk, p') -> case dec0 chunk of
274 DecodeResultSuccess text dec -> do yield text
275 loop dec p'
276 DecodeResultFailure text bs -> do yield text
277 return (do yield bs
278 p')
bbdfd305 279{-# INLINABLE decodeStream#-}
280
6c2fffdc 281
0ac0c414 282{- $decoders
283 These are functions with the simple type:
284
285> decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
286
287 Thus in general
288
289> decodeUtf8 = view utf8
290> decodeUtf16LE = view utf16LE
fafcbeb5 291
0ac0c414 292 and so forth, but these forms
293 may be more convenient (and give better type errors!) where lenses are
294 not desired.
295-}
fafcbeb5 296
297
bbdfd305 298decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 299decodeUtf8 = decodeStream Stream.decodeUtf8
bbdfd305 300{-# INLINE decodeUtf8 #-}
301
302decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 303decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
bbdfd305 304{-# INLINE decodeUtf8Pure #-}
305
306decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 307decodeUtf16LE = decodeStream Stream.decodeUtf16LE
bbdfd305 308{-# INLINE decodeUtf16LE #-}
309
310decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 311decodeUtf16BE = decodeStream Stream.decodeUtf16BE
bbdfd305 312{-# INLINE decodeUtf16BE #-}
313
314decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 315decodeUtf32LE = decodeStream Stream.decodeUtf32LE
bbdfd305 316{-# INLINE decodeUtf32LE #-}
317
318decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 319decodeUtf32BE = decodeStream Stream.decodeUtf32BE
bbdfd305 320{-# INLINE decodeUtf32BE #-}
321
0ac0c414 322
323{- $encoders
324 These are simply defined
325
326> encodeUtf8 = yield . TE.encodeUtf8
327
328 They are intended for use with 'for'
329
330> for Text.stdin encodeUtf8 :: Producer ByteString IO ()
331
332 which would have the effect of
333
334> Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
335
336 using the encoding functions from Data.Text.Encoding
337-}
338
10cfd90e 339encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 340encodeUtf8 = yield . TE.encodeUtf8
10cfd90e 341encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 342encodeUtf16LE = yield . TE.encodeUtf16LE
10cfd90e 343encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 344encodeUtf16BE = yield . TE.encodeUtf16BE
10cfd90e 345encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 346encodeUtf32LE = yield . TE.encodeUtf32LE
10cfd90e 347encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 348encodeUtf32BE = yield . TE.encodeUtf32BE
349
bbdfd305 350mkCodec :: (forall r m . Monad m =>
351 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
352 -> (Text -> ByteString)
353 -> Codec
354mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
355
356
bbdfd305 357
fafcbeb5 358{- $ascii
359 ascii and latin encodings only use a small number of the characters 'Text'
360 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
bbdfd305 361 Rather we simply define functions each way.
bbdfd305 362-}
363
fafcbeb5 364
0ac0c414 365-- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
fafcbeb5 366-- returning the rest of the 'Text' at the first non-ascii 'Char'
367
bbdfd305 368encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
369encodeAscii = go where
370 go p = do e <- lift (next p)
371 case e of
372 Left r -> return (return r)
373 Right (chunk, p') ->
374 if T.null chunk
375 then go p'
376 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
377 in do yield (B8.pack (T.unpack safe))
378 if T.null unsafe
379 then go p'
380 else return $ do yield unsafe
381 p'
382
383{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
384 returning the rest of the 'Text' upon hitting any non-latin 'Char'
385 -}
386encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
387encodeIso8859_1 = go where
388 go p = do e <- lift (next p)
389 case e of
390 Left r -> return (return r)
391 Right (txt, p') ->
392 if T.null txt
393 then go p'
394 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
395 in do yield (B8.pack (T.unpack safe))
396 if T.null unsafe
397 then go p'
398 else return $ do yield unsafe
399 p'
400
401{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
402 unused 'ByteString' upon hitting an un-ascii byte.
403 -}
404decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
405decodeAscii = go where
406 go p = do e <- lift (next p)
407 case e of
408 Left r -> return (return r)
409 Right (chunk, p') ->
410 if B.null chunk
411 then go p'
412 else let (safe, unsafe) = B.span (<= 0x7F) chunk
413 in do yield (T.pack (B8.unpack safe))
414 if B.null unsafe
415 then go p'
416 else return (do yield unsafe
417 p')
418
419{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
420 unused 'ByteString' upon hitting the rare un-latinizable byte.
421 -}
422decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
423decodeIso8859_1 = go where
424 go p = do e <- lift (next p)
425 case e of
426 Left r -> return (return r)
427 Right (chunk, p') ->
428 if B.null chunk
429 then go p'
430 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
431 yield (T.pack (B8.unpack safe))
432 if B.null unsafe
433 then go p'
434 else return (do yield unsafe
435 p')
436
437
438