]> git.immae.eu Git - github/fretlink/text-pipes.git/blame - Pipes/Text/Encoding.hs
commentary
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
CommitLineData
bbdfd305 1{-# LANGUAGE RankNTypes, BangPatterns #-}
89d80557 2
b091cbeb 3-- | This module uses the stream decoding functions from
edd8726a 4-- <http://hackage.haskell.org/package/streaming-commons streaming-commons>
4ea59a8b 5-- package to define decoding functions and lenses. The exported names
b091cbeb 6-- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@
bbdfd305 7
8module Pipes.Text.Encoding
fafcbeb5 9 (
edd8726a 10 -- * Decoding ByteStrings and Encoding Texts
11 -- ** Simple usage
12 -- $usage
13
14 -- ** Lens usage
fafcbeb5 15 -- $lenses
edd8726a 16
17
18 -- * Basic lens operations
fafcbeb5 19 Codec
4ea59a8b 20 , decode
6c2fffdc 21 , eof
edd8726a 22 -- * Decoding lenses
bbdfd305 23 , utf8
24 , utf8Pure
25 , utf16LE
26 , utf16BE
27 , utf32LE
28 , utf32BE
fafcbeb5 29 -- * Non-lens decoding functions
0ac0c414 30 -- $decoders
89d80557 31 , decodeUtf8
32 , decodeUtf8Pure
33 , decodeUtf16LE
34 , decodeUtf16BE
35 , decodeUtf32LE
36 , decodeUtf32BE
0ac0c414 37 -- * Re-encoding functions
38 -- $encoders
39 , encodeUtf8
40 , encodeUtf16LE
41 , encodeUtf16BE
42 , encodeUtf32LE
43 , encodeUtf32BE
fafcbeb5 44 -- * Functions for latin and ascii text
45 -- $ascii
bbdfd305 46 , encodeAscii
47 , decodeAscii
48 , encodeIso8859_1
49 , decodeIso8859_1
50 )
51 where
52
0ac0c414 53import Data.Functor.Constant (Constant(..))
bbdfd305 54import Data.Char (ord)
55import Data.ByteString as B
bbdfd305 56import Data.ByteString.Char8 as B8
57import Data.Text (Text)
58import qualified Data.Text as T
59import qualified Data.Text.Encoding as TE
eae50557 60import qualified Data.Streaming.Text as Stream
61import Data.Streaming.Text (DecodeResult(..))
b091cbeb 62import Control.Monad (join, liftM)
bbdfd305 63import Pipes
bbdfd305 64
bbdfd305 65
0ac0c414 66
edd8726a 67{- $usage
4543ee0d 68 Encoding is of course simple. Given
fafcbeb5 69
edd8726a 70> text :: Producer Text IO ()
d199072b 71
4543ee0d 72 we can encode it with @Data.Text.Encoding.encodeUtf8@
73
74> TE.encodeUtf8 :: Text -> ByteString
75
76 and ordinary pipe operations:
0ac0c414 77
edd8726a 78> text >-> P.map TE.encodeUtf8 :: Producer.ByteString IO ()
0ac0c414 79
4543ee0d 80 or, equivalently
81
82> for text (yield . TE.encodeUtf8)
83
84 But, using this module, we might use
85
86> encodeUtf8 :: Text -> Producer ByteString m ()
87
88 to write
0ac0c414 89
edd8726a 90> for text encodeUtf8 :: Producer.ByteString IO ()
6c2fffdc 91
4543ee0d 92 All of the above come to the same.
93
94
95 Given
edd8726a 96
4543ee0d 97> bytes :: Producer ByteString IO ()
0ac0c414 98
edd8726a 99 we can apply a decoding function from this module:
6c2fffdc 100
edd8726a 101> decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ())
102
4543ee0d 103 The Text producer ends wherever decoding first fails. The un-decoded
104 material is returned. If we are confident it is of no interest, we can
105 write:
106
107> void $ decodeUtf8 bytes :: Producer Text IO ()
108
109 Thus we can re-encode
edd8726a 110 as uft8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g.
111
112> for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ())
113
114 The bytestring producer that is returned begins with where utf16BE decoding
e8336ba6 115 failed; if it didn't fail the producer is empty.
6c2fffdc 116
117-}
118
edd8726a 119{- $lenses
4543ee0d 120 We get a bit more flexibility, particularly in the use of pipes-style "parsers",
121 if we use a lens like @utf8@ or @utf16BE@
122 that focusses on the text in an appropriately encoded byte stream.
6c2fffdc 123
edd8726a 124> type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
6c2fffdc 125
4543ee0d 126 is just an alias for a Prelude type. We abbreviate this further, for our use case, as
6c2fffdc 127
edd8726a 128> type Codec
129> = forall m r . Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r))
6c2fffdc 130
edd8726a 131 and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can
132 re-encode what they have decoded. Thus you use any particular codec with
133 the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries;
4543ee0d 134 <http://hackage.haskell.org/package/lens lens>,
135 <http://hackage.haskell.org/package/lens-family lens-family>,
136 <http://hackage.haskell.org/package/lens-simple lens-simple>, or one of the
137 and <http://hackage.haskell.org/package/microlens microlens> packages will all work
138 the same, since we already have access to the types they require.
bbdfd305 139
edd8726a 140 Each decoding lens looks into a byte stream that is supposed to contain text.
141 The particular lenses are named in accordance with the expected
142 encoding, 'utf8', 'utf16LE' etc. To turn a such a lens or @Codec@
143 into an ordinary function, use @view@ / @(^.)@ -- here also called 'decode':
4ea59a8b 144
145> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
146> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
147> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
148
4543ee0d 149 Of course, we could always do this with the specialized decoding functions, e.g.
4ea59a8b 150
151> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
152> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
153
edd8726a 154 As with these functions, the stream of text that a @Codec@ \'sees\'
155 in the stream of bytes begins at its head.
4ea59a8b 156 At any point of decoding failure, the stream of text ends and reverts to (returns)
157 the original byte stream. Thus if the first bytes are already
158 un-decodable, the whole ByteString producer will be returned, i.e.
159
edd8726a 160> view utf8 bad_bytestream
4ea59a8b 161
162 will just come to the same as
163
edd8726a 164> return bad_bytestream
4ea59a8b 165
166 Where there is no decoding failure, the return value of the text stream will be
167 an empty byte stream followed by its own return value. In all cases you must
168 deal with the fact that it is a /ByteString producer/ that is returned, even if
169 it can be thrown away with @Control.Monad.void@
170
171> void (Bytes.stdin ^. utf8) :: Producer Text IO ()
edd8726a 172
173 The @eof@ lens permits you to pattern match: if there is a Right value,
174 it is the leftover bytestring producer, if there is a Right value, it
175 is the return value of the original bytestring producer:
176
177> Bytes.stdin ^. utf8 . eof :: Producer Text IO (Either (Producer ByteString IO IO) ())
4ea59a8b 178
edd8726a 179 Thus for the stream of un-decodable bytes mentioned above,
180
181> view (utf8 . eof) bad_bytestream
182
183 will be the same as
184
185> return (Left bad_bytestream)
186
4543ee0d 187 @zoom utf8@ converts a Text parser into a ByteString parser:
4ea59a8b 188
189> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
190
a4913c42 191 or, using the type synonymn from @Pipes.Parse@:
4ea59a8b 192
193> zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
194
edd8726a 195 Thus we can define a ByteString parser (in the pipes-parse sense) like this:
4ea59a8b 196
edd8726a 197> charPlusByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
198> charPlusByte = do char_ <- zoom utf8 Text.drawChar
4ea59a8b 199> byte_ <- Bytes.peekByte
200> return (char_, byte_)
201
edd8726a 202 Though @charPlusByte@ is partly defined with a Text parser 'drawChar';
4ea59a8b 203 but it is a ByteString parser; it will return the first valid utf8-encoded
4543ee0d 204 Char in a ByteString, /whatever its byte-length/,
edd8726a 205 and the first byte following, if both exist. Because
4ea59a8b 206 we \'draw\' one and \'peek\' at the other, the parser as a whole only
207 advances one Char's length along the bytestring, whatever that length may be.
208 See the slightly more complex example \'decode.hs\' in the
edd8726a 209 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall blog>
210 discussion of this type of byte stream parsing.
4ea59a8b 211 -}
212
edd8726a 213type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
214
215type Codec
216 = forall m r
217 . Monad m
218 => Lens' (Producer ByteString m r)
219 (Producer Text m (Producer ByteString m r))
220
221
222{- | @decode@ is just the ordinary @view@ or @(^.)@ of the lens libraries;
a31032bb 223 exported here under a name appropriate to the material.
224 Thus given a bytestring producer called @bytes@ we have
edd8726a 225
226> decode utf8 bytes :: Producer Text IO (Producer ByteString IO ())
227
228 All of these are thus the same:
229
a31032bb 230> decode utf8 bytes
231> view utf8 bytes
232> bytes ^. utf8
233> decodeUtf8 bytes
edd8726a 234
235-}
236
237decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
238decode codec a = getConstant (codec Constant a)
239
240{- | @eof@ tells you explicitly when decoding stops due to bad bytes or
241 instead reaches end-of-file happily. (Without it one just makes an explicit
242 test for emptiness of the resulting bytestring production using next) Thus
243
244> decode (utf8 . eof) bytes :: Producer T.Text IO (Either (Producer B.ByteString IO ()) ())
245
246 If we hit undecodable bytes, the remaining bytestring producer will be
247 returned as a Left value; in the happy case, a Right value is returned
248 with the anticipated return value for the original bytestring producer.
249
a31032bb 250 Given a bytestring producer called @bytes@ all of these will be the same:
edd8726a 251
a31032bb 252> decode (utf8 . eof) bytes
253> view (utf8 . eof) bytes
254> bytes^.utf8.eof
edd8726a 255
256-}
257
4543ee0d 258eof :: (Monad m, Monad (t m), MonadTrans t) => Lens' (t m (Producer ByteString m r))
259 (t m (Either (Producer ByteString m r) r))
e8336ba6 260eof k p0 = fmap fromEither (k (toEither p0)) where
edd8726a 261
262 fromEither = liftM (either id return)
263
264 toEither pp = do p <- pp
265 check p
266
267 check p = do e <- lift (next p)
268 case e of
269 Left r -> return (Right r)
270 Right (bs,pb) -> if B.null bs
271 then check pb
272 else return (Left (do yield bs
273 pb))
274
4ea59a8b 275utf8 :: Codec
276utf8 = mkCodec decodeUtf8 TE.encodeUtf8
277
278utf8Pure :: Codec
279utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
280
281utf16LE :: Codec
282utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
283
284utf16BE :: Codec
285utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
286
287utf32LE :: Codec
288utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
289
290utf32BE :: Codec
291utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
292
bbdfd305 293decodeStream :: Monad m
294 => (B.ByteString -> DecodeResult)
295 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
296decodeStream = loop where
297 loop dec0 p =
298 do x <- lift (next p)
e8336ba6 299 case x of
300 Left r -> return (return r)
301 Right (chunk, p') -> case dec0 chunk of
302 DecodeResultSuccess text dec -> do yield text
303 loop dec p'
304 DecodeResultFailure text bs -> do yield text
305 return (do yield bs
306 p')
bbdfd305 307{-# INLINABLE decodeStream#-}
308
6c2fffdc 309
0ac0c414 310{- $decoders
311 These are functions with the simple type:
312
313> decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
314
315 Thus in general
316
317> decodeUtf8 = view utf8
318> decodeUtf16LE = view utf16LE
fafcbeb5 319
0ac0c414 320 and so forth, but these forms
321 may be more convenient (and give better type errors!) where lenses are
322 not desired.
323-}
fafcbeb5 324
325
bbdfd305 326decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 327decodeUtf8 = decodeStream Stream.decodeUtf8
bbdfd305 328{-# INLINE decodeUtf8 #-}
329
330decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 331decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
bbdfd305 332{-# INLINE decodeUtf8Pure #-}
333
334decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 335decodeUtf16LE = decodeStream Stream.decodeUtf16LE
bbdfd305 336{-# INLINE decodeUtf16LE #-}
337
338decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 339decodeUtf16BE = decodeStream Stream.decodeUtf16BE
bbdfd305 340{-# INLINE decodeUtf16BE #-}
341
342decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 343decodeUtf32LE = decodeStream Stream.decodeUtf32LE
bbdfd305 344{-# INLINE decodeUtf32LE #-}
345
346decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 347decodeUtf32BE = decodeStream Stream.decodeUtf32BE
bbdfd305 348{-# INLINE decodeUtf32BE #-}
349
0ac0c414 350
351{- $encoders
352 These are simply defined
353
354> encodeUtf8 = yield . TE.encodeUtf8
355
356 They are intended for use with 'for'
357
358> for Text.stdin encodeUtf8 :: Producer ByteString IO ()
359
360 which would have the effect of
361
362> Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
363
364 using the encoding functions from Data.Text.Encoding
365-}
366
10cfd90e 367encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 368encodeUtf8 = yield . TE.encodeUtf8
10cfd90e 369encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 370encodeUtf16LE = yield . TE.encodeUtf16LE
10cfd90e 371encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 372encodeUtf16BE = yield . TE.encodeUtf16BE
10cfd90e 373encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 374encodeUtf32LE = yield . TE.encodeUtf32LE
10cfd90e 375encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 376encodeUtf32BE = yield . TE.encodeUtf32BE
377
bbdfd305 378mkCodec :: (forall r m . Monad m =>
379 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
380 -> (Text -> ByteString)
381 -> Codec
382mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
383
384
bbdfd305 385
fafcbeb5 386{- $ascii
387 ascii and latin encodings only use a small number of the characters 'Text'
388 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
bbdfd305 389 Rather we simply define functions each way.
bbdfd305 390-}
391
fafcbeb5 392
0ac0c414 393-- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
fafcbeb5 394-- returning the rest of the 'Text' at the first non-ascii 'Char'
395
bbdfd305 396encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
397encodeAscii = go where
398 go p = do e <- lift (next p)
399 case e of
400 Left r -> return (return r)
401 Right (chunk, p') ->
402 if T.null chunk
403 then go p'
404 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
405 in do yield (B8.pack (T.unpack safe))
406 if T.null unsafe
407 then go p'
408 else return $ do yield unsafe
409 p'
410
411{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
412 returning the rest of the 'Text' upon hitting any non-latin 'Char'
413 -}
414encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
415encodeIso8859_1 = go where
416 go p = do e <- lift (next p)
417 case e of
418 Left r -> return (return r)
419 Right (txt, p') ->
420 if T.null txt
421 then go p'
422 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
423 in do yield (B8.pack (T.unpack safe))
424 if T.null unsafe
425 then go p'
426 else return $ do yield unsafe
427 p'
428
429{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
430 unused 'ByteString' upon hitting an un-ascii byte.
431 -}
432decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
433decodeAscii = go where
434 go p = do e <- lift (next p)
435 case e of
436 Left r -> return (return r)
437 Right (chunk, p') ->
438 if B.null chunk
439 then go p'
440 else let (safe, unsafe) = B.span (<= 0x7F) chunk
441 in do yield (T.pack (B8.unpack safe))
442 if B.null unsafe
443 then go p'
444 else return (do yield unsafe
445 p')
446
447{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
448 unused 'ByteString' upon hitting the rare un-latinizable byte.
449 -}
450decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
451decodeIso8859_1 = go where
452 go p = do e <- lift (next p)
453 case e of
454 Left r -> return (return r)
455 Right (chunk, p') ->
456 if B.null chunk
457 then go p'
458 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
459 yield (T.pack (B8.unpack safe))
460 if B.null unsafe
461 then go p'
462 else return (do yield unsafe
463 p')
464
465
466