]> git.immae.eu Git - github/fretlink/text-pipes.git/blame - Pipes/Text/Encoding.hs
encoding documentation
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
CommitLineData
bbdfd305 1{-# LANGUAGE RankNTypes, BangPatterns #-}
89d80557 2
b091cbeb 3-- | This module uses the stream decoding functions from
edd8726a 4-- <http://hackage.haskell.org/package/streaming-commons streaming-commons>
4ea59a8b 5-- package to define decoding functions and lenses. The exported names
b091cbeb 6-- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@
bbdfd305 7
8module Pipes.Text.Encoding
fafcbeb5 9 (
edd8726a 10 -- * Decoding ByteStrings and Encoding Texts
11 -- ** Simple usage
12 -- $usage
13
14 -- ** Lens usage
fafcbeb5 15 -- $lenses
edd8726a 16
17
18 -- * Basic lens operations
fafcbeb5 19 Codec
4ea59a8b 20 , decode
6c2fffdc 21 , eof
edd8726a 22 -- * Decoding lenses
bbdfd305 23 , utf8
24 , utf8Pure
25 , utf16LE
26 , utf16BE
27 , utf32LE
28 , utf32BE
fafcbeb5 29 -- * Non-lens decoding functions
0ac0c414 30 -- $decoders
89d80557 31 , decodeUtf8
32 , decodeUtf8Pure
33 , decodeUtf16LE
34 , decodeUtf16BE
35 , decodeUtf32LE
36 , decodeUtf32BE
0ac0c414 37 -- * Re-encoding functions
38 -- $encoders
39 , encodeUtf8
40 , encodeUtf16LE
41 , encodeUtf16BE
42 , encodeUtf32LE
43 , encodeUtf32BE
fafcbeb5 44 -- * Functions for latin and ascii text
45 -- $ascii
bbdfd305 46 , encodeAscii
47 , decodeAscii
48 , encodeIso8859_1
49 , decodeIso8859_1
50 )
51 where
52
0ac0c414 53import Data.Functor.Constant (Constant(..))
bbdfd305 54import Data.Char (ord)
55import Data.ByteString as B
56import Data.ByteString (ByteString)
bbdfd305 57import Data.ByteString.Char8 as B8
58import Data.Text (Text)
59import qualified Data.Text as T
60import qualified Data.Text.Encoding as TE
eae50557 61import qualified Data.Streaming.Text as Stream
62import Data.Streaming.Text (DecodeResult(..))
b091cbeb 63import Control.Monad (join, liftM)
89d80557 64import Data.Word (Word8)
bbdfd305 65import Pipes
bbdfd305 66
bbdfd305 67
0ac0c414 68
edd8726a 69{- $usage
70 Given
fafcbeb5 71
edd8726a 72> text :: Producer Text IO ()
d199072b 73
edd8726a 74 we can encode it with @Data.Text.Encoding@ and ordinary pipe operations:
0ac0c414 75
edd8726a 76> text >-> P.map TE.encodeUtf8 :: Producer.ByteString IO ()
0ac0c414 77
edd8726a 78 or, using this module, with
0ac0c414 79
edd8726a 80> for text encodeUtf8 :: Producer.ByteString IO ()
6c2fffdc 81
edd8726a 82 Given
83
84> bytes :: Producer ByteString Text IO ()
0ac0c414 85
edd8726a 86 we can apply a decoding function from this module:
6c2fffdc 87
edd8726a 88> decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ())
89
90 The Text producer ends wherever decoding first fails. Thus we can re-encode
91 as uft8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g.
92
93> for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ())
94
95 The bytestring producer that is returned begins with where utf16BE decoding
96 failed; it it didn't fail the producer is empty.
6c2fffdc 97
98-}
99
edd8726a 100{- $lenses
101 We get a bit more flexibility, though, if we use a lens like @utf8@ or @utf16BE@
102 that looks for text in an appropriately encoded byte stream.
6c2fffdc 103
edd8726a 104> type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
6c2fffdc 105
edd8726a 106 is just an alias for a Prelude type. We abbreviate this further, for our use case, as
6c2fffdc 107
edd8726a 108> type Codec
109> = forall m r . Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r))
6c2fffdc 110
edd8726a 111 and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can
112 re-encode what they have decoded. Thus you use any particular codec with
113 the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries;
114 we presuppose neither <http://hackage.haskell.org/package/lens lens>
115 nor <http://hackage.haskell.org/package/lens-family lens-family>
116 since we already have access to the types they require.
bbdfd305 117
edd8726a 118 Each decoding lens looks into a byte stream that is supposed to contain text.
119 The particular lenses are named in accordance with the expected
120 encoding, 'utf8', 'utf16LE' etc. To turn a such a lens or @Codec@
121 into an ordinary function, use @view@ / @(^.)@ -- here also called 'decode':
4ea59a8b 122
123> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
124> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
125> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
126
edd8726a 127 These simple uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by
128 the specialized decoding functions exported here, e.g.
4ea59a8b 129
130> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
131> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
132
edd8726a 133 As with these functions, the stream of text that a @Codec@ \'sees\'
134 in the stream of bytes begins at its head.
4ea59a8b 135 At any point of decoding failure, the stream of text ends and reverts to (returns)
136 the original byte stream. Thus if the first bytes are already
137 un-decodable, the whole ByteString producer will be returned, i.e.
138
edd8726a 139> view utf8 bad_bytestream
4ea59a8b 140
141 will just come to the same as
142
edd8726a 143> return bad_bytestream
4ea59a8b 144
145 Where there is no decoding failure, the return value of the text stream will be
146 an empty byte stream followed by its own return value. In all cases you must
147 deal with the fact that it is a /ByteString producer/ that is returned, even if
148 it can be thrown away with @Control.Monad.void@
149
150> void (Bytes.stdin ^. utf8) :: Producer Text IO ()
edd8726a 151
152 The @eof@ lens permits you to pattern match: if there is a Right value,
153 it is the leftover bytestring producer, if there is a Right value, it
154 is the return value of the original bytestring producer:
155
156> Bytes.stdin ^. utf8 . eof :: Producer Text IO (Either (Producer ByteString IO IO) ())
4ea59a8b 157
edd8726a 158 Thus for the stream of un-decodable bytes mentioned above,
159
160> view (utf8 . eof) bad_bytestream
161
162 will be the same as
163
164> return (Left bad_bytestream)
165
4ea59a8b 166 @zoom@ converts a Text parser into a ByteString parser:
167
168> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
169
a4913c42 170 or, using the type synonymn from @Pipes.Parse@:
4ea59a8b 171
172> zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
173
edd8726a 174 Thus we can define a ByteString parser (in the pipes-parse sense) like this:
4ea59a8b 175
edd8726a 176> charPlusByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
177> charPlusByte = do char_ <- zoom utf8 Text.drawChar
4ea59a8b 178> byte_ <- Bytes.peekByte
179> return (char_, byte_)
180
edd8726a 181 Though @charPlusByte@ is partly defined with a Text parser 'drawChar';
4ea59a8b 182 but it is a ByteString parser; it will return the first valid utf8-encoded
edd8726a 183 Char in a ByteString, whatever its byte-length,
184 and the first byte following, if both exist. Because
4ea59a8b 185 we \'draw\' one and \'peek\' at the other, the parser as a whole only
186 advances one Char's length along the bytestring, whatever that length may be.
187 See the slightly more complex example \'decode.hs\' in the
edd8726a 188 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall blog>
189 discussion of this type of byte stream parsing.
4ea59a8b 190 -}
191
edd8726a 192type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
193
194type Codec
195 = forall m r
196 . Monad m
197 => Lens' (Producer ByteString m r)
198 (Producer Text m (Producer ByteString m r))
199
200
201{- | @decode@ is just the ordinary @view@ or @(^.)@ of the lens libraries;
202 exported here under a name appropriate to the material. Thus
203
204> decode utf8 bytes :: Producer Text IO (Producer ByteString IO ())
205
206 All of these are thus the same:
207
208> decode utf8 bytes = view utf8 bytes = bytes ^. utf8 = decodeUtf8 bytes
209
210
211-}
212
213decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
214decode codec a = getConstant (codec Constant a)
215
216{- | @eof@ tells you explicitly when decoding stops due to bad bytes or
217 instead reaches end-of-file happily. (Without it one just makes an explicit
218 test for emptiness of the resulting bytestring production using next) Thus
219
220> decode (utf8 . eof) bytes :: Producer T.Text IO (Either (Producer B.ByteString IO ()) ())
221
222 If we hit undecodable bytes, the remaining bytestring producer will be
223 returned as a Left value; in the happy case, a Right value is returned
224 with the anticipated return value for the original bytestring producer.
225
226 Again, all of these are the same
227
228> decode (utf8 . eof) bytes = view (utf8 . eof) p = p^.utf8.eof
229
230-}
231
232eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
233 (Producer Text m (Either (Producer ByteString m r) r))
234eof k p = fmap fromEither (k (toEither p)) where
235
236 fromEither = liftM (either id return)
237
238 toEither pp = do p <- pp
239 check p
240
241 check p = do e <- lift (next p)
242 case e of
243 Left r -> return (Right r)
244 Right (bs,pb) -> if B.null bs
245 then check pb
246 else return (Left (do yield bs
247 pb))
248
4ea59a8b 249utf8 :: Codec
250utf8 = mkCodec decodeUtf8 TE.encodeUtf8
251
252utf8Pure :: Codec
253utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
254
255utf16LE :: Codec
256utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
257
258utf16BE :: Codec
259utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
260
261utf32LE :: Codec
262utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
263
264utf32BE :: Codec
265utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
266
bbdfd305 267decodeStream :: Monad m
268 => (B.ByteString -> DecodeResult)
269 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
270decodeStream = loop where
271 loop dec0 p =
272 do x <- lift (next p)
273 case x of Left r -> return (return r)
274 Right (chunk, p') -> case dec0 chunk of
275 DecodeResultSuccess text dec -> do yield text
276 loop dec p'
277 DecodeResultFailure text bs -> do yield text
278 return (do yield bs
279 p')
280{-# INLINABLE decodeStream#-}
281
6c2fffdc 282
0ac0c414 283{- $decoders
284 These are functions with the simple type:
285
286> decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
287
288 Thus in general
289
290> decodeUtf8 = view utf8
291> decodeUtf16LE = view utf16LE
fafcbeb5 292
0ac0c414 293 and so forth, but these forms
294 may be more convenient (and give better type errors!) where lenses are
295 not desired.
296-}
fafcbeb5 297
298
bbdfd305 299decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 300decodeUtf8 = decodeStream Stream.decodeUtf8
bbdfd305 301{-# INLINE decodeUtf8 #-}
302
303decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 304decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
bbdfd305 305{-# INLINE decodeUtf8Pure #-}
306
307decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 308decodeUtf16LE = decodeStream Stream.decodeUtf16LE
bbdfd305 309{-# INLINE decodeUtf16LE #-}
310
311decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 312decodeUtf16BE = decodeStream Stream.decodeUtf16BE
bbdfd305 313{-# INLINE decodeUtf16BE #-}
314
315decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 316decodeUtf32LE = decodeStream Stream.decodeUtf32LE
bbdfd305 317{-# INLINE decodeUtf32LE #-}
318
319decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 320decodeUtf32BE = decodeStream Stream.decodeUtf32BE
bbdfd305 321{-# INLINE decodeUtf32BE #-}
322
0ac0c414 323
324{- $encoders
325 These are simply defined
326
327> encodeUtf8 = yield . TE.encodeUtf8
328
329 They are intended for use with 'for'
330
331> for Text.stdin encodeUtf8 :: Producer ByteString IO ()
332
333 which would have the effect of
334
335> Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
336
337 using the encoding functions from Data.Text.Encoding
338-}
339
10cfd90e 340encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 341encodeUtf8 = yield . TE.encodeUtf8
10cfd90e 342encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 343encodeUtf16LE = yield . TE.encodeUtf16LE
10cfd90e 344encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 345encodeUtf16BE = yield . TE.encodeUtf16BE
10cfd90e 346encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 347encodeUtf32LE = yield . TE.encodeUtf32LE
10cfd90e 348encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 349encodeUtf32BE = yield . TE.encodeUtf32BE
350
bbdfd305 351mkCodec :: (forall r m . Monad m =>
352 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
353 -> (Text -> ByteString)
354 -> Codec
355mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
356
357
bbdfd305 358
fafcbeb5 359{- $ascii
360 ascii and latin encodings only use a small number of the characters 'Text'
361 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
bbdfd305 362 Rather we simply define functions each way.
bbdfd305 363-}
364
fafcbeb5 365
0ac0c414 366-- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
fafcbeb5 367-- returning the rest of the 'Text' at the first non-ascii 'Char'
368
bbdfd305 369encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
370encodeAscii = go where
371 go p = do e <- lift (next p)
372 case e of
373 Left r -> return (return r)
374 Right (chunk, p') ->
375 if T.null chunk
376 then go p'
377 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
378 in do yield (B8.pack (T.unpack safe))
379 if T.null unsafe
380 then go p'
381 else return $ do yield unsafe
382 p'
383
384{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
385 returning the rest of the 'Text' upon hitting any non-latin 'Char'
386 -}
387encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
388encodeIso8859_1 = go where
389 go p = do e <- lift (next p)
390 case e of
391 Left r -> return (return r)
392 Right (txt, p') ->
393 if T.null txt
394 then go p'
395 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
396 in do yield (B8.pack (T.unpack safe))
397 if T.null unsafe
398 then go p'
399 else return $ do yield unsafe
400 p'
401
402{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
403 unused 'ByteString' upon hitting an un-ascii byte.
404 -}
405decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
406decodeAscii = go where
407 go p = do e <- lift (next p)
408 case e of
409 Left r -> return (return r)
410 Right (chunk, p') ->
411 if B.null chunk
412 then go p'
413 else let (safe, unsafe) = B.span (<= 0x7F) chunk
414 in do yield (T.pack (B8.unpack safe))
415 if B.null unsafe
416 then go p'
417 else return (do yield unsafe
418 p')
419
420{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
421 unused 'ByteString' upon hitting the rare un-latinizable byte.
422 -}
423decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
424decodeIso8859_1 = go where
425 go p = do e <- lift (next p)
426 case e of
427 Left r -> return (return r)
428 Right (chunk, p') ->
429 if B.null chunk
430 then go p'
431 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
432 yield (T.pack (B8.unpack safe))
433 if B.null unsafe
434 then go p'
435 else return (do yield unsafe
436 p')
437
438
439