]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Encoding.hs
commentary
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
1 {-# LANGUAGE RankNTypes, BangPatterns #-}
2
3 -- | This module uses the stream decoding functions from
4 -- <http://hackage.haskell.org/package/streaming-commons streaming-commons>
5 -- package to define decoding functions and lenses. The exported names
6 -- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@
7
8 module Pipes.Text.Encoding
9 (
10 -- * Decoding ByteStrings and Encoding Texts
11 -- ** Simple usage
12 -- $usage
13
14 -- ** Lens usage
15 -- $lenses
16
17
18 -- * Basic lens operations
19 Codec
20 , decode
21 , eof
22 -- * Decoding lenses
23 , utf8
24 , utf8Pure
25 , utf16LE
26 , utf16BE
27 , utf32LE
28 , utf32BE
29 -- * Non-lens decoding functions
30 -- $decoders
31 , decodeUtf8
32 , decodeUtf8Pure
33 , decodeUtf16LE
34 , decodeUtf16BE
35 , decodeUtf32LE
36 , decodeUtf32BE
37 -- * Re-encoding functions
38 -- $encoders
39 , encodeUtf8
40 , encodeUtf16LE
41 , encodeUtf16BE
42 , encodeUtf32LE
43 , encodeUtf32BE
44 -- * Functions for latin and ascii text
45 -- $ascii
46 , encodeAscii
47 , decodeAscii
48 , encodeIso8859_1
49 , decodeIso8859_1
50 )
51 where
52
53 import Data.Functor.Constant (Constant(..))
54 import Data.Char (ord)
55 import Data.ByteString as B
56 import Data.ByteString.Char8 as B8
57 import Data.Text (Text)
58 import qualified Data.Text as T
59 import qualified Data.Text.Encoding as TE
60 import qualified Data.Streaming.Text as Stream
61 import Data.Streaming.Text (DecodeResult(..))
62 import Control.Monad (join, liftM)
63 import Pipes
64
65
66
67 {- $usage
68 Encoding is of course simple. Given
69
70 > text :: Producer Text IO ()
71
72 we can encode it with @Data.Text.Encoding.encodeUtf8@
73
74 > TE.encodeUtf8 :: Text -> ByteString
75
76 and ordinary pipe operations:
77
78 > text >-> P.map TE.encodeUtf8 :: Producer.ByteString IO ()
79
80 or, equivalently
81
82 > for text (yield . TE.encodeUtf8)
83
84 But, using this module, we might use
85
86 > encodeUtf8 :: Text -> Producer ByteString m ()
87
88 to write
89
90 > for text encodeUtf8 :: Producer.ByteString IO ()
91
92 All of the above come to the same.
93
94
95 Given
96
97 > bytes :: Producer ByteString IO ()
98
99 we can apply a decoding function from this module:
100
101 > decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ())
102
103 The Text producer ends wherever decoding first fails. The un-decoded
104 material is returned. If we are confident it is of no interest, we can
105 write:
106
107 > void $ decodeUtf8 bytes :: Producer Text IO ()
108
109 Thus we can re-encode
110 as uft8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g.
111
112 > for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ())
113
114 The bytestring producer that is returned begins with where utf16BE decoding
115 failed; if it didn't fail the producer is empty.
116
117 -}
118
119 {- $lenses
120 We get a bit more flexibility, particularly in the use of pipes-style "parsers",
121 if we use a lens like @utf8@ or @utf16BE@
122 that focusses on the text in an appropriately encoded byte stream.
123
124 > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
125
126 is just an alias for a Prelude type. We abbreviate this further, for our use case, as
127
128 > type Codec
129 > = forall m r . Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r))
130
131 and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can
132 re-encode what they have decoded. Thus you use any particular codec with
133 the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries;
134 <http://hackage.haskell.org/package/lens lens>,
135 <http://hackage.haskell.org/package/lens-family lens-family>,
136 <http://hackage.haskell.org/package/lens-simple lens-simple>, or one of the
137 and <http://hackage.haskell.org/package/microlens microlens> packages will all work
138 the same, since we already have access to the types they require.
139
140 Each decoding lens looks into a byte stream that is supposed to contain text.
141 The particular lenses are named in accordance with the expected
142 encoding, 'utf8', 'utf16LE' etc. To turn a such a lens or @Codec@
143 into an ordinary function, use @view@ / @(^.)@ -- here also called 'decode':
144
145 > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
146 > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
147 > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
148
149 Of course, we could always do this with the specialized decoding functions, e.g.
150
151 > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
152 > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
153
154 As with these functions, the stream of text that a @Codec@ \'sees\'
155 in the stream of bytes begins at its head.
156 At any point of decoding failure, the stream of text ends and reverts to (returns)
157 the original byte stream. Thus if the first bytes are already
158 un-decodable, the whole ByteString producer will be returned, i.e.
159
160 > view utf8 bad_bytestream
161
162 will just come to the same as
163
164 > return bad_bytestream
165
166 Where there is no decoding failure, the return value of the text stream will be
167 an empty byte stream followed by its own return value. In all cases you must
168 deal with the fact that it is a /ByteString producer/ that is returned, even if
169 it can be thrown away with @Control.Monad.void@
170
171 > void (Bytes.stdin ^. utf8) :: Producer Text IO ()
172
173 The @eof@ lens permits you to pattern match: if there is a Right value,
174 it is the leftover bytestring producer, if there is a Right value, it
175 is the return value of the original bytestring producer:
176
177 > Bytes.stdin ^. utf8 . eof :: Producer Text IO (Either (Producer ByteString IO IO) ())
178
179 Thus for the stream of un-decodable bytes mentioned above,
180
181 > view (utf8 . eof) bad_bytestream
182
183 will be the same as
184
185 > return (Left bad_bytestream)
186
187 @zoom utf8@ converts a Text parser into a ByteString parser:
188
189 > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
190
191 or, using the type synonymn from @Pipes.Parse@:
192
193 > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
194
195 Thus we can define a ByteString parser (in the pipes-parse sense) like this:
196
197 > charPlusByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
198 > charPlusByte = do char_ <- zoom utf8 Text.drawChar
199 > byte_ <- Bytes.peekByte
200 > return (char_, byte_)
201
202 Though @charPlusByte@ is partly defined with a Text parser 'drawChar';
203 but it is a ByteString parser; it will return the first valid utf8-encoded
204 Char in a ByteString, /whatever its byte-length/,
205 and the first byte following, if both exist. Because
206 we \'draw\' one and \'peek\' at the other, the parser as a whole only
207 advances one Char's length along the bytestring, whatever that length may be.
208 See the slightly more complex example \'decode.hs\' in the
209 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall blog>
210 discussion of this type of byte stream parsing.
211 -}
212
213 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
214
215 type Codec
216 = forall m r
217 . Monad m
218 => Lens' (Producer ByteString m r)
219 (Producer Text m (Producer ByteString m r))
220
221
222 {- | @decode@ is just the ordinary @view@ or @(^.)@ of the lens libraries;
223 exported here under a name appropriate to the material.
224 Thus given a bytestring producer called @bytes@ we have
225
226 > decode utf8 bytes :: Producer Text IO (Producer ByteString IO ())
227
228 All of these are thus the same:
229
230 > decode utf8 bytes
231 > view utf8 bytes
232 > bytes ^. utf8
233 > decodeUtf8 bytes
234
235 -}
236
237 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
238 decode codec a = getConstant (codec Constant a)
239
240 {- | @eof@ tells you explicitly when decoding stops due to bad bytes or
241 instead reaches end-of-file happily. (Without it one just makes an explicit
242 test for emptiness of the resulting bytestring production using next) Thus
243
244 > decode (utf8 . eof) bytes :: Producer T.Text IO (Either (Producer B.ByteString IO ()) ())
245
246 If we hit undecodable bytes, the remaining bytestring producer will be
247 returned as a Left value; in the happy case, a Right value is returned
248 with the anticipated return value for the original bytestring producer.
249
250 Given a bytestring producer called @bytes@ all of these will be the same:
251
252 > decode (utf8 . eof) bytes
253 > view (utf8 . eof) bytes
254 > bytes^.utf8.eof
255
256 -}
257
258 eof :: (Monad m, Monad (t m), MonadTrans t) => Lens' (t m (Producer ByteString m r))
259 (t m (Either (Producer ByteString m r) r))
260 eof k p0 = fmap fromEither (k (toEither p0)) where
261
262 fromEither = liftM (either id return)
263
264 toEither pp = do p <- pp
265 check p
266
267 check p = do e <- lift (next p)
268 case e of
269 Left r -> return (Right r)
270 Right (bs,pb) -> if B.null bs
271 then check pb
272 else return (Left (do yield bs
273 pb))
274
275 utf8 :: Codec
276 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
277
278 utf8Pure :: Codec
279 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
280
281 utf16LE :: Codec
282 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
283
284 utf16BE :: Codec
285 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
286
287 utf32LE :: Codec
288 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
289
290 utf32BE :: Codec
291 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
292
293 decodeStream :: Monad m
294 => (B.ByteString -> DecodeResult)
295 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
296 decodeStream = loop where
297 loop dec0 p =
298 do x <- lift (next p)
299 case x of
300 Left r -> return (return r)
301 Right (chunk, p') -> case dec0 chunk of
302 DecodeResultSuccess text dec -> do yield text
303 loop dec p'
304 DecodeResultFailure text bs -> do yield text
305 return (do yield bs
306 p')
307 {-# INLINABLE decodeStream#-}
308
309
310 {- $decoders
311 These are functions with the simple type:
312
313 > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
314
315 Thus in general
316
317 > decodeUtf8 = view utf8
318 > decodeUtf16LE = view utf16LE
319
320 and so forth, but these forms
321 may be more convenient (and give better type errors!) where lenses are
322 not desired.
323 -}
324
325
326 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
327 decodeUtf8 = decodeStream Stream.decodeUtf8
328 {-# INLINE decodeUtf8 #-}
329
330 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
331 decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
332 {-# INLINE decodeUtf8Pure #-}
333
334 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
335 decodeUtf16LE = decodeStream Stream.decodeUtf16LE
336 {-# INLINE decodeUtf16LE #-}
337
338 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
339 decodeUtf16BE = decodeStream Stream.decodeUtf16BE
340 {-# INLINE decodeUtf16BE #-}
341
342 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
343 decodeUtf32LE = decodeStream Stream.decodeUtf32LE
344 {-# INLINE decodeUtf32LE #-}
345
346 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
347 decodeUtf32BE = decodeStream Stream.decodeUtf32BE
348 {-# INLINE decodeUtf32BE #-}
349
350
351 {- $encoders
352 These are simply defined
353
354 > encodeUtf8 = yield . TE.encodeUtf8
355
356 They are intended for use with 'for'
357
358 > for Text.stdin encodeUtf8 :: Producer ByteString IO ()
359
360 which would have the effect of
361
362 > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
363
364 using the encoding functions from Data.Text.Encoding
365 -}
366
367 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
368 encodeUtf8 = yield . TE.encodeUtf8
369 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
370 encodeUtf16LE = yield . TE.encodeUtf16LE
371 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
372 encodeUtf16BE = yield . TE.encodeUtf16BE
373 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
374 encodeUtf32LE = yield . TE.encodeUtf32LE
375 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
376 encodeUtf32BE = yield . TE.encodeUtf32BE
377
378 mkCodec :: (forall r m . Monad m =>
379 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
380 -> (Text -> ByteString)
381 -> Codec
382 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
383
384
385
386 {- $ascii
387 ascii and latin encodings only use a small number of the characters 'Text'
388 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
389 Rather we simply define functions each way.
390 -}
391
392
393 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
394 -- returning the rest of the 'Text' at the first non-ascii 'Char'
395
396 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
397 encodeAscii = go where
398 go p = do e <- lift (next p)
399 case e of
400 Left r -> return (return r)
401 Right (chunk, p') ->
402 if T.null chunk
403 then go p'
404 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
405 in do yield (B8.pack (T.unpack safe))
406 if T.null unsafe
407 then go p'
408 else return $ do yield unsafe
409 p'
410
411 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
412 returning the rest of the 'Text' upon hitting any non-latin 'Char'
413 -}
414 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
415 encodeIso8859_1 = go where
416 go p = do e <- lift (next p)
417 case e of
418 Left r -> return (return r)
419 Right (txt, p') ->
420 if T.null txt
421 then go p'
422 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
423 in do yield (B8.pack (T.unpack safe))
424 if T.null unsafe
425 then go p'
426 else return $ do yield unsafe
427 p'
428
429 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
430 unused 'ByteString' upon hitting an un-ascii byte.
431 -}
432 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
433 decodeAscii = go where
434 go p = do e <- lift (next p)
435 case e of
436 Left r -> return (return r)
437 Right (chunk, p') ->
438 if B.null chunk
439 then go p'
440 else let (safe, unsafe) = B.span (<= 0x7F) chunk
441 in do yield (T.pack (B8.unpack safe))
442 if B.null unsafe
443 then go p'
444 else return (do yield unsafe
445 p')
446
447 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
448 unused 'ByteString' upon hitting the rare un-latinizable byte.
449 -}
450 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
451 decodeIso8859_1 = go where
452 go p = do e <- lift (next p)
453 case e of
454 Left r -> return (return r)
455 Right (chunk, p') ->
456 if B.null chunk
457 then go p'
458 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
459 yield (T.pack (B8.unpack safe))
460 if B.null unsafe
461 then go p'
462 else return (do yield unsafe
463 p')
464
465
466