]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Encoding.hs
travis
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
1 {-# LANGUAGE RankNTypes, BangPatterns #-}
2
3 -- | This module uses the stream decoding functions from
4 -- <http://hackage.haskell.org/package/streaming-commons streaming-commons>
5 -- package to define decoding functions and lenses. The exported names
6 -- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@
7
8 module Pipes.Text.Encoding
9 (
10 -- * Decoding ByteStrings and Encoding Texts
11 -- ** Simple usage
12 -- $usage
13
14 -- ** Lens usage
15 -- $lenses
16
17
18 -- * Basic lens operations
19 Codec
20 , decode
21 , eof
22 -- * Decoding lenses
23 , utf8
24 , utf8Pure
25 , utf16LE
26 , utf16BE
27 , utf32LE
28 , utf32BE
29 -- * Non-lens decoding functions
30 -- $decoders
31 , decodeUtf8
32 , decodeUtf8Pure
33 , decodeUtf16LE
34 , decodeUtf16BE
35 , decodeUtf32LE
36 , decodeUtf32BE
37 -- * Re-encoding functions
38 -- $encoders
39 , encodeUtf8
40 , encodeUtf16LE
41 , encodeUtf16BE
42 , encodeUtf32LE
43 , encodeUtf32BE
44 -- * Functions for latin and ascii text
45 -- $ascii
46 , encodeAscii
47 , decodeAscii
48 , encodeIso8859_1
49 , decodeIso8859_1
50 )
51 where
52
53 import Data.Functor.Constant (Constant(..))
54 import Data.Char (ord)
55 import Data.ByteString as B
56 import Data.ByteString.Char8 as B8
57 import Data.Text (Text)
58 import qualified Data.Text as T
59 import qualified Data.Text.Encoding as TE
60 import qualified Data.Streaming.Text as Stream
61 import Data.Streaming.Text (DecodeResult(..))
62 import Control.Monad (join, liftM)
63 import Pipes
64
65
66
67 {- $usage
68 Encoding is of course simple. Given
69
70 > text :: Producer Text IO ()
71
72 we can encode it with @Data.Text.Encoding.encodeUtf8@
73
74 > TE.encodeUtf8 :: Text -> ByteString
75
76 and ordinary pipe operations:
77
78 > text >-> P.map TE.encodeUtf8 :: Producer.ByteString IO ()
79
80 or, equivalently
81
82 > for text (yield . TE.encodeUtf8)
83
84 But, using this module, we might use
85
86 > encodeUtf8 :: Text -> Producer ByteString m ()
87
88 to write
89
90 > for text encodeUtf8 :: Producer.ByteString IO ()
91
92 All of the above come to the same.
93
94
95 Given
96
97 > bytes :: Producer ByteString IO ()
98
99 we can apply a decoding function from this module:
100
101 > decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ())
102
103 The Text producer ends wherever decoding first fails. The un-decoded
104 material is returned. If we are confident it is of no interest, we can
105 write:
106
107 > void $ decodeUtf8 bytes :: Producer Text IO ()
108
109 Thus we can re-encode
110 as uft8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g.
111
112 > for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ())
113
114 The bytestring producer that is returned begins with where utf16BE decoding
115 failed; if it didn't fail the producer is empty.
116
117 -}
118
119 {- $lenses
120 We get a bit more flexibility, particularly in the use of pipes-style "parsers",
121 if we use a lens like @utf8@ or @utf16BE@
122 that focusses on the text in an appropriately encoded byte stream.
123
124 > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
125
126 is just an alias for a Prelude type. We abbreviate this further, for our use case, as
127
128 > type Codec
129 > = forall m r . Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r))
130
131 and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can
132 re-encode what they have decoded. Thus you use any particular codec with
133 the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries;
134 <http://hackage.haskell.org/package/lens lens>,
135 <http://hackage.haskell.org/package/lens-family lens-family>,
136 <http://hackage.haskell.org/package/lens-simple lens-simple>, or one of the
137 and <http://hackage.haskell.org/package/microlens microlens> packages will all work
138 the same, since we already have access to the types they require.
139
140 Each decoding lens looks into a byte stream that is supposed to contain text.
141 The particular lenses are named in accordance with the expected
142 encoding, 'utf8', 'utf16LE' etc. To turn a such a lens or @Codec@
143 into an ordinary function, use @view@ / @(^.)@ -- here also called 'decode':
144
145 > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
146 > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
147 > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
148
149 Of course, we could always do this with the specialized decoding functions, e.g.
150
151 > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
152 > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
153
154 As with these functions, the stream of text that a @Codec@ \'sees\'
155 in the stream of bytes begins at its head.
156 At any point of decoding failure, the stream of text ends and reverts to (returns)
157 the original byte stream. Thus if the first bytes are already
158 un-decodable, the whole ByteString producer will be returned, i.e.
159
160 > view utf8 bad_bytestream
161
162 will just come to the same as
163
164 > return bad_bytestream
165
166 Where there is no decoding failure, the return value of the text stream will be
167 an empty byte stream followed by its own return value. In all cases you must
168 deal with the fact that it is a /ByteString producer/ that is returned, even if
169 it can be thrown away with @Control.Monad.void@
170
171 > void (Bytes.stdin ^. utf8) :: Producer Text IO ()
172
173 The @eof@ lens permits you to pattern match: if there is a Right value,
174 it is the leftover bytestring producer, if there is a Right value, it
175 is the return value of the original bytestring producer:
176
177 > Bytes.stdin ^. utf8 . eof :: Producer Text IO (Either (Producer ByteString IO IO) ())
178
179 Thus for the stream of un-decodable bytes mentioned above,
180
181 > view (utf8 . eof) bad_bytestream
182
183 will be the same as
184
185 > return (Left bad_bytestream)
186
187 @zoom utf8@ converts a Text parser into a ByteString parser:
188
189 > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
190
191 or, using the type synonymn from @Pipes.Parse@:
192
193 > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
194
195 Thus we can define a ByteString parser (in the pipes-parse sense) like this:
196
197 > charPlusByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
198 > charPlusByte = do char_ <- zoom utf8 Text.drawChar
199 > byte_ <- Bytes.peekByte
200 > return (char_, byte_)
201
202 Though @charPlusByte@ is partly defined with a Text parser 'drawChar';
203 but it is a ByteString parser; it will return the first valid utf8-encoded
204 Char in a ByteString, /whatever its byte-length/,
205 and the first byte following, if both exist. Because
206 we \'draw\' one and \'peek\' at the other, the parser as a whole only
207 advances one Char's length along the bytestring, whatever that length may be.
208 See the slightly more complex example \'decode.hs\' in the
209 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall blog>
210 discussion of this type of byte stream parsing.
211 -}
212
213 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
214
215 type Codec
216 = forall m r
217 . Monad m
218 => Lens' (Producer ByteString m r)
219 (Producer Text m (Producer ByteString m r))
220
221
222 {- | @decode@ is just the ordinary @view@ or @(^.)@ of the lens libraries;
223 exported here under a name appropriate to the material. Thus
224
225 > decode utf8 bytes :: Producer Text IO (Producer ByteString IO ())
226
227 All of these are thus the same:
228
229 > decode utf8 bytes = view utf8 bytes = bytes ^. utf8 = decodeUtf8 bytes
230
231
232 -}
233
234 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
235 decode codec a = getConstant (codec Constant a)
236
237 {- | @eof@ tells you explicitly when decoding stops due to bad bytes or
238 instead reaches end-of-file happily. (Without it one just makes an explicit
239 test for emptiness of the resulting bytestring production using next) Thus
240
241 > decode (utf8 . eof) bytes :: Producer T.Text IO (Either (Producer B.ByteString IO ()) ())
242
243 If we hit undecodable bytes, the remaining bytestring producer will be
244 returned as a Left value; in the happy case, a Right value is returned
245 with the anticipated return value for the original bytestring producer.
246
247 Again, all of these are the same
248
249 > decode (utf8 . eof) bytes = view (utf8 . eof) p = p^.utf8.eof
250
251 -}
252
253 eof :: (Monad m, Monad (t m), MonadTrans t) => Lens' (t m (Producer ByteString m r))
254 (t m (Either (Producer ByteString m r) r))
255 eof k p0 = fmap fromEither (k (toEither p0)) where
256
257 fromEither = liftM (either id return)
258
259 toEither pp = do p <- pp
260 check p
261
262 check p = do e <- lift (next p)
263 case e of
264 Left r -> return (Right r)
265 Right (bs,pb) -> if B.null bs
266 then check pb
267 else return (Left (do yield bs
268 pb))
269
270 utf8 :: Codec
271 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
272
273 utf8Pure :: Codec
274 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
275
276 utf16LE :: Codec
277 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
278
279 utf16BE :: Codec
280 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
281
282 utf32LE :: Codec
283 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
284
285 utf32BE :: Codec
286 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
287
288 decodeStream :: Monad m
289 => (B.ByteString -> DecodeResult)
290 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
291 decodeStream = loop where
292 loop dec0 p =
293 do x <- lift (next p)
294 case x of
295 Left r -> return (return r)
296 Right (chunk, p') -> case dec0 chunk of
297 DecodeResultSuccess text dec -> do yield text
298 loop dec p'
299 DecodeResultFailure text bs -> do yield text
300 return (do yield bs
301 p')
302 {-# INLINABLE decodeStream#-}
303
304
305 {- $decoders
306 These are functions with the simple type:
307
308 > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
309
310 Thus in general
311
312 > decodeUtf8 = view utf8
313 > decodeUtf16LE = view utf16LE
314
315 and so forth, but these forms
316 may be more convenient (and give better type errors!) where lenses are
317 not desired.
318 -}
319
320
321 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
322 decodeUtf8 = decodeStream Stream.decodeUtf8
323 {-# INLINE decodeUtf8 #-}
324
325 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
326 decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
327 {-# INLINE decodeUtf8Pure #-}
328
329 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
330 decodeUtf16LE = decodeStream Stream.decodeUtf16LE
331 {-# INLINE decodeUtf16LE #-}
332
333 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
334 decodeUtf16BE = decodeStream Stream.decodeUtf16BE
335 {-# INLINE decodeUtf16BE #-}
336
337 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
338 decodeUtf32LE = decodeStream Stream.decodeUtf32LE
339 {-# INLINE decodeUtf32LE #-}
340
341 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
342 decodeUtf32BE = decodeStream Stream.decodeUtf32BE
343 {-# INLINE decodeUtf32BE #-}
344
345
346 {- $encoders
347 These are simply defined
348
349 > encodeUtf8 = yield . TE.encodeUtf8
350
351 They are intended for use with 'for'
352
353 > for Text.stdin encodeUtf8 :: Producer ByteString IO ()
354
355 which would have the effect of
356
357 > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
358
359 using the encoding functions from Data.Text.Encoding
360 -}
361
362 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
363 encodeUtf8 = yield . TE.encodeUtf8
364 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
365 encodeUtf16LE = yield . TE.encodeUtf16LE
366 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
367 encodeUtf16BE = yield . TE.encodeUtf16BE
368 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
369 encodeUtf32LE = yield . TE.encodeUtf32LE
370 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
371 encodeUtf32BE = yield . TE.encodeUtf32BE
372
373 mkCodec :: (forall r m . Monad m =>
374 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
375 -> (Text -> ByteString)
376 -> Codec
377 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
378
379
380
381 {- $ascii
382 ascii and latin encodings only use a small number of the characters 'Text'
383 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
384 Rather we simply define functions each way.
385 -}
386
387
388 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
389 -- returning the rest of the 'Text' at the first non-ascii 'Char'
390
391 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
392 encodeAscii = go where
393 go p = do e <- lift (next p)
394 case e of
395 Left r -> return (return r)
396 Right (chunk, p') ->
397 if T.null chunk
398 then go p'
399 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
400 in do yield (B8.pack (T.unpack safe))
401 if T.null unsafe
402 then go p'
403 else return $ do yield unsafe
404 p'
405
406 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
407 returning the rest of the 'Text' upon hitting any non-latin 'Char'
408 -}
409 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
410 encodeIso8859_1 = go where
411 go p = do e <- lift (next p)
412 case e of
413 Left r -> return (return r)
414 Right (txt, p') ->
415 if T.null txt
416 then go p'
417 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
418 in do yield (B8.pack (T.unpack safe))
419 if T.null unsafe
420 then go p'
421 else return $ do yield unsafe
422 p'
423
424 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
425 unused 'ByteString' upon hitting an un-ascii byte.
426 -}
427 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
428 decodeAscii = go where
429 go p = do e <- lift (next p)
430 case e of
431 Left r -> return (return r)
432 Right (chunk, p') ->
433 if B.null chunk
434 then go p'
435 else let (safe, unsafe) = B.span (<= 0x7F) chunk
436 in do yield (T.pack (B8.unpack safe))
437 if B.null unsafe
438 then go p'
439 else return (do yield unsafe
440 p')
441
442 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
443 unused 'ByteString' upon hitting the rare un-latinizable byte.
444 -}
445 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
446 decodeIso8859_1 = go where
447 go p = do e <- lift (next p)
448 case e of
449 Left r -> return (return r)
450 Right (chunk, p') ->
451 if B.null chunk
452 then go p'
453 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
454 yield (T.pack (B8.unpack safe))
455 if B.null unsafe
456 then go p'
457 else return (do yield unsafe
458 p')
459
460
461