]>
Commit | Line | Data |
---|---|---|
bbdfd305 | 1 | {-# LANGUAGE RankNTypes, BangPatterns #-} |
89d80557 | 2 | |
b091cbeb | 3 | -- | This module uses the stream decoding functions from |
edd8726a | 4 | -- <http://hackage.haskell.org/package/streaming-commons streaming-commons> |
4ea59a8b | 5 | -- package to define decoding functions and lenses. The exported names |
b091cbeb | 6 | -- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@ |
bbdfd305 | 7 | |
8 | module Pipes.Text.Encoding | |
fafcbeb5 | 9 | ( |
edd8726a | 10 | -- * Decoding ByteStrings and Encoding Texts |
11 | -- ** Simple usage | |
12 | -- $usage | |
13 | ||
14 | -- ** Lens usage | |
fafcbeb5 | 15 | -- $lenses |
edd8726a | 16 | |
17 | ||
18 | -- * Basic lens operations | |
fafcbeb5 | 19 | Codec |
4ea59a8b | 20 | , decode |
6c2fffdc | 21 | , eof |
edd8726a | 22 | -- * Decoding lenses |
bbdfd305 | 23 | , utf8 |
24 | , utf8Pure | |
25 | , utf16LE | |
26 | , utf16BE | |
27 | , utf32LE | |
28 | , utf32BE | |
fafcbeb5 | 29 | -- * Non-lens decoding functions |
0ac0c414 | 30 | -- $decoders |
89d80557 | 31 | , decodeUtf8 |
32 | , decodeUtf8Pure | |
33 | , decodeUtf16LE | |
34 | , decodeUtf16BE | |
35 | , decodeUtf32LE | |
36 | , decodeUtf32BE | |
0ac0c414 | 37 | -- * Re-encoding functions |
38 | -- $encoders | |
39 | , encodeUtf8 | |
40 | , encodeUtf16LE | |
41 | , encodeUtf16BE | |
42 | , encodeUtf32LE | |
43 | , encodeUtf32BE | |
fafcbeb5 | 44 | -- * Functions for latin and ascii text |
45 | -- $ascii | |
bbdfd305 | 46 | , encodeAscii |
47 | , decodeAscii | |
48 | , encodeIso8859_1 | |
49 | , decodeIso8859_1 | |
50 | ) | |
51 | where | |
52 | ||
0ac0c414 | 53 | import Data.Functor.Constant (Constant(..)) |
bbdfd305 | 54 | import Data.Char (ord) |
55 | import Data.ByteString as B | |
bbdfd305 | 56 | import Data.ByteString.Char8 as B8 |
57 | import Data.Text (Text) | |
58 | import qualified Data.Text as T | |
59 | import qualified Data.Text.Encoding as TE | |
eae50557 | 60 | import qualified Data.Streaming.Text as Stream |
61 | import Data.Streaming.Text (DecodeResult(..)) | |
b091cbeb | 62 | import Control.Monad (join, liftM) |
bbdfd305 | 63 | import Pipes |
bbdfd305 | 64 | |
bbdfd305 | 65 | |
0ac0c414 | 66 | |
edd8726a | 67 | {- $usage |
4543ee0d | 68 | Encoding is of course simple. Given |
fafcbeb5 | 69 | |
edd8726a | 70 | > text :: Producer Text IO () |
d199072b | 71 | |
4543ee0d | 72 | we can encode it with @Data.Text.Encoding.encodeUtf8@ |
73 | ||
74 | > TE.encodeUtf8 :: Text -> ByteString | |
75 | ||
76 | and ordinary pipe operations: | |
0ac0c414 | 77 | |
edd8726a | 78 | > text >-> P.map TE.encodeUtf8 :: Producer.ByteString IO () |
0ac0c414 | 79 | |
4543ee0d | 80 | or, equivalently |
81 | ||
82 | > for text (yield . TE.encodeUtf8) | |
83 | ||
84 | But, using this module, we might use | |
85 | ||
86 | > encodeUtf8 :: Text -> Producer ByteString m () | |
87 | ||
88 | to write | |
0ac0c414 | 89 | |
edd8726a | 90 | > for text encodeUtf8 :: Producer.ByteString IO () |
6c2fffdc | 91 | |
4543ee0d | 92 | All of the above come to the same. |
93 | ||
94 | ||
95 | Given | |
edd8726a | 96 | |
4543ee0d | 97 | > bytes :: Producer ByteString IO () |
0ac0c414 | 98 | |
edd8726a | 99 | we can apply a decoding function from this module: |
6c2fffdc | 100 | |
edd8726a | 101 | > decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ()) |
102 | ||
4543ee0d | 103 | The Text producer ends wherever decoding first fails. The un-decoded |
104 | material is returned. If we are confident it is of no interest, we can | |
105 | write: | |
106 | ||
107 | > void $ decodeUtf8 bytes :: Producer Text IO () | |
108 | ||
109 | Thus we can re-encode | |
edd8726a | 110 | as uft8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g. |
111 | ||
112 | > for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ()) | |
113 | ||
114 | The bytestring producer that is returned begins with where utf16BE decoding | |
e8336ba6 | 115 | failed; if it didn't fail the producer is empty. |
6c2fffdc | 116 | |
117 | -} | |
118 | ||
edd8726a | 119 | {- $lenses |
4543ee0d | 120 | We get a bit more flexibility, particularly in the use of pipes-style "parsers", |
121 | if we use a lens like @utf8@ or @utf16BE@ | |
122 | that focusses on the text in an appropriately encoded byte stream. | |
6c2fffdc | 123 | |
edd8726a | 124 | > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
6c2fffdc | 125 | |
4543ee0d | 126 | is just an alias for a Prelude type. We abbreviate this further, for our use case, as |
6c2fffdc | 127 | |
edd8726a | 128 | > type Codec |
129 | > = forall m r . Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) | |
6c2fffdc | 130 | |
edd8726a | 131 | and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can |
132 | re-encode what they have decoded. Thus you use any particular codec with | |
133 | the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries; | |
4543ee0d | 134 | <http://hackage.haskell.org/package/lens lens>, |
135 | <http://hackage.haskell.org/package/lens-family lens-family>, | |
136 | <http://hackage.haskell.org/package/lens-simple lens-simple>, or one of the | |
137 | and <http://hackage.haskell.org/package/microlens microlens> packages will all work | |
138 | the same, since we already have access to the types they require. | |
bbdfd305 | 139 | |
edd8726a | 140 | Each decoding lens looks into a byte stream that is supposed to contain text. |
141 | The particular lenses are named in accordance with the expected | |
142 | encoding, 'utf8', 'utf16LE' etc. To turn a such a lens or @Codec@ | |
143 | into an ordinary function, use @view@ / @(^.)@ -- here also called 'decode': | |
4ea59a8b | 144 | |
145 | > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
146 | > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) | |
147 | > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) | |
148 | ||
4543ee0d | 149 | Of course, we could always do this with the specialized decoding functions, e.g. |
4ea59a8b | 150 | |
151 | > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
152 | > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) | |
153 | ||
edd8726a | 154 | As with these functions, the stream of text that a @Codec@ \'sees\' |
155 | in the stream of bytes begins at its head. | |
4ea59a8b | 156 | At any point of decoding failure, the stream of text ends and reverts to (returns) |
157 | the original byte stream. Thus if the first bytes are already | |
158 | un-decodable, the whole ByteString producer will be returned, i.e. | |
159 | ||
edd8726a | 160 | > view utf8 bad_bytestream |
4ea59a8b | 161 | |
162 | will just come to the same as | |
163 | ||
edd8726a | 164 | > return bad_bytestream |
4ea59a8b | 165 | |
166 | Where there is no decoding failure, the return value of the text stream will be | |
167 | an empty byte stream followed by its own return value. In all cases you must | |
168 | deal with the fact that it is a /ByteString producer/ that is returned, even if | |
169 | it can be thrown away with @Control.Monad.void@ | |
170 | ||
171 | > void (Bytes.stdin ^. utf8) :: Producer Text IO () | |
edd8726a | 172 | |
173 | The @eof@ lens permits you to pattern match: if there is a Right value, | |
174 | it is the leftover bytestring producer, if there is a Right value, it | |
175 | is the return value of the original bytestring producer: | |
176 | ||
177 | > Bytes.stdin ^. utf8 . eof :: Producer Text IO (Either (Producer ByteString IO IO) ()) | |
4ea59a8b | 178 | |
edd8726a | 179 | Thus for the stream of un-decodable bytes mentioned above, |
180 | ||
181 | > view (utf8 . eof) bad_bytestream | |
182 | ||
183 | will be the same as | |
184 | ||
185 | > return (Left bad_bytestream) | |
186 | ||
4543ee0d | 187 | @zoom utf8@ converts a Text parser into a ByteString parser: |
4ea59a8b | 188 | |
189 | > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) | |
190 | ||
a4913c42 | 191 | or, using the type synonymn from @Pipes.Parse@: |
4ea59a8b | 192 | |
193 | > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char) | |
194 | ||
edd8726a | 195 | Thus we can define a ByteString parser (in the pipes-parse sense) like this: |
4ea59a8b | 196 | |
edd8726a | 197 | > charPlusByte :: Parser ByteString m (Maybe Char, Maybe Word8))) |
198 | > charPlusByte = do char_ <- zoom utf8 Text.drawChar | |
4ea59a8b | 199 | > byte_ <- Bytes.peekByte |
200 | > return (char_, byte_) | |
201 | ||
edd8726a | 202 | Though @charPlusByte@ is partly defined with a Text parser 'drawChar'; |
4ea59a8b | 203 | but it is a ByteString parser; it will return the first valid utf8-encoded |
4543ee0d | 204 | Char in a ByteString, /whatever its byte-length/, |
edd8726a | 205 | and the first byte following, if both exist. Because |
4ea59a8b | 206 | we \'draw\' one and \'peek\' at the other, the parser as a whole only |
207 | advances one Char's length along the bytestring, whatever that length may be. | |
208 | See the slightly more complex example \'decode.hs\' in the | |
edd8726a | 209 | <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall blog> |
210 | discussion of this type of byte stream parsing. | |
4ea59a8b | 211 | -} |
212 | ||
edd8726a | 213 | type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
214 | ||
215 | type Codec | |
216 | = forall m r | |
217 | . Monad m | |
218 | => Lens' (Producer ByteString m r) | |
219 | (Producer Text m (Producer ByteString m r)) | |
220 | ||
221 | ||
222 | {- | @decode@ is just the ordinary @view@ or @(^.)@ of the lens libraries; | |
a31032bb | 223 | exported here under a name appropriate to the material. |
224 | Thus given a bytestring producer called @bytes@ we have | |
edd8726a | 225 | |
226 | > decode utf8 bytes :: Producer Text IO (Producer ByteString IO ()) | |
227 | ||
228 | All of these are thus the same: | |
229 | ||
a31032bb | 230 | > decode utf8 bytes |
231 | > view utf8 bytes | |
232 | > bytes ^. utf8 | |
233 | > decodeUtf8 bytes | |
edd8726a | 234 | |
235 | -} | |
236 | ||
237 | decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b | |
238 | decode codec a = getConstant (codec Constant a) | |
239 | ||
240 | {- | @eof@ tells you explicitly when decoding stops due to bad bytes or | |
241 | instead reaches end-of-file happily. (Without it one just makes an explicit | |
242 | test for emptiness of the resulting bytestring production using next) Thus | |
243 | ||
244 | > decode (utf8 . eof) bytes :: Producer T.Text IO (Either (Producer B.ByteString IO ()) ()) | |
245 | ||
246 | If we hit undecodable bytes, the remaining bytestring producer will be | |
247 | returned as a Left value; in the happy case, a Right value is returned | |
248 | with the anticipated return value for the original bytestring producer. | |
249 | ||
a31032bb | 250 | Given a bytestring producer called @bytes@ all of these will be the same: |
edd8726a | 251 | |
a31032bb | 252 | > decode (utf8 . eof) bytes |
253 | > view (utf8 . eof) bytes | |
254 | > bytes^.utf8.eof | |
edd8726a | 255 | |
256 | -} | |
257 | ||
4543ee0d | 258 | eof :: (Monad m, Monad (t m), MonadTrans t) => Lens' (t m (Producer ByteString m r)) |
259 | (t m (Either (Producer ByteString m r) r)) | |
e8336ba6 | 260 | eof k p0 = fmap fromEither (k (toEither p0)) where |
edd8726a | 261 | |
262 | fromEither = liftM (either id return) | |
263 | ||
264 | toEither pp = do p <- pp | |
265 | check p | |
266 | ||
267 | check p = do e <- lift (next p) | |
268 | case e of | |
269 | Left r -> return (Right r) | |
270 | Right (bs,pb) -> if B.null bs | |
271 | then check pb | |
272 | else return (Left (do yield bs | |
273 | pb)) | |
274 | ||
4ea59a8b | 275 | utf8 :: Codec |
276 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 | |
277 | ||
278 | utf8Pure :: Codec | |
279 | utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 | |
280 | ||
281 | utf16LE :: Codec | |
282 | utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE | |
283 | ||
284 | utf16BE :: Codec | |
285 | utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE | |
286 | ||
287 | utf32LE :: Codec | |
288 | utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE | |
289 | ||
290 | utf32BE :: Codec | |
291 | utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE | |
292 | ||
bbdfd305 | 293 | decodeStream :: Monad m |
294 | => (B.ByteString -> DecodeResult) | |
295 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
296 | decodeStream = loop where | |
297 | loop dec0 p = | |
298 | do x <- lift (next p) | |
e8336ba6 | 299 | case x of |
300 | Left r -> return (return r) | |
301 | Right (chunk, p') -> case dec0 chunk of | |
302 | DecodeResultSuccess text dec -> do yield text | |
303 | loop dec p' | |
304 | DecodeResultFailure text bs -> do yield text | |
305 | return (do yield bs | |
306 | p') | |
bbdfd305 | 307 | {-# INLINABLE decodeStream#-} |
308 | ||
6c2fffdc | 309 | |
0ac0c414 | 310 | {- $decoders |
311 | These are functions with the simple type: | |
312 | ||
313 | > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
314 | ||
315 | Thus in general | |
316 | ||
317 | > decodeUtf8 = view utf8 | |
318 | > decodeUtf16LE = view utf16LE | |
fafcbeb5 | 319 | |
0ac0c414 | 320 | and so forth, but these forms |
321 | may be more convenient (and give better type errors!) where lenses are | |
322 | not desired. | |
323 | -} | |
fafcbeb5 | 324 | |
325 | ||
bbdfd305 | 326 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
eae50557 | 327 | decodeUtf8 = decodeStream Stream.decodeUtf8 |
bbdfd305 | 328 | {-# INLINE decodeUtf8 #-} |
329 | ||
330 | decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 331 | decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure |
bbdfd305 | 332 | {-# INLINE decodeUtf8Pure #-} |
333 | ||
334 | decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 335 | decodeUtf16LE = decodeStream Stream.decodeUtf16LE |
bbdfd305 | 336 | {-# INLINE decodeUtf16LE #-} |
337 | ||
338 | decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 339 | decodeUtf16BE = decodeStream Stream.decodeUtf16BE |
bbdfd305 | 340 | {-# INLINE decodeUtf16BE #-} |
341 | ||
342 | decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 343 | decodeUtf32LE = decodeStream Stream.decodeUtf32LE |
bbdfd305 | 344 | {-# INLINE decodeUtf32LE #-} |
345 | ||
346 | decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 347 | decodeUtf32BE = decodeStream Stream.decodeUtf32BE |
bbdfd305 | 348 | {-# INLINE decodeUtf32BE #-} |
349 | ||
0ac0c414 | 350 | |
351 | {- $encoders | |
352 | These are simply defined | |
353 | ||
354 | > encodeUtf8 = yield . TE.encodeUtf8 | |
355 | ||
356 | They are intended for use with 'for' | |
357 | ||
358 | > for Text.stdin encodeUtf8 :: Producer ByteString IO () | |
359 | ||
360 | which would have the effect of | |
361 | ||
362 | > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8) | |
363 | ||
364 | using the encoding functions from Data.Text.Encoding | |
365 | -} | |
366 | ||
10cfd90e | 367 | encodeUtf8 :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 368 | encodeUtf8 = yield . TE.encodeUtf8 |
10cfd90e | 369 | encodeUtf16LE :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 370 | encodeUtf16LE = yield . TE.encodeUtf16LE |
10cfd90e | 371 | encodeUtf16BE :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 372 | encodeUtf16BE = yield . TE.encodeUtf16BE |
10cfd90e | 373 | encodeUtf32LE :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 374 | encodeUtf32LE = yield . TE.encodeUtf32LE |
10cfd90e | 375 | encodeUtf32BE :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 376 | encodeUtf32BE = yield . TE.encodeUtf32BE |
377 | ||
bbdfd305 | 378 | mkCodec :: (forall r m . Monad m => |
379 | Producer ByteString m r -> Producer Text m (Producer ByteString m r )) | |
380 | -> (Text -> ByteString) | |
381 | -> Codec | |
382 | mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) | |
383 | ||
384 | ||
bbdfd305 | 385 | |
fafcbeb5 | 386 | {- $ascii |
387 | ascii and latin encodings only use a small number of the characters 'Text' | |
388 | recognizes; thus we cannot use the pipes @Lens@ style to work with them. | |
bbdfd305 | 389 | Rather we simply define functions each way. |
bbdfd305 | 390 | -} |
391 | ||
fafcbeb5 | 392 | |
0ac0c414 | 393 | -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream, |
fafcbeb5 | 394 | -- returning the rest of the 'Text' at the first non-ascii 'Char' |
395 | ||
bbdfd305 | 396 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) |
397 | encodeAscii = go where | |
398 | go p = do e <- lift (next p) | |
399 | case e of | |
400 | Left r -> return (return r) | |
401 | Right (chunk, p') -> | |
402 | if T.null chunk | |
403 | then go p' | |
404 | else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk | |
405 | in do yield (B8.pack (T.unpack safe)) | |
406 | if T.null unsafe | |
407 | then go p' | |
408 | else return $ do yield unsafe | |
409 | p' | |
410 | ||
411 | {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream, | |
412 | returning the rest of the 'Text' upon hitting any non-latin 'Char' | |
413 | -} | |
414 | encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | |
415 | encodeIso8859_1 = go where | |
416 | go p = do e <- lift (next p) | |
417 | case e of | |
418 | Left r -> return (return r) | |
419 | Right (txt, p') -> | |
420 | if T.null txt | |
421 | then go p' | |
422 | else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt | |
423 | in do yield (B8.pack (T.unpack safe)) | |
424 | if T.null unsafe | |
425 | then go p' | |
426 | else return $ do yield unsafe | |
427 | p' | |
428 | ||
429 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | |
430 | unused 'ByteString' upon hitting an un-ascii byte. | |
431 | -} | |
432 | decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
433 | decodeAscii = go where | |
434 | go p = do e <- lift (next p) | |
435 | case e of | |
436 | Left r -> return (return r) | |
437 | Right (chunk, p') -> | |
438 | if B.null chunk | |
439 | then go p' | |
440 | else let (safe, unsafe) = B.span (<= 0x7F) chunk | |
441 | in do yield (T.pack (B8.unpack safe)) | |
442 | if B.null unsafe | |
443 | then go p' | |
444 | else return (do yield unsafe | |
445 | p') | |
446 | ||
447 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | |
448 | unused 'ByteString' upon hitting the rare un-latinizable byte. | |
449 | -} | |
450 | decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
451 | decodeIso8859_1 = go where | |
452 | go p = do e <- lift (next p) | |
453 | case e of | |
454 | Left r -> return (return r) | |
455 | Right (chunk, p') -> | |
456 | if B.null chunk | |
457 | then go p' | |
458 | else do let (safe, unsafe) = B.span (<= 0xFF) chunk | |
459 | yield (T.pack (B8.unpack safe)) | |
460 | if B.null unsafe | |
461 | then go p' | |
462 | else return (do yield unsafe | |
463 | p') | |
464 | ||
465 | ||
466 |