]>
Commit | Line | Data |
---|---|---|
bbdfd305 | 1 | {-# LANGUAGE RankNTypes, BangPatterns #-} |
89d80557 | 2 | |
0ac0c414 | 3 | -- | This module uses the stream decoding functions from Michael Snoyman's new |
4 | -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode> | |
4ea59a8b | 5 | -- package to define decoding functions and lenses. The exported names |
6 | -- conflict with names in @Data.Text.Encoding@ but the module can otherwise be | |
7 | -- imported unqualified. | |
bbdfd305 | 8 | |
9 | module Pipes.Text.Encoding | |
fafcbeb5 | 10 | ( |
0ac0c414 | 11 | -- * The Lens or Codec type |
fafcbeb5 | 12 | -- $lenses |
13 | Codec | |
4ea59a8b | 14 | , decode |
6c2fffdc | 15 | , eof |
a4913c42 | 16 | -- * \'Viewing\' the Text in a byte stream |
fafcbeb5 | 17 | -- $codecs |
bbdfd305 | 18 | , utf8 |
19 | , utf8Pure | |
20 | , utf16LE | |
21 | , utf16BE | |
22 | , utf32LE | |
23 | , utf32BE | |
fafcbeb5 | 24 | -- * Non-lens decoding functions |
0ac0c414 | 25 | -- $decoders |
89d80557 | 26 | , decodeUtf8 |
27 | , decodeUtf8Pure | |
28 | , decodeUtf16LE | |
29 | , decodeUtf16BE | |
30 | , decodeUtf32LE | |
31 | , decodeUtf32BE | |
0ac0c414 | 32 | -- * Re-encoding functions |
33 | -- $encoders | |
34 | , encodeUtf8 | |
35 | , encodeUtf16LE | |
36 | , encodeUtf16BE | |
37 | , encodeUtf32LE | |
38 | , encodeUtf32BE | |
fafcbeb5 | 39 | -- * Functions for latin and ascii text |
40 | -- $ascii | |
bbdfd305 | 41 | , encodeAscii |
42 | , decodeAscii | |
43 | , encodeIso8859_1 | |
44 | , decodeIso8859_1 | |
45 | ) | |
46 | where | |
47 | ||
0ac0c414 | 48 | import Data.Functor.Constant (Constant(..)) |
bbdfd305 | 49 | import Data.Char (ord) |
50 | import Data.ByteString as B | |
51 | import Data.ByteString (ByteString) | |
bbdfd305 | 52 | import Data.ByteString.Char8 as B8 |
53 | import Data.Text (Text) | |
54 | import qualified Data.Text as T | |
55 | import qualified Data.Text.Encoding as TE | |
eae50557 | 56 | import qualified Data.Streaming.Text as Stream |
57 | import Data.Streaming.Text (DecodeResult(..)) | |
70125641 | 58 | import Control.Monad (join) |
89d80557 | 59 | import Data.Word (Word8) |
bbdfd305 | 60 | import Pipes |
bbdfd305 | 61 | |
2f4a83f8 | 62 | type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
bbdfd305 | 63 | |
fafcbeb5 | 64 | {- $lenses |
0ac0c414 | 65 | The 'Codec' type is a simple specializion of |
2f4a83f8 | 66 | the @Lens'@ type synonymn used by the standard lens libraries, |
0ac0c414 | 67 | <http://hackage.haskell.org/package/lens lens> and |
68 | <http://hackage.haskell.org/package/lens-family lens-family>. That type, | |
fafcbeb5 | 69 | |
2f4a83f8 | 70 | > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
0ac0c414 | 71 | |
4ea59a8b | 72 | is just an alias for a Prelude type. Thus you use any particular codec with |
73 | the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries; | |
74 | we presuppose neither since we already have access to the types they require. | |
0ac0c414 | 75 | |
fafcbeb5 | 76 | -} |
77 | ||
21eb409c | 78 | type Codec |
d199072b | 79 | = forall m r |
21eb409c | 80 | . Monad m |
2f4a83f8 | 81 | => Lens' (Producer ByteString m r) |
d199072b | 82 | (Producer Text m (Producer ByteString m r)) |
83 | ||
0ac0c414 | 84 | {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries; |
4ea59a8b | 85 | exported here under a name appropriate to the material. All of these are |
86 | the same: | |
0ac0c414 | 87 | |
4ea59a8b | 88 | > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8 |
0ac0c414 | 89 | |
90 | -} | |
91 | ||
6c2fffdc | 92 | |
0ac0c414 | 93 | decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b |
94 | decode codec a = getConstant (codec Constant a) | |
95 | ||
6c2fffdc | 96 | {- | 'eof' tells you explicitly when decoding stops due to bad bytes or instead |
97 | reaches end-of-file happily. (Without it one just makes an explicit test | |
98 | for emptiness of the resulting bytestring production using 'next') | |
99 | Thus | |
100 | ||
101 | > decode (utf8 . eof) p = view (utf8 . eof) p = p^.utf8.eof | |
102 | ||
103 | will be a text producer. If we hit undecodable bytes, the remaining | |
104 | bytestring producer will be returned as a 'Left' value; | |
105 | in the happy case, a 'Right' value is returned with the anticipated | |
106 | return value for the original bytestring producer. | |
107 | ) | |
108 | ||
109 | -} | |
110 | ||
111 | eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r)) | |
112 | (Producer Text m (Either (Producer ByteString m r) r)) | |
113 | eof k p = fmap fromEither (k (toEither p)) where | |
114 | ||
115 | fromEither = liftM (either id return) | |
116 | ||
117 | toEither pp = do p <- pp | |
118 | check p | |
119 | ||
120 | check p = do e <- lift (next p) | |
121 | case e of | |
122 | Left r -> return (Right r) | |
123 | Right (bs,pb) -> if B.null bs | |
124 | then check pb | |
125 | else return (Left (do yield bs | |
126 | pb)) | |
127 | ||
bbdfd305 | 128 | |
4ea59a8b | 129 | {- $codecs |
130 | ||
131 | Each Codec-lens looks into a byte stream that is supposed to contain text. | |
132 | The particular \'Codec\' lenses are named in accordance with the expected | |
133 | encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function, | |
134 | use @view@ / @(^.)@ -- here also called 'decode': | |
135 | ||
136 | > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
137 | > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) | |
138 | > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) | |
139 | ||
140 | Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized | |
141 | decoding functions exported here, e.g. | |
142 | ||
143 | > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
144 | > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) | |
145 | ||
146 | The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head. | |
147 | At any point of decoding failure, the stream of text ends and reverts to (returns) | |
148 | the original byte stream. Thus if the first bytes are already | |
149 | un-decodable, the whole ByteString producer will be returned, i.e. | |
150 | ||
151 | > view utf8 bytestream | |
152 | ||
153 | will just come to the same as | |
154 | ||
155 | > return bytestream | |
156 | ||
157 | Where there is no decoding failure, the return value of the text stream will be | |
158 | an empty byte stream followed by its own return value. In all cases you must | |
159 | deal with the fact that it is a /ByteString producer/ that is returned, even if | |
160 | it can be thrown away with @Control.Monad.void@ | |
161 | ||
162 | > void (Bytes.stdin ^. utf8) :: Producer Text IO () | |
163 | ||
164 | @zoom@ converts a Text parser into a ByteString parser: | |
165 | ||
166 | > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) | |
167 | ||
a4913c42 | 168 | or, using the type synonymn from @Pipes.Parse@: |
4ea59a8b | 169 | |
170 | > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char) | |
171 | ||
a4913c42 | 172 | Thus we can define a ByteString parser like this: |
4ea59a8b | 173 | |
174 | > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8))) | |
175 | > withNextByte = do char_ <- zoom utf8 Text.drawChar | |
176 | > byte_ <- Bytes.peekByte | |
177 | > return (char_, byte_) | |
178 | ||
179 | Though @withNextByte@ is partly defined with a Text parser 'drawChar'; | |
180 | but it is a ByteString parser; it will return the first valid utf8-encoded | |
181 | Char in a ByteString, whatever its length, | |
182 | and the first byte of the next character, if they exist. Because | |
183 | we \'draw\' one and \'peek\' at the other, the parser as a whole only | |
184 | advances one Char's length along the bytestring, whatever that length may be. | |
185 | See the slightly more complex example \'decode.hs\' in the | |
186 | <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall> | |
187 | discussion of this type of byte stream parsing. | |
188 | -} | |
189 | ||
190 | utf8 :: Codec | |
191 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 | |
192 | ||
193 | utf8Pure :: Codec | |
194 | utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 | |
195 | ||
196 | utf16LE :: Codec | |
197 | utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE | |
198 | ||
199 | utf16BE :: Codec | |
200 | utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE | |
201 | ||
202 | utf32LE :: Codec | |
203 | utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE | |
204 | ||
205 | utf32BE :: Codec | |
206 | utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE | |
207 | ||
bbdfd305 | 208 | decodeStream :: Monad m |
209 | => (B.ByteString -> DecodeResult) | |
210 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
211 | decodeStream = loop where | |
212 | loop dec0 p = | |
213 | do x <- lift (next p) | |
214 | case x of Left r -> return (return r) | |
215 | Right (chunk, p') -> case dec0 chunk of | |
216 | DecodeResultSuccess text dec -> do yield text | |
217 | loop dec p' | |
218 | DecodeResultFailure text bs -> do yield text | |
219 | return (do yield bs | |
220 | p') | |
221 | {-# INLINABLE decodeStream#-} | |
222 | ||
6c2fffdc | 223 | |
0ac0c414 | 224 | {- $decoders |
225 | These are functions with the simple type: | |
226 | ||
227 | > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
228 | ||
229 | Thus in general | |
230 | ||
231 | > decodeUtf8 = view utf8 | |
232 | > decodeUtf16LE = view utf16LE | |
fafcbeb5 | 233 | |
0ac0c414 | 234 | and so forth, but these forms |
235 | may be more convenient (and give better type errors!) where lenses are | |
236 | not desired. | |
237 | -} | |
fafcbeb5 | 238 | |
239 | ||
bbdfd305 | 240 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
eae50557 | 241 | decodeUtf8 = decodeStream Stream.decodeUtf8 |
bbdfd305 | 242 | {-# INLINE decodeUtf8 #-} |
243 | ||
244 | decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 245 | decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure |
bbdfd305 | 246 | {-# INLINE decodeUtf8Pure #-} |
247 | ||
248 | decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 249 | decodeUtf16LE = decodeStream Stream.decodeUtf16LE |
bbdfd305 | 250 | {-# INLINE decodeUtf16LE #-} |
251 | ||
252 | decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 253 | decodeUtf16BE = decodeStream Stream.decodeUtf16BE |
bbdfd305 | 254 | {-# INLINE decodeUtf16BE #-} |
255 | ||
256 | decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 257 | decodeUtf32LE = decodeStream Stream.decodeUtf32LE |
bbdfd305 | 258 | {-# INLINE decodeUtf32LE #-} |
259 | ||
260 | decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 261 | decodeUtf32BE = decodeStream Stream.decodeUtf32BE |
bbdfd305 | 262 | {-# INLINE decodeUtf32BE #-} |
263 | ||
0ac0c414 | 264 | |
265 | {- $encoders | |
266 | These are simply defined | |
267 | ||
268 | > encodeUtf8 = yield . TE.encodeUtf8 | |
269 | ||
270 | They are intended for use with 'for' | |
271 | ||
272 | > for Text.stdin encodeUtf8 :: Producer ByteString IO () | |
273 | ||
274 | which would have the effect of | |
275 | ||
276 | > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8) | |
277 | ||
278 | using the encoding functions from Data.Text.Encoding | |
279 | -} | |
280 | ||
10cfd90e | 281 | encodeUtf8 :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 282 | encodeUtf8 = yield . TE.encodeUtf8 |
10cfd90e | 283 | encodeUtf16LE :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 284 | encodeUtf16LE = yield . TE.encodeUtf16LE |
10cfd90e | 285 | encodeUtf16BE :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 286 | encodeUtf16BE = yield . TE.encodeUtf16BE |
10cfd90e | 287 | encodeUtf32LE :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 288 | encodeUtf32LE = yield . TE.encodeUtf32LE |
10cfd90e | 289 | encodeUtf32BE :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 290 | encodeUtf32BE = yield . TE.encodeUtf32BE |
291 | ||
bbdfd305 | 292 | mkCodec :: (forall r m . Monad m => |
293 | Producer ByteString m r -> Producer Text m (Producer ByteString m r )) | |
294 | -> (Text -> ByteString) | |
295 | -> Codec | |
296 | mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) | |
297 | ||
298 | ||
bbdfd305 | 299 | |
fafcbeb5 | 300 | {- $ascii |
301 | ascii and latin encodings only use a small number of the characters 'Text' | |
302 | recognizes; thus we cannot use the pipes @Lens@ style to work with them. | |
bbdfd305 | 303 | Rather we simply define functions each way. |
bbdfd305 | 304 | -} |
305 | ||
fafcbeb5 | 306 | |
0ac0c414 | 307 | -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream, |
fafcbeb5 | 308 | -- returning the rest of the 'Text' at the first non-ascii 'Char' |
309 | ||
bbdfd305 | 310 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) |
311 | encodeAscii = go where | |
312 | go p = do e <- lift (next p) | |
313 | case e of | |
314 | Left r -> return (return r) | |
315 | Right (chunk, p') -> | |
316 | if T.null chunk | |
317 | then go p' | |
318 | else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk | |
319 | in do yield (B8.pack (T.unpack safe)) | |
320 | if T.null unsafe | |
321 | then go p' | |
322 | else return $ do yield unsafe | |
323 | p' | |
324 | ||
325 | {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream, | |
326 | returning the rest of the 'Text' upon hitting any non-latin 'Char' | |
327 | -} | |
328 | encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | |
329 | encodeIso8859_1 = go where | |
330 | go p = do e <- lift (next p) | |
331 | case e of | |
332 | Left r -> return (return r) | |
333 | Right (txt, p') -> | |
334 | if T.null txt | |
335 | then go p' | |
336 | else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt | |
337 | in do yield (B8.pack (T.unpack safe)) | |
338 | if T.null unsafe | |
339 | then go p' | |
340 | else return $ do yield unsafe | |
341 | p' | |
342 | ||
343 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | |
344 | unused 'ByteString' upon hitting an un-ascii byte. | |
345 | -} | |
346 | decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
347 | decodeAscii = go where | |
348 | go p = do e <- lift (next p) | |
349 | case e of | |
350 | Left r -> return (return r) | |
351 | Right (chunk, p') -> | |
352 | if B.null chunk | |
353 | then go p' | |
354 | else let (safe, unsafe) = B.span (<= 0x7F) chunk | |
355 | in do yield (T.pack (B8.unpack safe)) | |
356 | if B.null unsafe | |
357 | then go p' | |
358 | else return (do yield unsafe | |
359 | p') | |
360 | ||
361 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | |
362 | unused 'ByteString' upon hitting the rare un-latinizable byte. | |
363 | -} | |
364 | decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
365 | decodeIso8859_1 = go where | |
366 | go p = do e <- lift (next p) | |
367 | case e of | |
368 | Left r -> return (return r) | |
369 | Right (chunk, p') -> | |
370 | if B.null chunk | |
371 | then go p' | |
372 | else do let (safe, unsafe) = B.span (<= 0xFF) chunk | |
373 | yield (T.pack (B8.unpack safe)) | |
374 | if B.null unsafe | |
375 | then go p' | |
376 | else return (do yield unsafe | |
377 | p') | |
378 | ||
379 | ||
380 |