]>
Commit | Line | Data |
---|---|---|
bbdfd305 | 1 | {-# LANGUAGE RankNTypes, BangPatterns #-} |
89d80557 | 2 | |
b091cbeb | 3 | -- | This module uses the stream decoding functions from |
0ac0c414 | 4 | -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode> |
4ea59a8b | 5 | -- package to define decoding functions and lenses. The exported names |
b091cbeb | 6 | -- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@ |
bbdfd305 | 7 | |
8 | module Pipes.Text.Encoding | |
fafcbeb5 | 9 | ( |
0ac0c414 | 10 | -- * The Lens or Codec type |
fafcbeb5 | 11 | -- $lenses |
12 | Codec | |
4ea59a8b | 13 | , decode |
6c2fffdc | 14 | , eof |
a4913c42 | 15 | -- * \'Viewing\' the Text in a byte stream |
fafcbeb5 | 16 | -- $codecs |
bbdfd305 | 17 | , utf8 |
18 | , utf8Pure | |
19 | , utf16LE | |
20 | , utf16BE | |
21 | , utf32LE | |
22 | , utf32BE | |
fafcbeb5 | 23 | -- * Non-lens decoding functions |
0ac0c414 | 24 | -- $decoders |
89d80557 | 25 | , decodeUtf8 |
26 | , decodeUtf8Pure | |
27 | , decodeUtf16LE | |
28 | , decodeUtf16BE | |
29 | , decodeUtf32LE | |
30 | , decodeUtf32BE | |
0ac0c414 | 31 | -- * Re-encoding functions |
32 | -- $encoders | |
33 | , encodeUtf8 | |
34 | , encodeUtf16LE | |
35 | , encodeUtf16BE | |
36 | , encodeUtf32LE | |
37 | , encodeUtf32BE | |
fafcbeb5 | 38 | -- * Functions for latin and ascii text |
39 | -- $ascii | |
bbdfd305 | 40 | , encodeAscii |
41 | , decodeAscii | |
42 | , encodeIso8859_1 | |
43 | , decodeIso8859_1 | |
44 | ) | |
45 | where | |
46 | ||
0ac0c414 | 47 | import Data.Functor.Constant (Constant(..)) |
bbdfd305 | 48 | import Data.Char (ord) |
49 | import Data.ByteString as B | |
50 | import Data.ByteString (ByteString) | |
bbdfd305 | 51 | import Data.ByteString.Char8 as B8 |
52 | import Data.Text (Text) | |
53 | import qualified Data.Text as T | |
54 | import qualified Data.Text.Encoding as TE | |
eae50557 | 55 | import qualified Data.Streaming.Text as Stream |
56 | import Data.Streaming.Text (DecodeResult(..)) | |
b091cbeb | 57 | import Control.Monad (join, liftM) |
89d80557 | 58 | import Data.Word (Word8) |
bbdfd305 | 59 | import Pipes |
bbdfd305 | 60 | |
2f4a83f8 | 61 | type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
bbdfd305 | 62 | |
fafcbeb5 | 63 | {- $lenses |
0ac0c414 | 64 | The 'Codec' type is a simple specializion of |
2f4a83f8 | 65 | the @Lens'@ type synonymn used by the standard lens libraries, |
0ac0c414 | 66 | <http://hackage.haskell.org/package/lens lens> and |
67 | <http://hackage.haskell.org/package/lens-family lens-family>. That type, | |
fafcbeb5 | 68 | |
2f4a83f8 | 69 | > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
0ac0c414 | 70 | |
4ea59a8b | 71 | is just an alias for a Prelude type. Thus you use any particular codec with |
72 | the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries; | |
b091cbeb | 73 | we presuppose neither library since we already have access to the types they require. |
fafcbeb5 | 74 | -} |
75 | ||
21eb409c | 76 | type Codec |
d199072b | 77 | = forall m r |
21eb409c | 78 | . Monad m |
2f4a83f8 | 79 | => Lens' (Producer ByteString m r) |
d199072b | 80 | (Producer Text m (Producer ByteString m r)) |
81 | ||
0ac0c414 | 82 | {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries; |
4ea59a8b | 83 | exported here under a name appropriate to the material. All of these are |
84 | the same: | |
0ac0c414 | 85 | |
4ea59a8b | 86 | > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8 |
0ac0c414 | 87 | |
88 | -} | |
89 | ||
6c2fffdc | 90 | |
0ac0c414 | 91 | decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b |
92 | decode codec a = getConstant (codec Constant a) | |
93 | ||
6c2fffdc | 94 | {- | 'eof' tells you explicitly when decoding stops due to bad bytes or instead |
95 | reaches end-of-file happily. (Without it one just makes an explicit test | |
96 | for emptiness of the resulting bytestring production using 'next') | |
97 | Thus | |
98 | ||
99 | > decode (utf8 . eof) p = view (utf8 . eof) p = p^.utf8.eof | |
100 | ||
101 | will be a text producer. If we hit undecodable bytes, the remaining | |
102 | bytestring producer will be returned as a 'Left' value; | |
103 | in the happy case, a 'Right' value is returned with the anticipated | |
104 | return value for the original bytestring producer. | |
105 | ) | |
106 | ||
107 | -} | |
108 | ||
109 | eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r)) | |
110 | (Producer Text m (Either (Producer ByteString m r) r)) | |
111 | eof k p = fmap fromEither (k (toEither p)) where | |
112 | ||
113 | fromEither = liftM (either id return) | |
114 | ||
115 | toEither pp = do p <- pp | |
116 | check p | |
117 | ||
118 | check p = do e <- lift (next p) | |
119 | case e of | |
120 | Left r -> return (Right r) | |
121 | Right (bs,pb) -> if B.null bs | |
122 | then check pb | |
123 | else return (Left (do yield bs | |
124 | pb)) | |
125 | ||
bbdfd305 | 126 | |
4ea59a8b | 127 | {- $codecs |
128 | ||
129 | Each Codec-lens looks into a byte stream that is supposed to contain text. | |
130 | The particular \'Codec\' lenses are named in accordance with the expected | |
131 | encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function, | |
132 | use @view@ / @(^.)@ -- here also called 'decode': | |
133 | ||
134 | > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
135 | > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) | |
136 | > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) | |
137 | ||
138 | Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized | |
139 | decoding functions exported here, e.g. | |
140 | ||
141 | > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
142 | > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) | |
143 | ||
144 | The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head. | |
145 | At any point of decoding failure, the stream of text ends and reverts to (returns) | |
146 | the original byte stream. Thus if the first bytes are already | |
147 | un-decodable, the whole ByteString producer will be returned, i.e. | |
148 | ||
149 | > view utf8 bytestream | |
150 | ||
151 | will just come to the same as | |
152 | ||
153 | > return bytestream | |
154 | ||
155 | Where there is no decoding failure, the return value of the text stream will be | |
156 | an empty byte stream followed by its own return value. In all cases you must | |
157 | deal with the fact that it is a /ByteString producer/ that is returned, even if | |
158 | it can be thrown away with @Control.Monad.void@ | |
159 | ||
160 | > void (Bytes.stdin ^. utf8) :: Producer Text IO () | |
161 | ||
162 | @zoom@ converts a Text parser into a ByteString parser: | |
163 | ||
164 | > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) | |
165 | ||
a4913c42 | 166 | or, using the type synonymn from @Pipes.Parse@: |
4ea59a8b | 167 | |
168 | > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char) | |
169 | ||
a4913c42 | 170 | Thus we can define a ByteString parser like this: |
4ea59a8b | 171 | |
172 | > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8))) | |
173 | > withNextByte = do char_ <- zoom utf8 Text.drawChar | |
174 | > byte_ <- Bytes.peekByte | |
175 | > return (char_, byte_) | |
176 | ||
177 | Though @withNextByte@ is partly defined with a Text parser 'drawChar'; | |
178 | but it is a ByteString parser; it will return the first valid utf8-encoded | |
179 | Char in a ByteString, whatever its length, | |
180 | and the first byte of the next character, if they exist. Because | |
181 | we \'draw\' one and \'peek\' at the other, the parser as a whole only | |
182 | advances one Char's length along the bytestring, whatever that length may be. | |
183 | See the slightly more complex example \'decode.hs\' in the | |
184 | <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall> | |
185 | discussion of this type of byte stream parsing. | |
186 | -} | |
187 | ||
188 | utf8 :: Codec | |
189 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 | |
190 | ||
191 | utf8Pure :: Codec | |
192 | utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 | |
193 | ||
194 | utf16LE :: Codec | |
195 | utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE | |
196 | ||
197 | utf16BE :: Codec | |
198 | utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE | |
199 | ||
200 | utf32LE :: Codec | |
201 | utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE | |
202 | ||
203 | utf32BE :: Codec | |
204 | utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE | |
205 | ||
bbdfd305 | 206 | decodeStream :: Monad m |
207 | => (B.ByteString -> DecodeResult) | |
208 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
209 | decodeStream = loop where | |
210 | loop dec0 p = | |
211 | do x <- lift (next p) | |
212 | case x of Left r -> return (return r) | |
213 | Right (chunk, p') -> case dec0 chunk of | |
214 | DecodeResultSuccess text dec -> do yield text | |
215 | loop dec p' | |
216 | DecodeResultFailure text bs -> do yield text | |
217 | return (do yield bs | |
218 | p') | |
219 | {-# INLINABLE decodeStream#-} | |
220 | ||
6c2fffdc | 221 | |
0ac0c414 | 222 | {- $decoders |
223 | These are functions with the simple type: | |
224 | ||
225 | > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
226 | ||
227 | Thus in general | |
228 | ||
229 | > decodeUtf8 = view utf8 | |
230 | > decodeUtf16LE = view utf16LE | |
fafcbeb5 | 231 | |
0ac0c414 | 232 | and so forth, but these forms |
233 | may be more convenient (and give better type errors!) where lenses are | |
234 | not desired. | |
235 | -} | |
fafcbeb5 | 236 | |
237 | ||
bbdfd305 | 238 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
eae50557 | 239 | decodeUtf8 = decodeStream Stream.decodeUtf8 |
bbdfd305 | 240 | {-# INLINE decodeUtf8 #-} |
241 | ||
242 | decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 243 | decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure |
bbdfd305 | 244 | {-# INLINE decodeUtf8Pure #-} |
245 | ||
246 | decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 247 | decodeUtf16LE = decodeStream Stream.decodeUtf16LE |
bbdfd305 | 248 | {-# INLINE decodeUtf16LE #-} |
249 | ||
250 | decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 251 | decodeUtf16BE = decodeStream Stream.decodeUtf16BE |
bbdfd305 | 252 | {-# INLINE decodeUtf16BE #-} |
253 | ||
254 | decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 255 | decodeUtf32LE = decodeStream Stream.decodeUtf32LE |
bbdfd305 | 256 | {-# INLINE decodeUtf32LE #-} |
257 | ||
258 | decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
eae50557 | 259 | decodeUtf32BE = decodeStream Stream.decodeUtf32BE |
bbdfd305 | 260 | {-# INLINE decodeUtf32BE #-} |
261 | ||
0ac0c414 | 262 | |
263 | {- $encoders | |
264 | These are simply defined | |
265 | ||
266 | > encodeUtf8 = yield . TE.encodeUtf8 | |
267 | ||
268 | They are intended for use with 'for' | |
269 | ||
270 | > for Text.stdin encodeUtf8 :: Producer ByteString IO () | |
271 | ||
272 | which would have the effect of | |
273 | ||
274 | > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8) | |
275 | ||
276 | using the encoding functions from Data.Text.Encoding | |
277 | -} | |
278 | ||
10cfd90e | 279 | encodeUtf8 :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 280 | encodeUtf8 = yield . TE.encodeUtf8 |
10cfd90e | 281 | encodeUtf16LE :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 282 | encodeUtf16LE = yield . TE.encodeUtf16LE |
10cfd90e | 283 | encodeUtf16BE :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 284 | encodeUtf16BE = yield . TE.encodeUtf16BE |
10cfd90e | 285 | encodeUtf32LE :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 286 | encodeUtf32LE = yield . TE.encodeUtf32LE |
10cfd90e | 287 | encodeUtf32BE :: Monad m => Text -> Producer' ByteString m () |
0ac0c414 | 288 | encodeUtf32BE = yield . TE.encodeUtf32BE |
289 | ||
bbdfd305 | 290 | mkCodec :: (forall r m . Monad m => |
291 | Producer ByteString m r -> Producer Text m (Producer ByteString m r )) | |
292 | -> (Text -> ByteString) | |
293 | -> Codec | |
294 | mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) | |
295 | ||
296 | ||
bbdfd305 | 297 | |
fafcbeb5 | 298 | {- $ascii |
299 | ascii and latin encodings only use a small number of the characters 'Text' | |
300 | recognizes; thus we cannot use the pipes @Lens@ style to work with them. | |
bbdfd305 | 301 | Rather we simply define functions each way. |
bbdfd305 | 302 | -} |
303 | ||
fafcbeb5 | 304 | |
0ac0c414 | 305 | -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream, |
fafcbeb5 | 306 | -- returning the rest of the 'Text' at the first non-ascii 'Char' |
307 | ||
bbdfd305 | 308 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) |
309 | encodeAscii = go where | |
310 | go p = do e <- lift (next p) | |
311 | case e of | |
312 | Left r -> return (return r) | |
313 | Right (chunk, p') -> | |
314 | if T.null chunk | |
315 | then go p' | |
316 | else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk | |
317 | in do yield (B8.pack (T.unpack safe)) | |
318 | if T.null unsafe | |
319 | then go p' | |
320 | else return $ do yield unsafe | |
321 | p' | |
322 | ||
323 | {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream, | |
324 | returning the rest of the 'Text' upon hitting any non-latin 'Char' | |
325 | -} | |
326 | encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | |
327 | encodeIso8859_1 = go where | |
328 | go p = do e <- lift (next p) | |
329 | case e of | |
330 | Left r -> return (return r) | |
331 | Right (txt, p') -> | |
332 | if T.null txt | |
333 | then go p' | |
334 | else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt | |
335 | in do yield (B8.pack (T.unpack safe)) | |
336 | if T.null unsafe | |
337 | then go p' | |
338 | else return $ do yield unsafe | |
339 | p' | |
340 | ||
341 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | |
342 | unused 'ByteString' upon hitting an un-ascii byte. | |
343 | -} | |
344 | decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
345 | decodeAscii = go where | |
346 | go p = do e <- lift (next p) | |
347 | case e of | |
348 | Left r -> return (return r) | |
349 | Right (chunk, p') -> | |
350 | if B.null chunk | |
351 | then go p' | |
352 | else let (safe, unsafe) = B.span (<= 0x7F) chunk | |
353 | in do yield (T.pack (B8.unpack safe)) | |
354 | if B.null unsafe | |
355 | then go p' | |
356 | else return (do yield unsafe | |
357 | p') | |
358 | ||
359 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | |
360 | unused 'ByteString' upon hitting the rare un-latinizable byte. | |
361 | -} | |
362 | decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
363 | decodeIso8859_1 = go where | |
364 | go p = do e <- lift (next p) | |
365 | case e of | |
366 | Left r -> return (return r) | |
367 | Right (chunk, p') -> | |
368 | if B.null chunk | |
369 | then go p' | |
370 | else do let (safe, unsafe) = B.span (<= 0xFF) chunk | |
371 | yield (T.pack (B8.unpack safe)) | |
372 | if B.null unsafe | |
373 | then go p' | |
374 | else return (do yield unsafe | |
375 | p') | |
376 | ||
377 | ||
378 |