]>
Commit | Line | Data |
---|---|---|
bbdfd305 | 1 | {-# LANGUAGE RankNTypes, BangPatterns #-} |
89d80557 | 2 | |
0ac0c414 | 3 | -- | This module uses the stream decoding functions from Michael Snoyman's new |
4 | -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode> | |
5 | -- package to define decoding functions and lenses. | |
bbdfd305 | 6 | |
7 | module Pipes.Text.Encoding | |
fafcbeb5 | 8 | ( |
0ac0c414 | 9 | -- * The Lens or Codec type |
fafcbeb5 | 10 | -- $lenses |
11 | Codec | |
0ac0c414 | 12 | -- * Viewing the Text in a ByteString |
fafcbeb5 | 13 | -- $codecs |
0ac0c414 | 14 | , decode |
bbdfd305 | 15 | , utf8 |
16 | , utf8Pure | |
17 | , utf16LE | |
18 | , utf16BE | |
19 | , utf32LE | |
20 | , utf32BE | |
fafcbeb5 | 21 | -- * Non-lens decoding functions |
0ac0c414 | 22 | -- $decoders |
89d80557 | 23 | , decodeUtf8 |
24 | , decodeUtf8Pure | |
25 | , decodeUtf16LE | |
26 | , decodeUtf16BE | |
27 | , decodeUtf32LE | |
28 | , decodeUtf32BE | |
0ac0c414 | 29 | -- * Re-encoding functions |
30 | -- $encoders | |
31 | , encodeUtf8 | |
32 | , encodeUtf16LE | |
33 | , encodeUtf16BE | |
34 | , encodeUtf32LE | |
35 | , encodeUtf32BE | |
fafcbeb5 | 36 | -- * Functions for latin and ascii text |
37 | -- $ascii | |
bbdfd305 | 38 | , encodeAscii |
39 | , decodeAscii | |
40 | , encodeIso8859_1 | |
41 | , decodeIso8859_1 | |
42 | ) | |
43 | where | |
44 | ||
0ac0c414 | 45 | import Data.Functor.Constant (Constant(..)) |
bbdfd305 | 46 | import Data.Char (ord) |
47 | import Data.ByteString as B | |
48 | import Data.ByteString (ByteString) | |
bbdfd305 | 49 | import Data.ByteString.Char8 as B8 |
50 | import Data.Text (Text) | |
51 | import qualified Data.Text as T | |
52 | import qualified Data.Text.Encoding as TE | |
53 | import Data.Text.StreamDecoding | |
70125641 | 54 | import Control.Monad (join) |
89d80557 | 55 | import Data.Word (Word8) |
bbdfd305 | 56 | import Pipes |
bbdfd305 | 57 | |
fafcbeb5 | 58 | |
21eb409c | 59 | type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
bbdfd305 | 60 | |
fafcbeb5 | 61 | {- $lenses |
0ac0c414 | 62 | The 'Codec' type is a simple specializion of |
63 | the @Lens'@ type synonymn used by the standard lens libraries, | |
64 | <http://hackage.haskell.org/package/lens lens> and | |
65 | <http://hackage.haskell.org/package/lens-family lens-family>. That type, | |
fafcbeb5 | 66 | |
0ac0c414 | 67 | > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
68 | ||
69 | is just an alias for an ordinary Prelude type. Thus you use any codec with | |
70 | the @view@ / @(^.)@ and @zoom@ functions from those libraries. | |
71 | ||
fafcbeb5 | 72 | -} |
73 | ||
21eb409c GG |
74 | type Codec |
75 | = forall m r | |
76 | . Monad m | |
77 | => Lens' (Producer ByteString m r) | |
78 | (Producer Text m (Producer ByteString m r)) | |
0ac0c414 | 79 | |
80 | {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries; | |
81 | exported here for convience | |
82 | ||
83 | > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf | |
84 | ||
85 | -} | |
86 | ||
87 | decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b | |
88 | decode codec a = getConstant (codec Constant a) | |
89 | ||
bbdfd305 | 90 | |
91 | decodeStream :: Monad m | |
92 | => (B.ByteString -> DecodeResult) | |
93 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
94 | decodeStream = loop where | |
95 | loop dec0 p = | |
96 | do x <- lift (next p) | |
97 | case x of Left r -> return (return r) | |
98 | Right (chunk, p') -> case dec0 chunk of | |
99 | DecodeResultSuccess text dec -> do yield text | |
100 | loop dec p' | |
101 | DecodeResultFailure text bs -> do yield text | |
102 | return (do yield bs | |
103 | p') | |
104 | {-# INLINABLE decodeStream#-} | |
105 | ||
0ac0c414 | 106 | {- $decoders |
107 | These are functions with the simple type: | |
108 | ||
109 | > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
110 | ||
111 | Thus in general | |
112 | ||
113 | > decodeUtf8 = view utf8 | |
114 | > decodeUtf16LE = view utf16LE | |
fafcbeb5 | 115 | |
0ac0c414 | 116 | and so forth, but these forms |
117 | may be more convenient (and give better type errors!) where lenses are | |
118 | not desired. | |
119 | -} | |
fafcbeb5 | 120 | |
121 | ||
bbdfd305 | 122 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
123 | decodeUtf8 = decodeStream streamUtf8 | |
124 | {-# INLINE decodeUtf8 #-} | |
125 | ||
126 | decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
127 | decodeUtf8Pure = decodeStream streamUtf8Pure | |
128 | {-# INLINE decodeUtf8Pure #-} | |
129 | ||
130 | decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
131 | decodeUtf16LE = decodeStream streamUtf16LE | |
132 | {-# INLINE decodeUtf16LE #-} | |
133 | ||
134 | decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
135 | decodeUtf16BE = decodeStream streamUtf16BE | |
136 | {-# INLINE decodeUtf16BE #-} | |
137 | ||
138 | decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
139 | decodeUtf32LE = decodeStream streamUtf32LE | |
140 | {-# INLINE decodeUtf32LE #-} | |
141 | ||
142 | decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
143 | decodeUtf32BE = decodeStream streamUtf32BE | |
144 | {-# INLINE decodeUtf32BE #-} | |
145 | ||
0ac0c414 | 146 | |
147 | {- $encoders | |
148 | These are simply defined | |
149 | ||
150 | > encodeUtf8 = yield . TE.encodeUtf8 | |
151 | ||
152 | They are intended for use with 'for' | |
153 | ||
154 | > for Text.stdin encodeUtf8 :: Producer ByteString IO () | |
155 | ||
156 | which would have the effect of | |
157 | ||
158 | > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8) | |
159 | ||
160 | using the encoding functions from Data.Text.Encoding | |
161 | -} | |
162 | ||
163 | encodeUtf8 :: Monad m => Text -> Producer ByteString m () | |
164 | encodeUtf8 = yield . TE.encodeUtf8 | |
165 | encodeUtf16LE :: Monad m => Text -> Producer ByteString m () | |
166 | encodeUtf16LE = yield . TE.encodeUtf16LE | |
167 | encodeUtf16BE :: Monad m => Text -> Producer ByteString m () | |
168 | encodeUtf16BE = yield . TE.encodeUtf16BE | |
169 | encodeUtf32LE :: Monad m => Text -> Producer ByteString m () | |
170 | encodeUtf32LE = yield . TE.encodeUtf32LE | |
171 | encodeUtf32BE :: Monad m => Text -> Producer ByteString m () | |
172 | encodeUtf32BE = yield . TE.encodeUtf32BE | |
173 | ||
bbdfd305 | 174 | mkCodec :: (forall r m . Monad m => |
175 | Producer ByteString m r -> Producer Text m (Producer ByteString m r )) | |
176 | -> (Text -> ByteString) | |
177 | -> Codec | |
178 | mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) | |
179 | ||
180 | ||
fafcbeb5 | 181 | {- $codecs |
182 | ||
0ac0c414 | 183 | Each codec/lens looks into a byte stream that is supposed to contain text. |
184 | The particular \'Codec\' lenses are named in accordance with the expected | |
185 | encoding, 'utf8', 'utf16LE' etc. @view@ / @(^.)@ -- here also called 'decode' -- | |
186 | turns a Codec into a function: | |
fafcbeb5 | 187 | |
188 | > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
0ac0c414 | 189 | > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) |
fafcbeb5 | 190 | > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) |
191 | ||
0ac0c414 | 192 | Uses of a codec with @view@ / @(^.)@ / 'decode' can always be replaced by the specialized |
193 | decoding functions exported here, e.g. | |
194 | ||
195 | > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
196 | > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) | |
197 | ||
198 | The stream of text a @Codec@ \'sees\' in the stream of bytes begins at its head. | |
199 | At any point of decoding failure, the stream of text ends and reverts to (returns) | |
200 | the original byte stream. Thus if the first bytes are already | |
201 | un-decodable, the whole ByteString producer will be returned, i.e. | |
202 | ||
203 | > view utf8 bytestream | |
204 | ||
205 | will just come to the same as | |
206 | ||
207 | > return bytestream | |
208 | ||
209 | Where there is no decoding failure, the return value of the text stream will be | |
210 | an empty byte stream followed by its own return value. In all cases you must | |
211 | deal with the fact that it is a ByteString producer that is returned, even if | |
212 | it can be thrown away with @Control.Monad.void@ | |
213 | ||
214 | > void (Bytes.stdin ^. utf8) :: Producer Text IO () | |
215 | ||
fafcbeb5 | 216 | @zoom@ converts a Text parser into a ByteString parser: |
217 | ||
218 | > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) | |
219 | > | |
220 | > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8))) | |
221 | > withNextByte = do char_ <- zoom utf8 Text.drawChar | |
222 | > byte_ <- Bytes.peekByte | |
223 | > return (char_, byte_) | |
224 | ||
225 | @withNextByte@ will return the first valid Char in a ByteString, | |
226 | and the first byte of the next character, if they exists. Because | |
227 | we \'draw\' one and \'peek\' at the other, the parser as a whole only | |
228 | advances one Char's length along the bytestring. | |
229 | ||
230 | -} | |
bbdfd305 | 231 | |
232 | utf8 :: Codec | |
233 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 | |
234 | ||
235 | utf8Pure :: Codec | |
236 | utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 | |
237 | ||
238 | utf16LE :: Codec | |
239 | utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE | |
240 | ||
241 | utf16BE :: Codec | |
242 | utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE | |
243 | ||
244 | utf32LE :: Codec | |
245 | utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE | |
246 | ||
247 | utf32BE :: Codec | |
248 | utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE | |
249 | ||
250 | ||
fafcbeb5 | 251 | {- $ascii |
252 | ascii and latin encodings only use a small number of the characters 'Text' | |
253 | recognizes; thus we cannot use the pipes @Lens@ style to work with them. | |
bbdfd305 | 254 | Rather we simply define functions each way. |
bbdfd305 | 255 | -} |
256 | ||
fafcbeb5 | 257 | |
0ac0c414 | 258 | -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream, |
fafcbeb5 | 259 | -- returning the rest of the 'Text' at the first non-ascii 'Char' |
260 | ||
bbdfd305 | 261 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) |
262 | encodeAscii = go where | |
263 | go p = do e <- lift (next p) | |
264 | case e of | |
265 | Left r -> return (return r) | |
266 | Right (chunk, p') -> | |
267 | if T.null chunk | |
268 | then go p' | |
269 | else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk | |
270 | in do yield (B8.pack (T.unpack safe)) | |
271 | if T.null unsafe | |
272 | then go p' | |
273 | else return $ do yield unsafe | |
274 | p' | |
275 | ||
276 | {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream, | |
277 | returning the rest of the 'Text' upon hitting any non-latin 'Char' | |
278 | -} | |
279 | encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | |
280 | encodeIso8859_1 = go where | |
281 | go p = do e <- lift (next p) | |
282 | case e of | |
283 | Left r -> return (return r) | |
284 | Right (txt, p') -> | |
285 | if T.null txt | |
286 | then go p' | |
287 | else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt | |
288 | in do yield (B8.pack (T.unpack safe)) | |
289 | if T.null unsafe | |
290 | then go p' | |
291 | else return $ do yield unsafe | |
292 | p' | |
293 | ||
294 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | |
295 | unused 'ByteString' upon hitting an un-ascii byte. | |
296 | -} | |
297 | decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
298 | decodeAscii = go where | |
299 | go p = do e <- lift (next p) | |
300 | case e of | |
301 | Left r -> return (return r) | |
302 | Right (chunk, p') -> | |
303 | if B.null chunk | |
304 | then go p' | |
305 | else let (safe, unsafe) = B.span (<= 0x7F) chunk | |
306 | in do yield (T.pack (B8.unpack safe)) | |
307 | if B.null unsafe | |
308 | then go p' | |
309 | else return (do yield unsafe | |
310 | p') | |
311 | ||
312 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | |
313 | unused 'ByteString' upon hitting the rare un-latinizable byte. | |
314 | -} | |
315 | decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
316 | decodeIso8859_1 = go where | |
317 | go p = do e <- lift (next p) | |
318 | case e of | |
319 | Left r -> return (return r) | |
320 | Right (chunk, p') -> | |
321 | if B.null chunk | |
322 | then go p' | |
323 | else do let (safe, unsafe) = B.span (<= 0xFF) chunk | |
324 | yield (T.pack (B8.unpack safe)) | |
325 | if B.null unsafe | |
326 | then go p' | |
327 | else return (do yield unsafe | |
328 | p') | |
329 | ||
330 | ||
331 |