]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Encoding.hs
f26f168e7847935a02dda61ce2291a2a31436ef4
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
1 {-# LANGUAGE RankNTypes, BangPatterns #-}
2
3 -- | This module uses the stream decoding functions from Michael Snoyman's new
4 -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
5 -- package to define decoding functions and lenses. The exported names
6 -- conflict with names in @Data.Text.Encoding@ but the module can otherwise be
7 -- imported unqualified.
8
9 module Pipes.Text.Encoding
10 (
11 -- * The Lens or Codec type
12 -- $lenses
13 Codec
14 , decode
15 , eof
16 -- * \'Viewing\' the Text in a byte stream
17 -- $codecs
18 , utf8
19 , utf8Pure
20 , utf16LE
21 , utf16BE
22 , utf32LE
23 , utf32BE
24 -- * Non-lens decoding functions
25 -- $decoders
26 , decodeUtf8
27 , decodeUtf8Pure
28 , decodeUtf16LE
29 , decodeUtf16BE
30 , decodeUtf32LE
31 , decodeUtf32BE
32 -- * Re-encoding functions
33 -- $encoders
34 , encodeUtf8
35 , encodeUtf16LE
36 , encodeUtf16BE
37 , encodeUtf32LE
38 , encodeUtf32BE
39 -- * Functions for latin and ascii text
40 -- $ascii
41 , encodeAscii
42 , decodeAscii
43 , encodeIso8859_1
44 , decodeIso8859_1
45 )
46 where
47
48 import Data.Functor.Constant (Constant(..))
49 import Data.Char (ord)
50 import Data.ByteString as B
51 import Data.ByteString (ByteString)
52 import Data.ByteString.Char8 as B8
53 import Data.Text (Text)
54 import qualified Data.Text as T
55 import qualified Data.Text.Encoding as TE
56 import qualified Data.Streaming.Text as Stream
57 import Data.Streaming.Text (DecodeResult(..))
58 import Control.Monad (join)
59 import Data.Word (Word8)
60 import Pipes
61
62 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
63
64 {- $lenses
65 The 'Codec' type is a simple specializion of
66 the @Lens'@ type synonymn used by the standard lens libraries,
67 <http://hackage.haskell.org/package/lens lens> and
68 <http://hackage.haskell.org/package/lens-family lens-family>. That type,
69
70 > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
71
72 is just an alias for a Prelude type. Thus you use any particular codec with
73 the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
74 we presuppose neither since we already have access to the types they require.
75
76 -}
77
78 type Codec
79 = forall m r
80 . Monad m
81 => Lens' (Producer ByteString m r)
82 (Producer Text m (Producer ByteString m r))
83
84 {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
85 exported here under a name appropriate to the material. All of these are
86 the same:
87
88 > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
89
90 -}
91
92
93 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
94 decode codec a = getConstant (codec Constant a)
95
96 {- | 'eof' tells you explicitly when decoding stops due to bad bytes or instead
97 reaches end-of-file happily. (Without it one just makes an explicit test
98 for emptiness of the resulting bytestring production using 'next')
99 Thus
100
101 > decode (utf8 . eof) p = view (utf8 . eof) p = p^.utf8.eof
102
103 will be a text producer. If we hit undecodable bytes, the remaining
104 bytestring producer will be returned as a 'Left' value;
105 in the happy case, a 'Right' value is returned with the anticipated
106 return value for the original bytestring producer.
107 )
108
109 -}
110
111 eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
112 (Producer Text m (Either (Producer ByteString m r) r))
113 eof k p = fmap fromEither (k (toEither p)) where
114
115 fromEither = liftM (either id return)
116
117 toEither pp = do p <- pp
118 check p
119
120 check p = do e <- lift (next p)
121 case e of
122 Left r -> return (Right r)
123 Right (bs,pb) -> if B.null bs
124 then check pb
125 else return (Left (do yield bs
126 pb))
127
128
129 {- $codecs
130
131 Each Codec-lens looks into a byte stream that is supposed to contain text.
132 The particular \'Codec\' lenses are named in accordance with the expected
133 encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
134 use @view@ / @(^.)@ -- here also called 'decode':
135
136 > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
137 > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
138 > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
139
140 Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
141 decoding functions exported here, e.g.
142
143 > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
144 > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
145
146 The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
147 At any point of decoding failure, the stream of text ends and reverts to (returns)
148 the original byte stream. Thus if the first bytes are already
149 un-decodable, the whole ByteString producer will be returned, i.e.
150
151 > view utf8 bytestream
152
153 will just come to the same as
154
155 > return bytestream
156
157 Where there is no decoding failure, the return value of the text stream will be
158 an empty byte stream followed by its own return value. In all cases you must
159 deal with the fact that it is a /ByteString producer/ that is returned, even if
160 it can be thrown away with @Control.Monad.void@
161
162 > void (Bytes.stdin ^. utf8) :: Producer Text IO ()
163
164 @zoom@ converts a Text parser into a ByteString parser:
165
166 > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
167
168 or, using the type synonymn from @Pipes.Parse@:
169
170 > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
171
172 Thus we can define a ByteString parser like this:
173
174 > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
175 > withNextByte = do char_ <- zoom utf8 Text.drawChar
176 > byte_ <- Bytes.peekByte
177 > return (char_, byte_)
178
179 Though @withNextByte@ is partly defined with a Text parser 'drawChar';
180 but it is a ByteString parser; it will return the first valid utf8-encoded
181 Char in a ByteString, whatever its length,
182 and the first byte of the next character, if they exist. Because
183 we \'draw\' one and \'peek\' at the other, the parser as a whole only
184 advances one Char's length along the bytestring, whatever that length may be.
185 See the slightly more complex example \'decode.hs\' in the
186 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
187 discussion of this type of byte stream parsing.
188 -}
189
190 utf8 :: Codec
191 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
192
193 utf8Pure :: Codec
194 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
195
196 utf16LE :: Codec
197 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
198
199 utf16BE :: Codec
200 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
201
202 utf32LE :: Codec
203 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
204
205 utf32BE :: Codec
206 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
207
208 decodeStream :: Monad m
209 => (B.ByteString -> DecodeResult)
210 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
211 decodeStream = loop where
212 loop dec0 p =
213 do x <- lift (next p)
214 case x of Left r -> return (return r)
215 Right (chunk, p') -> case dec0 chunk of
216 DecodeResultSuccess text dec -> do yield text
217 loop dec p'
218 DecodeResultFailure text bs -> do yield text
219 return (do yield bs
220 p')
221 {-# INLINABLE decodeStream#-}
222
223
224 {- $decoders
225 These are functions with the simple type:
226
227 > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
228
229 Thus in general
230
231 > decodeUtf8 = view utf8
232 > decodeUtf16LE = view utf16LE
233
234 and so forth, but these forms
235 may be more convenient (and give better type errors!) where lenses are
236 not desired.
237 -}
238
239
240 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
241 decodeUtf8 = decodeStream Stream.decodeUtf8
242 {-# INLINE decodeUtf8 #-}
243
244 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
245 decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
246 {-# INLINE decodeUtf8Pure #-}
247
248 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
249 decodeUtf16LE = decodeStream Stream.decodeUtf16LE
250 {-# INLINE decodeUtf16LE #-}
251
252 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
253 decodeUtf16BE = decodeStream Stream.decodeUtf16BE
254 {-# INLINE decodeUtf16BE #-}
255
256 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
257 decodeUtf32LE = decodeStream Stream.decodeUtf32LE
258 {-# INLINE decodeUtf32LE #-}
259
260 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
261 decodeUtf32BE = decodeStream Stream.decodeUtf32BE
262 {-# INLINE decodeUtf32BE #-}
263
264
265 {- $encoders
266 These are simply defined
267
268 > encodeUtf8 = yield . TE.encodeUtf8
269
270 They are intended for use with 'for'
271
272 > for Text.stdin encodeUtf8 :: Producer ByteString IO ()
273
274 which would have the effect of
275
276 > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
277
278 using the encoding functions from Data.Text.Encoding
279 -}
280
281 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
282 encodeUtf8 = yield . TE.encodeUtf8
283 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
284 encodeUtf16LE = yield . TE.encodeUtf16LE
285 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
286 encodeUtf16BE = yield . TE.encodeUtf16BE
287 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
288 encodeUtf32LE = yield . TE.encodeUtf32LE
289 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
290 encodeUtf32BE = yield . TE.encodeUtf32BE
291
292 mkCodec :: (forall r m . Monad m =>
293 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
294 -> (Text -> ByteString)
295 -> Codec
296 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
297
298
299
300 {- $ascii
301 ascii and latin encodings only use a small number of the characters 'Text'
302 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
303 Rather we simply define functions each way.
304 -}
305
306
307 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
308 -- returning the rest of the 'Text' at the first non-ascii 'Char'
309
310 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
311 encodeAscii = go where
312 go p = do e <- lift (next p)
313 case e of
314 Left r -> return (return r)
315 Right (chunk, p') ->
316 if T.null chunk
317 then go p'
318 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
319 in do yield (B8.pack (T.unpack safe))
320 if T.null unsafe
321 then go p'
322 else return $ do yield unsafe
323 p'
324
325 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
326 returning the rest of the 'Text' upon hitting any non-latin 'Char'
327 -}
328 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
329 encodeIso8859_1 = go where
330 go p = do e <- lift (next p)
331 case e of
332 Left r -> return (return r)
333 Right (txt, p') ->
334 if T.null txt
335 then go p'
336 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
337 in do yield (B8.pack (T.unpack safe))
338 if T.null unsafe
339 then go p'
340 else return $ do yield unsafe
341 p'
342
343 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
344 unused 'ByteString' upon hitting an un-ascii byte.
345 -}
346 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
347 decodeAscii = go where
348 go p = do e <- lift (next p)
349 case e of
350 Left r -> return (return r)
351 Right (chunk, p') ->
352 if B.null chunk
353 then go p'
354 else let (safe, unsafe) = B.span (<= 0x7F) chunk
355 in do yield (T.pack (B8.unpack safe))
356 if B.null unsafe
357 then go p'
358 else return (do yield unsafe
359 p')
360
361 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
362 unused 'ByteString' upon hitting the rare un-latinizable byte.
363 -}
364 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
365 decodeIso8859_1 = go where
366 go p = do e <- lift (next p)
367 case e of
368 Left r -> return (return r)
369 Right (chunk, p') ->
370 if B.null chunk
371 then go p'
372 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
373 yield (T.pack (B8.unpack safe))
374 if B.null unsafe
375 then go p'
376 else return (do yield unsafe
377 p')
378
379
380