]> git.immae.eu Git - github/fretlink/text-pipes.git/blame - Pipes/Text/Encoding.hs
more documentation
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
CommitLineData
bbdfd305 1{-# LANGUAGE RankNTypes, BangPatterns #-}
89d80557 2
0ac0c414 3-- | This module uses the stream decoding functions from Michael Snoyman's new
4-- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
5-- package to define decoding functions and lenses.
bbdfd305 6
7module Pipes.Text.Encoding
fafcbeb5 8 (
0ac0c414 9 -- * The Lens or Codec type
fafcbeb5 10 -- $lenses
11 Codec
0ac0c414 12 -- * Viewing the Text in a ByteString
fafcbeb5 13 -- $codecs
0ac0c414 14 , decode
bbdfd305 15 , utf8
16 , utf8Pure
17 , utf16LE
18 , utf16BE
19 , utf32LE
20 , utf32BE
fafcbeb5 21 -- * Non-lens decoding functions
0ac0c414 22 -- $decoders
89d80557 23 , decodeUtf8
24 , decodeUtf8Pure
25 , decodeUtf16LE
26 , decodeUtf16BE
27 , decodeUtf32LE
28 , decodeUtf32BE
0ac0c414 29 -- * Re-encoding functions
30 -- $encoders
31 , encodeUtf8
32 , encodeUtf16LE
33 , encodeUtf16BE
34 , encodeUtf32LE
35 , encodeUtf32BE
fafcbeb5 36 -- * Functions for latin and ascii text
37 -- $ascii
bbdfd305 38 , encodeAscii
39 , decodeAscii
40 , encodeIso8859_1
41 , decodeIso8859_1
42 )
43 where
44
0ac0c414 45import Data.Functor.Constant (Constant(..))
bbdfd305 46import Data.Char (ord)
47import Data.ByteString as B
48import Data.ByteString (ByteString)
bbdfd305 49import Data.ByteString.Char8 as B8
50import Data.Text (Text)
51import qualified Data.Text as T
52import qualified Data.Text.Encoding as TE
53import Data.Text.StreamDecoding
70125641 54import Control.Monad (join)
89d80557 55import Data.Word (Word8)
bbdfd305 56import Pipes
bbdfd305 57
fafcbeb5 58
21eb409c 59type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
bbdfd305 60
fafcbeb5 61{- $lenses
0ac0c414 62 The 'Codec' type is a simple specializion of
63 the @Lens'@ type synonymn used by the standard lens libraries,
64 <http://hackage.haskell.org/package/lens lens> and
65 <http://hackage.haskell.org/package/lens-family lens-family>. That type,
fafcbeb5 66
0ac0c414 67> type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
68
69 is just an alias for an ordinary Prelude type. Thus you use any codec with
70 the @view@ / @(^.)@ and @zoom@ functions from those libraries.
71
fafcbeb5 72 -}
73
21eb409c
GG
74type Codec
75 = forall m r
76 . Monad m
77 => Lens' (Producer ByteString m r)
78 (Producer Text m (Producer ByteString m r))
0ac0c414 79
80{- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
81 exported here for convience
82
83> decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf
84
85-}
86
87decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
88decode codec a = getConstant (codec Constant a)
89
bbdfd305 90
91decodeStream :: Monad m
92 => (B.ByteString -> DecodeResult)
93 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
94decodeStream = loop where
95 loop dec0 p =
96 do x <- lift (next p)
97 case x of Left r -> return (return r)
98 Right (chunk, p') -> case dec0 chunk of
99 DecodeResultSuccess text dec -> do yield text
100 loop dec p'
101 DecodeResultFailure text bs -> do yield text
102 return (do yield bs
103 p')
104{-# INLINABLE decodeStream#-}
105
0ac0c414 106{- $decoders
107 These are functions with the simple type:
108
109> decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
110
111 Thus in general
112
113> decodeUtf8 = view utf8
114> decodeUtf16LE = view utf16LE
fafcbeb5 115
0ac0c414 116 and so forth, but these forms
117 may be more convenient (and give better type errors!) where lenses are
118 not desired.
119-}
fafcbeb5 120
121
bbdfd305 122decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
123decodeUtf8 = decodeStream streamUtf8
124{-# INLINE decodeUtf8 #-}
125
126decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
127decodeUtf8Pure = decodeStream streamUtf8Pure
128{-# INLINE decodeUtf8Pure #-}
129
130decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
131decodeUtf16LE = decodeStream streamUtf16LE
132{-# INLINE decodeUtf16LE #-}
133
134decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
135decodeUtf16BE = decodeStream streamUtf16BE
136{-# INLINE decodeUtf16BE #-}
137
138decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
139decodeUtf32LE = decodeStream streamUtf32LE
140{-# INLINE decodeUtf32LE #-}
141
142decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
143decodeUtf32BE = decodeStream streamUtf32BE
144{-# INLINE decodeUtf32BE #-}
145
0ac0c414 146
147{- $encoders
148 These are simply defined
149
150> encodeUtf8 = yield . TE.encodeUtf8
151
152 They are intended for use with 'for'
153
154> for Text.stdin encodeUtf8 :: Producer ByteString IO ()
155
156 which would have the effect of
157
158> Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
159
160 using the encoding functions from Data.Text.Encoding
161-}
162
163encodeUtf8 :: Monad m => Text -> Producer ByteString m ()
164encodeUtf8 = yield . TE.encodeUtf8
165encodeUtf16LE :: Monad m => Text -> Producer ByteString m ()
166encodeUtf16LE = yield . TE.encodeUtf16LE
167encodeUtf16BE :: Monad m => Text -> Producer ByteString m ()
168encodeUtf16BE = yield . TE.encodeUtf16BE
169encodeUtf32LE :: Monad m => Text -> Producer ByteString m ()
170encodeUtf32LE = yield . TE.encodeUtf32LE
171encodeUtf32BE :: Monad m => Text -> Producer ByteString m ()
172encodeUtf32BE = yield . TE.encodeUtf32BE
173
bbdfd305 174mkCodec :: (forall r m . Monad m =>
175 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
176 -> (Text -> ByteString)
177 -> Codec
178mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
179
180
fafcbeb5 181{- $codecs
182
0ac0c414 183 Each codec/lens looks into a byte stream that is supposed to contain text.
184 The particular \'Codec\' lenses are named in accordance with the expected
185 encoding, 'utf8', 'utf16LE' etc. @view@ / @(^.)@ -- here also called 'decode' --
186 turns a Codec into a function:
fafcbeb5 187
188> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
0ac0c414 189> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
fafcbeb5 190> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
191
0ac0c414 192 Uses of a codec with @view@ / @(^.)@ / 'decode' can always be replaced by the specialized
193 decoding functions exported here, e.g.
194
195> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
196> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
197
198 The stream of text a @Codec@ \'sees\' in the stream of bytes begins at its head.
199 At any point of decoding failure, the stream of text ends and reverts to (returns)
200 the original byte stream. Thus if the first bytes are already
201 un-decodable, the whole ByteString producer will be returned, i.e.
202
203> view utf8 bytestream
204
205 will just come to the same as
206
207> return bytestream
208
209 Where there is no decoding failure, the return value of the text stream will be
210 an empty byte stream followed by its own return value. In all cases you must
211 deal with the fact that it is a ByteString producer that is returned, even if
212 it can be thrown away with @Control.Monad.void@
213
214> void (Bytes.stdin ^. utf8) :: Producer Text IO ()
215
fafcbeb5 216 @zoom@ converts a Text parser into a ByteString parser:
217
218> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
219>
220> withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
221> withNextByte = do char_ <- zoom utf8 Text.drawChar
222> byte_ <- Bytes.peekByte
223> return (char_, byte_)
224
225 @withNextByte@ will return the first valid Char in a ByteString,
226 and the first byte of the next character, if they exists. Because
227 we \'draw\' one and \'peek\' at the other, the parser as a whole only
228 advances one Char's length along the bytestring.
229
230 -}
bbdfd305 231
232utf8 :: Codec
233utf8 = mkCodec decodeUtf8 TE.encodeUtf8
234
235utf8Pure :: Codec
236utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
237
238utf16LE :: Codec
239utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
240
241utf16BE :: Codec
242utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
243
244utf32LE :: Codec
245utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
246
247utf32BE :: Codec
248utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
249
250
fafcbeb5 251{- $ascii
252 ascii and latin encodings only use a small number of the characters 'Text'
253 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
bbdfd305 254 Rather we simply define functions each way.
bbdfd305 255-}
256
fafcbeb5 257
0ac0c414 258-- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
fafcbeb5 259-- returning the rest of the 'Text' at the first non-ascii 'Char'
260
bbdfd305 261encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
262encodeAscii = go where
263 go p = do e <- lift (next p)
264 case e of
265 Left r -> return (return r)
266 Right (chunk, p') ->
267 if T.null chunk
268 then go p'
269 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
270 in do yield (B8.pack (T.unpack safe))
271 if T.null unsafe
272 then go p'
273 else return $ do yield unsafe
274 p'
275
276{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
277 returning the rest of the 'Text' upon hitting any non-latin 'Char'
278 -}
279encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
280encodeIso8859_1 = go where
281 go p = do e <- lift (next p)
282 case e of
283 Left r -> return (return r)
284 Right (txt, p') ->
285 if T.null txt
286 then go p'
287 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
288 in do yield (B8.pack (T.unpack safe))
289 if T.null unsafe
290 then go p'
291 else return $ do yield unsafe
292 p'
293
294{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
295 unused 'ByteString' upon hitting an un-ascii byte.
296 -}
297decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
298decodeAscii = go where
299 go p = do e <- lift (next p)
300 case e of
301 Left r -> return (return r)
302 Right (chunk, p') ->
303 if B.null chunk
304 then go p'
305 else let (safe, unsafe) = B.span (<= 0x7F) chunk
306 in do yield (T.pack (B8.unpack safe))
307 if B.null unsafe
308 then go p'
309 else return (do yield unsafe
310 p')
311
312{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
313 unused 'ByteString' upon hitting the rare un-latinizable byte.
314 -}
315decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
316decodeIso8859_1 = go where
317 go p = do e <- lift (next p)
318 case e of
319 Left r -> return (return r)
320 Right (chunk, p') ->
321 if B.null chunk
322 then go p'
323 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
324 yield (T.pack (B8.unpack safe))
325 if B.null unsafe
326 then go p'
327 else return (do yield unsafe
328 p')
329
330
331