]> git.immae.eu Git - github/fretlink/text-pipes.git/blame - Pipes/Text/Encoding.hs
tutorial nonsense
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
CommitLineData
bbdfd305 1{-# LANGUAGE RankNTypes, BangPatterns #-}
89d80557 2
b091cbeb 3-- | This module uses the stream decoding functions from
0ac0c414 4-- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
4ea59a8b 5-- package to define decoding functions and lenses. The exported names
b091cbeb 6-- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@
bbdfd305 7
8module Pipes.Text.Encoding
fafcbeb5 9 (
0ac0c414 10 -- * The Lens or Codec type
fafcbeb5 11 -- $lenses
12 Codec
4ea59a8b 13 , decode
6c2fffdc 14 , eof
a4913c42 15 -- * \'Viewing\' the Text in a byte stream
fafcbeb5 16 -- $codecs
bbdfd305 17 , utf8
18 , utf8Pure
19 , utf16LE
20 , utf16BE
21 , utf32LE
22 , utf32BE
fafcbeb5 23 -- * Non-lens decoding functions
0ac0c414 24 -- $decoders
89d80557 25 , decodeUtf8
26 , decodeUtf8Pure
27 , decodeUtf16LE
28 , decodeUtf16BE
29 , decodeUtf32LE
30 , decodeUtf32BE
0ac0c414 31 -- * Re-encoding functions
32 -- $encoders
33 , encodeUtf8
34 , encodeUtf16LE
35 , encodeUtf16BE
36 , encodeUtf32LE
37 , encodeUtf32BE
fafcbeb5 38 -- * Functions for latin and ascii text
39 -- $ascii
bbdfd305 40 , encodeAscii
41 , decodeAscii
42 , encodeIso8859_1
43 , decodeIso8859_1
44 )
45 where
46
0ac0c414 47import Data.Functor.Constant (Constant(..))
bbdfd305 48import Data.Char (ord)
49import Data.ByteString as B
50import Data.ByteString (ByteString)
bbdfd305 51import Data.ByteString.Char8 as B8
52import Data.Text (Text)
53import qualified Data.Text as T
54import qualified Data.Text.Encoding as TE
eae50557 55import qualified Data.Streaming.Text as Stream
56import Data.Streaming.Text (DecodeResult(..))
b091cbeb 57import Control.Monad (join, liftM)
89d80557 58import Data.Word (Word8)
bbdfd305 59import Pipes
bbdfd305 60
2f4a83f8 61type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
bbdfd305 62
fafcbeb5 63{- $lenses
0ac0c414 64 The 'Codec' type is a simple specializion of
2f4a83f8 65 the @Lens'@ type synonymn used by the standard lens libraries,
0ac0c414 66 <http://hackage.haskell.org/package/lens lens> and
67 <http://hackage.haskell.org/package/lens-family lens-family>. That type,
fafcbeb5 68
2f4a83f8 69> type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
0ac0c414 70
4ea59a8b 71 is just an alias for a Prelude type. Thus you use any particular codec with
72 the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
b091cbeb 73 we presuppose neither library since we already have access to the types they require.
fafcbeb5 74 -}
75
21eb409c 76type Codec
d199072b 77 = forall m r
21eb409c 78 . Monad m
2f4a83f8 79 => Lens' (Producer ByteString m r)
d199072b 80 (Producer Text m (Producer ByteString m r))
81
0ac0c414 82{- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
4ea59a8b 83 exported here under a name appropriate to the material. All of these are
84 the same:
0ac0c414 85
4ea59a8b 86> decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
0ac0c414 87
88-}
89
6c2fffdc 90
0ac0c414 91decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
92decode codec a = getConstant (codec Constant a)
93
6c2fffdc 94{- | 'eof' tells you explicitly when decoding stops due to bad bytes or instead
95 reaches end-of-file happily. (Without it one just makes an explicit test
96 for emptiness of the resulting bytestring production using 'next')
97 Thus
98
99> decode (utf8 . eof) p = view (utf8 . eof) p = p^.utf8.eof
100
101 will be a text producer. If we hit undecodable bytes, the remaining
102 bytestring producer will be returned as a 'Left' value;
103 in the happy case, a 'Right' value is returned with the anticipated
104 return value for the original bytestring producer.
105 )
106
107-}
108
109eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
110 (Producer Text m (Either (Producer ByteString m r) r))
111eof k p = fmap fromEither (k (toEither p)) where
112
113 fromEither = liftM (either id return)
114
115 toEither pp = do p <- pp
116 check p
117
118 check p = do e <- lift (next p)
119 case e of
120 Left r -> return (Right r)
121 Right (bs,pb) -> if B.null bs
122 then check pb
123 else return (Left (do yield bs
124 pb))
125
bbdfd305 126
4ea59a8b 127{- $codecs
128
129 Each Codec-lens looks into a byte stream that is supposed to contain text.
130 The particular \'Codec\' lenses are named in accordance with the expected
131 encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
132 use @view@ / @(^.)@ -- here also called 'decode':
133
134> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
135> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
136> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
137
138 Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
139 decoding functions exported here, e.g.
140
141> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
142> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
143
144 The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
145 At any point of decoding failure, the stream of text ends and reverts to (returns)
146 the original byte stream. Thus if the first bytes are already
147 un-decodable, the whole ByteString producer will be returned, i.e.
148
149> view utf8 bytestream
150
151 will just come to the same as
152
153> return bytestream
154
155 Where there is no decoding failure, the return value of the text stream will be
156 an empty byte stream followed by its own return value. In all cases you must
157 deal with the fact that it is a /ByteString producer/ that is returned, even if
158 it can be thrown away with @Control.Monad.void@
159
160> void (Bytes.stdin ^. utf8) :: Producer Text IO ()
161
162 @zoom@ converts a Text parser into a ByteString parser:
163
164> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
165
a4913c42 166 or, using the type synonymn from @Pipes.Parse@:
4ea59a8b 167
168> zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
169
a4913c42 170 Thus we can define a ByteString parser like this:
4ea59a8b 171
172> withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
173> withNextByte = do char_ <- zoom utf8 Text.drawChar
174> byte_ <- Bytes.peekByte
175> return (char_, byte_)
176
177 Though @withNextByte@ is partly defined with a Text parser 'drawChar';
178 but it is a ByteString parser; it will return the first valid utf8-encoded
179 Char in a ByteString, whatever its length,
180 and the first byte of the next character, if they exist. Because
181 we \'draw\' one and \'peek\' at the other, the parser as a whole only
182 advances one Char's length along the bytestring, whatever that length may be.
183 See the slightly more complex example \'decode.hs\' in the
184 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
185 discussion of this type of byte stream parsing.
186 -}
187
188utf8 :: Codec
189utf8 = mkCodec decodeUtf8 TE.encodeUtf8
190
191utf8Pure :: Codec
192utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
193
194utf16LE :: Codec
195utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
196
197utf16BE :: Codec
198utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
199
200utf32LE :: Codec
201utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
202
203utf32BE :: Codec
204utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
205
bbdfd305 206decodeStream :: Monad m
207 => (B.ByteString -> DecodeResult)
208 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
209decodeStream = loop where
210 loop dec0 p =
211 do x <- lift (next p)
212 case x of Left r -> return (return r)
213 Right (chunk, p') -> case dec0 chunk of
214 DecodeResultSuccess text dec -> do yield text
215 loop dec p'
216 DecodeResultFailure text bs -> do yield text
217 return (do yield bs
218 p')
219{-# INLINABLE decodeStream#-}
220
6c2fffdc 221
0ac0c414 222{- $decoders
223 These are functions with the simple type:
224
225> decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
226
227 Thus in general
228
229> decodeUtf8 = view utf8
230> decodeUtf16LE = view utf16LE
fafcbeb5 231
0ac0c414 232 and so forth, but these forms
233 may be more convenient (and give better type errors!) where lenses are
234 not desired.
235-}
fafcbeb5 236
237
bbdfd305 238decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 239decodeUtf8 = decodeStream Stream.decodeUtf8
bbdfd305 240{-# INLINE decodeUtf8 #-}
241
242decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 243decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
bbdfd305 244{-# INLINE decodeUtf8Pure #-}
245
246decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 247decodeUtf16LE = decodeStream Stream.decodeUtf16LE
bbdfd305 248{-# INLINE decodeUtf16LE #-}
249
250decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 251decodeUtf16BE = decodeStream Stream.decodeUtf16BE
bbdfd305 252{-# INLINE decodeUtf16BE #-}
253
254decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 255decodeUtf32LE = decodeStream Stream.decodeUtf32LE
bbdfd305 256{-# INLINE decodeUtf32LE #-}
257
258decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 259decodeUtf32BE = decodeStream Stream.decodeUtf32BE
bbdfd305 260{-# INLINE decodeUtf32BE #-}
261
0ac0c414 262
263{- $encoders
264 These are simply defined
265
266> encodeUtf8 = yield . TE.encodeUtf8
267
268 They are intended for use with 'for'
269
270> for Text.stdin encodeUtf8 :: Producer ByteString IO ()
271
272 which would have the effect of
273
274> Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
275
276 using the encoding functions from Data.Text.Encoding
277-}
278
10cfd90e 279encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 280encodeUtf8 = yield . TE.encodeUtf8
10cfd90e 281encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 282encodeUtf16LE = yield . TE.encodeUtf16LE
10cfd90e 283encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 284encodeUtf16BE = yield . TE.encodeUtf16BE
10cfd90e 285encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 286encodeUtf32LE = yield . TE.encodeUtf32LE
10cfd90e 287encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 288encodeUtf32BE = yield . TE.encodeUtf32BE
289
bbdfd305 290mkCodec :: (forall r m . Monad m =>
291 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
292 -> (Text -> ByteString)
293 -> Codec
294mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
295
296
bbdfd305 297
fafcbeb5 298{- $ascii
299 ascii and latin encodings only use a small number of the characters 'Text'
300 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
bbdfd305 301 Rather we simply define functions each way.
bbdfd305 302-}
303
fafcbeb5 304
0ac0c414 305-- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
fafcbeb5 306-- returning the rest of the 'Text' at the first non-ascii 'Char'
307
bbdfd305 308encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
309encodeAscii = go where
310 go p = do e <- lift (next p)
311 case e of
312 Left r -> return (return r)
313 Right (chunk, p') ->
314 if T.null chunk
315 then go p'
316 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
317 in do yield (B8.pack (T.unpack safe))
318 if T.null unsafe
319 then go p'
320 else return $ do yield unsafe
321 p'
322
323{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
324 returning the rest of the 'Text' upon hitting any non-latin 'Char'
325 -}
326encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
327encodeIso8859_1 = go where
328 go p = do e <- lift (next p)
329 case e of
330 Left r -> return (return r)
331 Right (txt, p') ->
332 if T.null txt
333 then go p'
334 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
335 in do yield (B8.pack (T.unpack safe))
336 if T.null unsafe
337 then go p'
338 else return $ do yield unsafe
339 p'
340
341{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
342 unused 'ByteString' upon hitting an un-ascii byte.
343 -}
344decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
345decodeAscii = go where
346 go p = do e <- lift (next p)
347 case e of
348 Left r -> return (return r)
349 Right (chunk, p') ->
350 if B.null chunk
351 then go p'
352 else let (safe, unsafe) = B.span (<= 0x7F) chunk
353 in do yield (T.pack (B8.unpack safe))
354 if B.null unsafe
355 then go p'
356 else return (do yield unsafe
357 p')
358
359{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
360 unused 'ByteString' upon hitting the rare un-latinizable byte.
361 -}
362decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
363decodeIso8859_1 = go where
364 go p = do e <- lift (next p)
365 case e of
366 Left r -> return (return r)
367 Right (chunk, p') ->
368 if B.null chunk
369 then go p'
370 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
371 yield (T.pack (B8.unpack safe))
372 if B.null unsafe
373 then go p'
374 else return (do yield unsafe
375 p')
376
377
378