]> git.immae.eu Git - github/fretlink/text-pipes.git/blame - Pipes/Text/Encoding.hs
travis apt 3
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
CommitLineData
bbdfd305 1{-# LANGUAGE RankNTypes, BangPatterns #-}
89d80557 2
0ac0c414 3-- | This module uses the stream decoding functions from Michael Snoyman's new
4-- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
4ea59a8b 5-- package to define decoding functions and lenses. The exported names
6-- conflict with names in @Data.Text.Encoding@ but the module can otherwise be
7-- imported unqualified.
bbdfd305 8
9module Pipes.Text.Encoding
fafcbeb5 10 (
0ac0c414 11 -- * The Lens or Codec type
fafcbeb5 12 -- $lenses
13 Codec
4ea59a8b 14 , decode
a4913c42 15 -- * \'Viewing\' the Text in a byte stream
fafcbeb5 16 -- $codecs
bbdfd305 17 , utf8
18 , utf8Pure
19 , utf16LE
20 , utf16BE
21 , utf32LE
22 , utf32BE
fafcbeb5 23 -- * Non-lens decoding functions
0ac0c414 24 -- $decoders
89d80557 25 , decodeUtf8
26 , decodeUtf8Pure
27 , decodeUtf16LE
28 , decodeUtf16BE
29 , decodeUtf32LE
30 , decodeUtf32BE
0ac0c414 31 -- * Re-encoding functions
32 -- $encoders
33 , encodeUtf8
34 , encodeUtf16LE
35 , encodeUtf16BE
36 , encodeUtf32LE
37 , encodeUtf32BE
fafcbeb5 38 -- * Functions for latin and ascii text
39 -- $ascii
bbdfd305 40 , encodeAscii
41 , decodeAscii
42 , encodeIso8859_1
43 , decodeIso8859_1
44 )
45 where
46
0ac0c414 47import Data.Functor.Constant (Constant(..))
bbdfd305 48import Data.Char (ord)
49import Data.ByteString as B
50import Data.ByteString (ByteString)
bbdfd305 51import Data.ByteString.Char8 as B8
52import Data.Text (Text)
53import qualified Data.Text as T
54import qualified Data.Text.Encoding as TE
eae50557 55import qualified Data.Streaming.Text as Stream
56import Data.Streaming.Text (DecodeResult(..))
70125641 57import Control.Monad (join)
89d80557 58import Data.Word (Word8)
bbdfd305 59import Pipes
bbdfd305 60
2f4a83f8 61type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
bbdfd305 62
fafcbeb5 63{- $lenses
0ac0c414 64 The 'Codec' type is a simple specializion of
2f4a83f8 65 the @Lens'@ type synonymn used by the standard lens libraries,
0ac0c414 66 <http://hackage.haskell.org/package/lens lens> and
67 <http://hackage.haskell.org/package/lens-family lens-family>. That type,
fafcbeb5 68
2f4a83f8 69> type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
0ac0c414 70
4ea59a8b 71 is just an alias for a Prelude type. Thus you use any particular codec with
72 the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
73 we presuppose neither since we already have access to the types they require.
0ac0c414 74
fafcbeb5 75 -}
76
21eb409c 77type Codec
d199072b 78 = forall m r
21eb409c 79 . Monad m
2f4a83f8 80 => Lens' (Producer ByteString m r)
d199072b 81 (Producer Text m (Producer ByteString m r))
82
0ac0c414 83{- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
4ea59a8b 84 exported here under a name appropriate to the material. All of these are
85 the same:
0ac0c414 86
4ea59a8b 87> decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
0ac0c414 88
89-}
90
91decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
92decode codec a = getConstant (codec Constant a)
93
bbdfd305 94
4ea59a8b 95{- $codecs
96
97 Each Codec-lens looks into a byte stream that is supposed to contain text.
98 The particular \'Codec\' lenses are named in accordance with the expected
99 encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
100 use @view@ / @(^.)@ -- here also called 'decode':
101
102> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
103> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
104> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
105
106 Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
107 decoding functions exported here, e.g.
108
109> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
110> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
111
112 The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
113 At any point of decoding failure, the stream of text ends and reverts to (returns)
114 the original byte stream. Thus if the first bytes are already
115 un-decodable, the whole ByteString producer will be returned, i.e.
116
117> view utf8 bytestream
118
119 will just come to the same as
120
121> return bytestream
122
123 Where there is no decoding failure, the return value of the text stream will be
124 an empty byte stream followed by its own return value. In all cases you must
125 deal with the fact that it is a /ByteString producer/ that is returned, even if
126 it can be thrown away with @Control.Monad.void@
127
128> void (Bytes.stdin ^. utf8) :: Producer Text IO ()
129
130 @zoom@ converts a Text parser into a ByteString parser:
131
132> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
133
a4913c42 134 or, using the type synonymn from @Pipes.Parse@:
4ea59a8b 135
136> zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
137
a4913c42 138 Thus we can define a ByteString parser like this:
4ea59a8b 139
140> withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
141> withNextByte = do char_ <- zoom utf8 Text.drawChar
142> byte_ <- Bytes.peekByte
143> return (char_, byte_)
144
145 Though @withNextByte@ is partly defined with a Text parser 'drawChar';
146 but it is a ByteString parser; it will return the first valid utf8-encoded
147 Char in a ByteString, whatever its length,
148 and the first byte of the next character, if they exist. Because
149 we \'draw\' one and \'peek\' at the other, the parser as a whole only
150 advances one Char's length along the bytestring, whatever that length may be.
151 See the slightly more complex example \'decode.hs\' in the
152 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
153 discussion of this type of byte stream parsing.
154 -}
155
156utf8 :: Codec
157utf8 = mkCodec decodeUtf8 TE.encodeUtf8
158
159utf8Pure :: Codec
160utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
161
162utf16LE :: Codec
163utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
164
165utf16BE :: Codec
166utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
167
168utf32LE :: Codec
169utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
170
171utf32BE :: Codec
172utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
173
bbdfd305 174decodeStream :: Monad m
175 => (B.ByteString -> DecodeResult)
176 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
177decodeStream = loop where
178 loop dec0 p =
179 do x <- lift (next p)
180 case x of Left r -> return (return r)
181 Right (chunk, p') -> case dec0 chunk of
182 DecodeResultSuccess text dec -> do yield text
183 loop dec p'
184 DecodeResultFailure text bs -> do yield text
185 return (do yield bs
186 p')
187{-# INLINABLE decodeStream#-}
188
0ac0c414 189{- $decoders
190 These are functions with the simple type:
191
192> decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
193
194 Thus in general
195
196> decodeUtf8 = view utf8
197> decodeUtf16LE = view utf16LE
fafcbeb5 198
0ac0c414 199 and so forth, but these forms
200 may be more convenient (and give better type errors!) where lenses are
201 not desired.
202-}
fafcbeb5 203
204
bbdfd305 205decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 206decodeUtf8 = decodeStream Stream.decodeUtf8
bbdfd305 207{-# INLINE decodeUtf8 #-}
208
209decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 210decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
bbdfd305 211{-# INLINE decodeUtf8Pure #-}
212
213decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 214decodeUtf16LE = decodeStream Stream.decodeUtf16LE
bbdfd305 215{-# INLINE decodeUtf16LE #-}
216
217decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 218decodeUtf16BE = decodeStream Stream.decodeUtf16BE
bbdfd305 219{-# INLINE decodeUtf16BE #-}
220
221decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 222decodeUtf32LE = decodeStream Stream.decodeUtf32LE
bbdfd305 223{-# INLINE decodeUtf32LE #-}
224
225decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
eae50557 226decodeUtf32BE = decodeStream Stream.decodeUtf32BE
bbdfd305 227{-# INLINE decodeUtf32BE #-}
228
0ac0c414 229
230{- $encoders
231 These are simply defined
232
233> encodeUtf8 = yield . TE.encodeUtf8
234
235 They are intended for use with 'for'
236
237> for Text.stdin encodeUtf8 :: Producer ByteString IO ()
238
239 which would have the effect of
240
241> Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
242
243 using the encoding functions from Data.Text.Encoding
244-}
245
10cfd90e 246encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 247encodeUtf8 = yield . TE.encodeUtf8
10cfd90e 248encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 249encodeUtf16LE = yield . TE.encodeUtf16LE
10cfd90e 250encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 251encodeUtf16BE = yield . TE.encodeUtf16BE
10cfd90e 252encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 253encodeUtf32LE = yield . TE.encodeUtf32LE
10cfd90e 254encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
0ac0c414 255encodeUtf32BE = yield . TE.encodeUtf32BE
256
bbdfd305 257mkCodec :: (forall r m . Monad m =>
258 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
259 -> (Text -> ByteString)
260 -> Codec
261mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
262
263
bbdfd305 264
fafcbeb5 265{- $ascii
266 ascii and latin encodings only use a small number of the characters 'Text'
267 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
bbdfd305 268 Rather we simply define functions each way.
bbdfd305 269-}
270
fafcbeb5 271
0ac0c414 272-- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
fafcbeb5 273-- returning the rest of the 'Text' at the first non-ascii 'Char'
274
bbdfd305 275encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
276encodeAscii = go where
277 go p = do e <- lift (next p)
278 case e of
279 Left r -> return (return r)
280 Right (chunk, p') ->
281 if T.null chunk
282 then go p'
283 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
284 in do yield (B8.pack (T.unpack safe))
285 if T.null unsafe
286 then go p'
287 else return $ do yield unsafe
288 p'
289
290{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
291 returning the rest of the 'Text' upon hitting any non-latin 'Char'
292 -}
293encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
294encodeIso8859_1 = go where
295 go p = do e <- lift (next p)
296 case e of
297 Left r -> return (return r)
298 Right (txt, p') ->
299 if T.null txt
300 then go p'
301 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
302 in do yield (B8.pack (T.unpack safe))
303 if T.null unsafe
304 then go p'
305 else return $ do yield unsafe
306 p'
307
308{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
309 unused 'ByteString' upon hitting an un-ascii byte.
310 -}
311decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
312decodeAscii = go where
313 go p = do e <- lift (next p)
314 case e of
315 Left r -> return (return r)
316 Right (chunk, p') ->
317 if B.null chunk
318 then go p'
319 else let (safe, unsafe) = B.span (<= 0x7F) chunk
320 in do yield (T.pack (B8.unpack safe))
321 if B.null unsafe
322 then go p'
323 else return (do yield unsafe
324 p')
325
326{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
327 unused 'ByteString' upon hitting the rare un-latinizable byte.
328 -}
329decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
330decodeIso8859_1 = go where
331 go p = do e <- lift (next p)
332 case e of
333 Left r -> return (return r)
334 Right (chunk, p') ->
335 if B.null chunk
336 then go p'
337 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
338 yield (T.pack (B8.unpack safe))
339 if B.null unsafe
340 then go p'
341 else return (do yield unsafe
342 p')
343
344
345