]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Encoding.hs
991000f57dcaeef6621b284665c8cbb2dd6293a2
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
1 {-# LANGUAGE RankNTypes, BangPatterns #-}
2
3 -- | This module uses the stream decoding functions from Michael Snoyman's new
4 -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
5 -- package to define decoding functions and lenses. The exported names
6 -- conflict with names in @Data.Text.Encoding@ but the module can otherwise be
7 -- imported unqualified.
8
9 module Pipes.Text.Encoding
10 (
11 -- * The Lens or Codec type
12 -- $lenses
13 Codec
14 , decode
15 -- * \'Viewing\' the Text in a byte stream
16 -- $codecs
17 , utf8
18 , utf8Pure
19 , utf16LE
20 , utf16BE
21 , utf32LE
22 , utf32BE
23 -- * Non-lens decoding functions
24 -- $decoders
25 , decodeUtf8
26 , decodeUtf8Pure
27 , decodeUtf16LE
28 , decodeUtf16BE
29 , decodeUtf32LE
30 , decodeUtf32BE
31 -- * Re-encoding functions
32 -- $encoders
33 , encodeUtf8
34 , encodeUtf16LE
35 , encodeUtf16BE
36 , encodeUtf32LE
37 , encodeUtf32BE
38 -- * Functions for latin and ascii text
39 -- $ascii
40 , encodeAscii
41 , decodeAscii
42 , encodeIso8859_1
43 , decodeIso8859_1
44 , Lens'_
45 , Iso'_
46 )
47 where
48
49 import Data.Functor.Constant (Constant(..))
50 import Data.Profunctor (Profunctor)
51 import Data.Char (ord)
52 import Data.ByteString as B
53 import Data.ByteString (ByteString)
54 import Data.ByteString.Char8 as B8
55 import Data.Text (Text)
56 import qualified Data.Text as T
57 import qualified Data.Text.Encoding as TE
58 import qualified Data.Streaming.Text as Stream
59 import Data.Streaming.Text (DecodeResult(..))
60 import Control.Monad (join)
61 import Data.Word (Word8)
62 import Pipes
63
64 type Lens'_ a b = forall f . Functor f => (b -> f b) -> (a -> f a)
65 type Iso'_ a b = forall f p . (Functor f, Profunctor p) => p b (f b) -> p a (f a)
66
67 {- $lenses
68 The 'Codec' type is a simple specializion of
69 the @Lens'_@ type synonymn used by the standard lens libraries,
70 <http://hackage.haskell.org/package/lens lens> and
71 <http://hackage.haskell.org/package/lens-family lens-family>. That type,
72
73 > type Lens'_ a b = forall f . Functor f => (b -> f b) -> (a -> f a)
74
75 is just an alias for a Prelude type. Thus you use any particular codec with
76 the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
77 we presuppose neither since we already have access to the types they require.
78
79 -}
80
81 type Codec
82 = forall m r
83 . Monad m
84 => Lens'_ (Producer ByteString m r)
85 (Producer Text m (Producer ByteString m r))
86
87 {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
88 exported here under a name appropriate to the material. All of these are
89 the same:
90
91 > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
92
93 -}
94
95 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
96 decode codec a = getConstant (codec Constant a)
97
98
99 {- $codecs
100
101 Each Codec-lens looks into a byte stream that is supposed to contain text.
102 The particular \'Codec\' lenses are named in accordance with the expected
103 encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
104 use @view@ / @(^.)@ -- here also called 'decode':
105
106 > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
107 > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
108 > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
109
110 Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
111 decoding functions exported here, e.g.
112
113 > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
114 > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
115
116 The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
117 At any point of decoding failure, the stream of text ends and reverts to (returns)
118 the original byte stream. Thus if the first bytes are already
119 un-decodable, the whole ByteString producer will be returned, i.e.
120
121 > view utf8 bytestream
122
123 will just come to the same as
124
125 > return bytestream
126
127 Where there is no decoding failure, the return value of the text stream will be
128 an empty byte stream followed by its own return value. In all cases you must
129 deal with the fact that it is a /ByteString producer/ that is returned, even if
130 it can be thrown away with @Control.Monad.void@
131
132 > void (Bytes.stdin ^. utf8) :: Producer Text IO ()
133
134 @zoom@ converts a Text parser into a ByteString parser:
135
136 > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
137
138 or, using the type synonymn from @Pipes.Parse@:
139
140 > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
141
142 Thus we can define a ByteString parser like this:
143
144 > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
145 > withNextByte = do char_ <- zoom utf8 Text.drawChar
146 > byte_ <- Bytes.peekByte
147 > return (char_, byte_)
148
149 Though @withNextByte@ is partly defined with a Text parser 'drawChar';
150 but it is a ByteString parser; it will return the first valid utf8-encoded
151 Char in a ByteString, whatever its length,
152 and the first byte of the next character, if they exist. Because
153 we \'draw\' one and \'peek\' at the other, the parser as a whole only
154 advances one Char's length along the bytestring, whatever that length may be.
155 See the slightly more complex example \'decode.hs\' in the
156 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
157 discussion of this type of byte stream parsing.
158 -}
159
160 utf8 :: Codec
161 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
162
163 utf8Pure :: Codec
164 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
165
166 utf16LE :: Codec
167 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
168
169 utf16BE :: Codec
170 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
171
172 utf32LE :: Codec
173 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
174
175 utf32BE :: Codec
176 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
177
178 decodeStream :: Monad m
179 => (B.ByteString -> DecodeResult)
180 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
181 decodeStream = loop where
182 loop dec0 p =
183 do x <- lift (next p)
184 case x of Left r -> return (return r)
185 Right (chunk, p') -> case dec0 chunk of
186 DecodeResultSuccess text dec -> do yield text
187 loop dec p'
188 DecodeResultFailure text bs -> do yield text
189 return (do yield bs
190 p')
191 {-# INLINABLE decodeStream#-}
192
193 {- $decoders
194 These are functions with the simple type:
195
196 > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
197
198 Thus in general
199
200 > decodeUtf8 = view utf8
201 > decodeUtf16LE = view utf16LE
202
203 and so forth, but these forms
204 may be more convenient (and give better type errors!) where lenses are
205 not desired.
206 -}
207
208
209 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
210 decodeUtf8 = decodeStream Stream.decodeUtf8
211 {-# INLINE decodeUtf8 #-}
212
213 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
214 decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
215 {-# INLINE decodeUtf8Pure #-}
216
217 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
218 decodeUtf16LE = decodeStream Stream.decodeUtf16LE
219 {-# INLINE decodeUtf16LE #-}
220
221 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
222 decodeUtf16BE = decodeStream Stream.decodeUtf16BE
223 {-# INLINE decodeUtf16BE #-}
224
225 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
226 decodeUtf32LE = decodeStream Stream.decodeUtf32LE
227 {-# INLINE decodeUtf32LE #-}
228
229 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
230 decodeUtf32BE = decodeStream Stream.decodeUtf32BE
231 {-# INLINE decodeUtf32BE #-}
232
233
234 {- $encoders
235 These are simply defined
236
237 > encodeUtf8 = yield . TE.encodeUtf8
238
239 They are intended for use with 'for'
240
241 > for Text.stdin encodeUtf8 :: Producer ByteString IO ()
242
243 which would have the effect of
244
245 > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
246
247 using the encoding functions from Data.Text.Encoding
248 -}
249
250 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
251 encodeUtf8 = yield . TE.encodeUtf8
252 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
253 encodeUtf16LE = yield . TE.encodeUtf16LE
254 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
255 encodeUtf16BE = yield . TE.encodeUtf16BE
256 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
257 encodeUtf32LE = yield . TE.encodeUtf32LE
258 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
259 encodeUtf32BE = yield . TE.encodeUtf32BE
260
261 mkCodec :: (forall r m . Monad m =>
262 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
263 -> (Text -> ByteString)
264 -> Codec
265 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
266
267
268
269 {- $ascii
270 ascii and latin encodings only use a small number of the characters 'Text'
271 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
272 Rather we simply define functions each way.
273 -}
274
275
276 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
277 -- returning the rest of the 'Text' at the first non-ascii 'Char'
278
279 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
280 encodeAscii = go where
281 go p = do e <- lift (next p)
282 case e of
283 Left r -> return (return r)
284 Right (chunk, p') ->
285 if T.null chunk
286 then go p'
287 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
288 in do yield (B8.pack (T.unpack safe))
289 if T.null unsafe
290 then go p'
291 else return $ do yield unsafe
292 p'
293
294 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
295 returning the rest of the 'Text' upon hitting any non-latin 'Char'
296 -}
297 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
298 encodeIso8859_1 = go where
299 go p = do e <- lift (next p)
300 case e of
301 Left r -> return (return r)
302 Right (txt, p') ->
303 if T.null txt
304 then go p'
305 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
306 in do yield (B8.pack (T.unpack safe))
307 if T.null unsafe
308 then go p'
309 else return $ do yield unsafe
310 p'
311
312 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
313 unused 'ByteString' upon hitting an un-ascii byte.
314 -}
315 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
316 decodeAscii = go where
317 go p = do e <- lift (next p)
318 case e of
319 Left r -> return (return r)
320 Right (chunk, p') ->
321 if B.null chunk
322 then go p'
323 else let (safe, unsafe) = B.span (<= 0x7F) chunk
324 in do yield (T.pack (B8.unpack safe))
325 if B.null unsafe
326 then go p'
327 else return (do yield unsafe
328 p')
329
330 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
331 unused 'ByteString' upon hitting the rare un-latinizable byte.
332 -}
333 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
334 decodeIso8859_1 = go where
335 go p = do e <- lift (next p)
336 case e of
337 Left r -> return (return r)
338 Right (chunk, p') ->
339 if B.null chunk
340 then go p'
341 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
342 yield (T.pack (B8.unpack safe))
343 if B.null unsafe
344 then go p'
345 else return (do yield unsafe
346 p')
347
348
349