]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Encoding.hs
started generalizing lenses
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
1 {-# LANGUAGE RankNTypes, BangPatterns #-}
2
3 -- | This module uses the stream decoding functions from Michael Snoyman's new
4 -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
5 -- package to define decoding functions and lenses. The exported names
6 -- conflict with names in @Data.Text.Encoding@ but the module can otherwise be
7 -- imported unqualified.
8
9 module Pipes.Text.Encoding
10 (
11 -- * The Lens or Codec type
12 -- $lenses
13 Codec
14 , decode
15 -- * \'Viewing\' the Text in a byte stream
16 -- $codecs
17 , utf8
18 , utf8Pure
19 , utf16LE
20 , utf16BE
21 , utf32LE
22 , utf32BE
23 -- * Non-lens decoding functions
24 -- $decoders
25 , decodeUtf8
26 , decodeUtf8Pure
27 , decodeUtf16LE
28 , decodeUtf16BE
29 , decodeUtf32LE
30 , decodeUtf32BE
31 -- * Re-encoding functions
32 -- $encoders
33 , encodeUtf8
34 , encodeUtf16LE
35 , encodeUtf16BE
36 , encodeUtf32LE
37 , encodeUtf32BE
38 -- * Functions for latin and ascii text
39 -- $ascii
40 , encodeAscii
41 , decodeAscii
42 , encodeIso8859_1
43 , decodeIso8859_1
44 , Lens'_
45 , Iso'_
46 )
47 where
48
49 import Data.Functor.Constant (Constant(..))
50 import Data.Profunctor (Profunctor)
51 import Data.Char (ord)
52 import Data.ByteString as B
53 import Data.ByteString (ByteString)
54 import Data.ByteString.Char8 as B8
55 import Data.Text (Text)
56 import qualified Data.Text as T
57 import qualified Data.Text.Encoding as TE
58 import qualified Data.Streaming.Text as Stream
59 import Data.Streaming.Text (DecodeResult(..))
60 import Control.Monad (join)
61 import Data.Word (Word8)
62 import Pipes
63
64 type Lens'_ a b = forall f . Functor f => (b -> f b) -> (a -> f a)
65 type Iso'_ a b = forall f p . (Functor f, Profunctor p) => p b (f b) -> p a (f a)
66
67 type Lens s t a b = forall f . Functor f => (a -> f b) -> (s -> f t)
68
69
70 {- $lenses
71 The 'Codec' type is a simple specializion of
72 the @Lens'_@ type synonymn used by the standard lens libraries,
73 <http://hackage.haskell.org/package/lens lens> and
74 <http://hackage.haskell.org/package/lens-family lens-family>. That type,
75
76 > type Lens'_ a b = forall f . Functor f => (b -> f b) -> (a -> f a)
77
78 is just an alias for a Prelude type. Thus you use any particular codec with
79 the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
80 we presuppose neither since we already have access to the types they require.
81
82 -}
83
84 type Codec
85 = forall m x y
86 . Monad m
87 => Lens (Producer ByteString m x)
88 (Producer ByteString m y)
89 (Producer Text m (Producer ByteString m x))
90 (Producer Text m (Producer ByteString m y))
91 {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
92 exported here under a name appropriate to the material. All of these are
93 the same:
94
95 > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
96
97 -}
98
99 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
100 decode codec a = getConstant (codec Constant a)
101
102
103 {- $codecs
104
105 Each Codec-lens looks into a byte stream that is supposed to contain text.
106 The particular \'Codec\' lenses are named in accordance with the expected
107 encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
108 use @view@ / @(^.)@ -- here also called 'decode':
109
110 > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
111 > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
112 > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
113
114 Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
115 decoding functions exported here, e.g.
116
117 > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
118 > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
119
120 The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
121 At any point of decoding failure, the stream of text ends and reverts to (returns)
122 the original byte stream. Thus if the first bytes are already
123 un-decodable, the whole ByteString producer will be returned, i.e.
124
125 > view utf8 bytestream
126
127 will just come to the same as
128
129 > return bytestream
130
131 Where there is no decoding failure, the return value of the text stream will be
132 an empty byte stream followed by its own return value. In all cases you must
133 deal with the fact that it is a /ByteString producer/ that is returned, even if
134 it can be thrown away with @Control.Monad.void@
135
136 > void (Bytes.stdin ^. utf8) :: Producer Text IO ()
137
138 @zoom@ converts a Text parser into a ByteString parser:
139
140 > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
141
142 or, using the type synonymn from @Pipes.Parse@:
143
144 > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
145
146 Thus we can define a ByteString parser like this:
147
148 > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
149 > withNextByte = do char_ <- zoom utf8 Text.drawChar
150 > byte_ <- Bytes.peekByte
151 > return (char_, byte_)
152
153 Though @withNextByte@ is partly defined with a Text parser 'drawChar';
154 but it is a ByteString parser; it will return the first valid utf8-encoded
155 Char in a ByteString, whatever its length,
156 and the first byte of the next character, if they exist. Because
157 we \'draw\' one and \'peek\' at the other, the parser as a whole only
158 advances one Char's length along the bytestring, whatever that length may be.
159 See the slightly more complex example \'decode.hs\' in the
160 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
161 discussion of this type of byte stream parsing.
162 -}
163
164 utf8 :: Codec
165 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
166
167 utf8Pure :: Codec
168 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
169
170 utf16LE :: Codec
171 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
172
173 utf16BE :: Codec
174 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
175
176 utf32LE :: Codec
177 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
178
179 utf32BE :: Codec
180 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
181
182 decodeStream :: Monad m
183 => (B.ByteString -> DecodeResult)
184 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
185 decodeStream = loop where
186 loop dec0 p =
187 do x <- lift (next p)
188 case x of Left r -> return (return r)
189 Right (chunk, p') -> case dec0 chunk of
190 DecodeResultSuccess text dec -> do yield text
191 loop dec p'
192 DecodeResultFailure text bs -> do yield text
193 return (do yield bs
194 p')
195 {-# INLINABLE decodeStream#-}
196
197 {- $decoders
198 These are functions with the simple type:
199
200 > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
201
202 Thus in general
203
204 > decodeUtf8 = view utf8
205 > decodeUtf16LE = view utf16LE
206
207 and so forth, but these forms
208 may be more convenient (and give better type errors!) where lenses are
209 not desired.
210 -}
211
212
213 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
214 decodeUtf8 = decodeStream Stream.decodeUtf8
215 {-# INLINE decodeUtf8 #-}
216
217 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
218 decodeUtf8Pure = decodeStream Stream.decodeUtf8Pure
219 {-# INLINE decodeUtf8Pure #-}
220
221 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
222 decodeUtf16LE = decodeStream Stream.decodeUtf16LE
223 {-# INLINE decodeUtf16LE #-}
224
225 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
226 decodeUtf16BE = decodeStream Stream.decodeUtf16BE
227 {-# INLINE decodeUtf16BE #-}
228
229 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
230 decodeUtf32LE = decodeStream Stream.decodeUtf32LE
231 {-# INLINE decodeUtf32LE #-}
232
233 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
234 decodeUtf32BE = decodeStream Stream.decodeUtf32BE
235 {-# INLINE decodeUtf32BE #-}
236
237
238 {- $encoders
239 These are simply defined
240
241 > encodeUtf8 = yield . TE.encodeUtf8
242
243 They are intended for use with 'for'
244
245 > for Text.stdin encodeUtf8 :: Producer ByteString IO ()
246
247 which would have the effect of
248
249 > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
250
251 using the encoding functions from Data.Text.Encoding
252 -}
253
254 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
255 encodeUtf8 = yield . TE.encodeUtf8
256 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
257 encodeUtf16LE = yield . TE.encodeUtf16LE
258 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
259 encodeUtf16BE = yield . TE.encodeUtf16BE
260 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
261 encodeUtf32LE = yield . TE.encodeUtf32LE
262 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
263 encodeUtf32BE = yield . TE.encodeUtf32BE
264
265 mkCodec :: (forall r m . Monad m =>
266 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
267 -> (Text -> ByteString)
268 -> Codec
269 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
270
271
272
273 {- $ascii
274 ascii and latin encodings only use a small number of the characters 'Text'
275 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
276 Rather we simply define functions each way.
277 -}
278
279
280 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
281 -- returning the rest of the 'Text' at the first non-ascii 'Char'
282
283 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
284 encodeAscii = go where
285 go p = do e <- lift (next p)
286 case e of
287 Left r -> return (return r)
288 Right (chunk, p') ->
289 if T.null chunk
290 then go p'
291 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
292 in do yield (B8.pack (T.unpack safe))
293 if T.null unsafe
294 then go p'
295 else return $ do yield unsafe
296 p'
297
298 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
299 returning the rest of the 'Text' upon hitting any non-latin 'Char'
300 -}
301 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
302 encodeIso8859_1 = go where
303 go p = do e <- lift (next p)
304 case e of
305 Left r -> return (return r)
306 Right (txt, p') ->
307 if T.null txt
308 then go p'
309 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
310 in do yield (B8.pack (T.unpack safe))
311 if T.null unsafe
312 then go p'
313 else return $ do yield unsafe
314 p'
315
316 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
317 unused 'ByteString' upon hitting an un-ascii byte.
318 -}
319 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
320 decodeAscii = go where
321 go p = do e <- lift (next p)
322 case e of
323 Left r -> return (return r)
324 Right (chunk, p') ->
325 if B.null chunk
326 then go p'
327 else let (safe, unsafe) = B.span (<= 0x7F) chunk
328 in do yield (T.pack (B8.unpack safe))
329 if B.null unsafe
330 then go p'
331 else return (do yield unsafe
332 p')
333
334 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
335 unused 'ByteString' upon hitting the rare un-latinizable byte.
336 -}
337 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
338 decodeIso8859_1 = go where
339 go p = do e <- lift (next p)
340 case e of
341 Left r -> return (return r)
342 Right (chunk, p') ->
343 if B.null chunk
344 then go p'
345 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
346 yield (T.pack (B8.unpack safe))
347 if B.null unsafe
348 then go p'
349 else return (do yield unsafe
350 p')
351
352
353