]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Encoding.hs
37a4aa3f8ad6dfe470110db1abc31d61fd75d8e9
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
1 {-# LANGUAGE RankNTypes, BangPatterns #-}
2
3 -- | This module uses the stream decoding functions from Michael Snoyman's new
4 -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
5 -- package to define decoding functions and lenses. The exported names
6 -- conflict with names in @Data.Text.Encoding@ but the module can otherwise be
7 -- imported unqualified.
8
9 module Pipes.Text.Encoding
10 (
11 -- * The Lens or Codec type
12 -- $lenses
13 Codec
14 , decode
15 -- * \'Viewing\' the Text in a byte stream
16 -- $codecs
17 , utf8
18 , utf8Pure
19 , utf16LE
20 , utf16BE
21 , utf32LE
22 , utf32BE
23 -- * Non-lens decoding functions
24 -- $decoders
25 , decodeUtf8
26 , decodeUtf8Pure
27 , decodeUtf16LE
28 , decodeUtf16BE
29 , decodeUtf32LE
30 , decodeUtf32BE
31 -- * Re-encoding functions
32 -- $encoders
33 , encodeUtf8
34 , encodeUtf16LE
35 , encodeUtf16BE
36 , encodeUtf32LE
37 , encodeUtf32BE
38 -- * Functions for latin and ascii text
39 -- $ascii
40 , encodeAscii
41 , decodeAscii
42 , encodeIso8859_1
43 , decodeIso8859_1
44 )
45 where
46
47 import Data.Functor.Constant (Constant(..))
48 import Data.Char (ord)
49 import Data.ByteString as B
50 import Data.ByteString (ByteString)
51 import Data.ByteString.Char8 as B8
52 import Data.Text (Text)
53 import qualified Data.Text as T
54 import qualified Data.Text.Encoding as TE
55 import Data.Text.StreamDecoding
56 import Control.Monad (join)
57 import Data.Word (Word8)
58 import Pipes
59
60 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
61
62 {- $lenses
63 The 'Codec' type is a simple specializion of
64 the @Lens'@ type synonymn used by the standard lens libraries,
65 <http://hackage.haskell.org/package/lens lens> and
66 <http://hackage.haskell.org/package/lens-family lens-family>. That type,
67
68 > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
69
70 is just an alias for a Prelude type. Thus you use any particular codec with
71 the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
72 we presuppose neither since we already have access to the types they require.
73
74 -}
75
76 type Codec
77 = forall m r
78 . Monad m
79 => Lens' (Producer ByteString m r)
80 (Producer Text m (Producer ByteString m r))
81
82 {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
83 exported here under a name appropriate to the material. All of these are
84 the same:
85
86 > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
87
88 -}
89
90 decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
91 decode codec a = getConstant (codec Constant a)
92
93
94 {- $codecs
95
96 Each Codec-lens looks into a byte stream that is supposed to contain text.
97 The particular \'Codec\' lenses are named in accordance with the expected
98 encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
99 use @view@ / @(^.)@ -- here also called 'decode':
100
101 > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
102 > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
103 > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
104
105 Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
106 decoding functions exported here, e.g.
107
108 > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
109 > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
110
111 The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
112 At any point of decoding failure, the stream of text ends and reverts to (returns)
113 the original byte stream. Thus if the first bytes are already
114 un-decodable, the whole ByteString producer will be returned, i.e.
115
116 > view utf8 bytestream
117
118 will just come to the same as
119
120 > return bytestream
121
122 Where there is no decoding failure, the return value of the text stream will be
123 an empty byte stream followed by its own return value. In all cases you must
124 deal with the fact that it is a /ByteString producer/ that is returned, even if
125 it can be thrown away with @Control.Monad.void@
126
127 > void (Bytes.stdin ^. utf8) :: Producer Text IO ()
128
129 @zoom@ converts a Text parser into a ByteString parser:
130
131 > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
132
133 or, using the type synonymn from @Pipes.Parse@:
134
135 > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
136
137 Thus we can define a ByteString parser like this:
138
139 > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
140 > withNextByte = do char_ <- zoom utf8 Text.drawChar
141 > byte_ <- Bytes.peekByte
142 > return (char_, byte_)
143
144 Though @withNextByte@ is partly defined with a Text parser 'drawChar';
145 but it is a ByteString parser; it will return the first valid utf8-encoded
146 Char in a ByteString, whatever its length,
147 and the first byte of the next character, if they exist. Because
148 we \'draw\' one and \'peek\' at the other, the parser as a whole only
149 advances one Char's length along the bytestring, whatever that length may be.
150 See the slightly more complex example \'decode.hs\' in the
151 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
152 discussion of this type of byte stream parsing.
153 -}
154
155 utf8 :: Codec
156 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
157
158 utf8Pure :: Codec
159 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
160
161 utf16LE :: Codec
162 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
163
164 utf16BE :: Codec
165 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
166
167 utf32LE :: Codec
168 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
169
170 utf32BE :: Codec
171 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
172
173 decodeStream :: Monad m
174 => (B.ByteString -> DecodeResult)
175 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
176 decodeStream = loop where
177 loop dec0 p =
178 do x <- lift (next p)
179 case x of Left r -> return (return r)
180 Right (chunk, p') -> case dec0 chunk of
181 DecodeResultSuccess text dec -> do yield text
182 loop dec p'
183 DecodeResultFailure text bs -> do yield text
184 return (do yield bs
185 p')
186 {-# INLINABLE decodeStream#-}
187
188 {- $decoders
189 These are functions with the simple type:
190
191 > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
192
193 Thus in general
194
195 > decodeUtf8 = view utf8
196 > decodeUtf16LE = view utf16LE
197
198 and so forth, but these forms
199 may be more convenient (and give better type errors!) where lenses are
200 not desired.
201 -}
202
203
204 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
205 decodeUtf8 = decodeStream streamUtf8
206 {-# INLINE decodeUtf8 #-}
207
208 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
209 decodeUtf8Pure = decodeStream streamUtf8Pure
210 {-# INLINE decodeUtf8Pure #-}
211
212 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
213 decodeUtf16LE = decodeStream streamUtf16LE
214 {-# INLINE decodeUtf16LE #-}
215
216 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
217 decodeUtf16BE = decodeStream streamUtf16BE
218 {-# INLINE decodeUtf16BE #-}
219
220 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
221 decodeUtf32LE = decodeStream streamUtf32LE
222 {-# INLINE decodeUtf32LE #-}
223
224 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
225 decodeUtf32BE = decodeStream streamUtf32BE
226 {-# INLINE decodeUtf32BE #-}
227
228
229 {- $encoders
230 These are simply defined
231
232 > encodeUtf8 = yield . TE.encodeUtf8
233
234 They are intended for use with 'for'
235
236 > for Text.stdin encodeUtf8 :: Producer ByteString IO ()
237
238 which would have the effect of
239
240 > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8)
241
242 using the encoding functions from Data.Text.Encoding
243 -}
244
245 encodeUtf8 :: Monad m => Text -> Producer' ByteString m ()
246 encodeUtf8 = yield . TE.encodeUtf8
247 encodeUtf16LE :: Monad m => Text -> Producer' ByteString m ()
248 encodeUtf16LE = yield . TE.encodeUtf16LE
249 encodeUtf16BE :: Monad m => Text -> Producer' ByteString m ()
250 encodeUtf16BE = yield . TE.encodeUtf16BE
251 encodeUtf32LE :: Monad m => Text -> Producer' ByteString m ()
252 encodeUtf32LE = yield . TE.encodeUtf32LE
253 encodeUtf32BE :: Monad m => Text -> Producer' ByteString m ()
254 encodeUtf32BE = yield . TE.encodeUtf32BE
255
256 mkCodec :: (forall r m . Monad m =>
257 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
258 -> (Text -> ByteString)
259 -> Codec
260 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
261
262
263
264 {- $ascii
265 ascii and latin encodings only use a small number of the characters 'Text'
266 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
267 Rather we simply define functions each way.
268 -}
269
270
271 -- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
272 -- returning the rest of the 'Text' at the first non-ascii 'Char'
273
274 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
275 encodeAscii = go where
276 go p = do e <- lift (next p)
277 case e of
278 Left r -> return (return r)
279 Right (chunk, p') ->
280 if T.null chunk
281 then go p'
282 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
283 in do yield (B8.pack (T.unpack safe))
284 if T.null unsafe
285 then go p'
286 else return $ do yield unsafe
287 p'
288
289 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
290 returning the rest of the 'Text' upon hitting any non-latin 'Char'
291 -}
292 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
293 encodeIso8859_1 = go where
294 go p = do e <- lift (next p)
295 case e of
296 Left r -> return (return r)
297 Right (txt, p') ->
298 if T.null txt
299 then go p'
300 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
301 in do yield (B8.pack (T.unpack safe))
302 if T.null unsafe
303 then go p'
304 else return $ do yield unsafe
305 p'
306
307 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
308 unused 'ByteString' upon hitting an un-ascii byte.
309 -}
310 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
311 decodeAscii = go where
312 go p = do e <- lift (next p)
313 case e of
314 Left r -> return (return r)
315 Right (chunk, p') ->
316 if B.null chunk
317 then go p'
318 else let (safe, unsafe) = B.span (<= 0x7F) chunk
319 in do yield (T.pack (B8.unpack safe))
320 if B.null unsafe
321 then go p'
322 else return (do yield unsafe
323 p')
324
325 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
326 unused 'ByteString' upon hitting the rare un-latinizable byte.
327 -}
328 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
329 decodeIso8859_1 = go where
330 go p = do e <- lift (next p)
331 case e of
332 Left r -> return (return r)
333 Right (chunk, p') ->
334 if B.null chunk
335 then go p'
336 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
337 yield (T.pack (B8.unpack safe))
338 if B.null unsafe
339 then go p'
340 else return (do yield unsafe
341 p')
342
343
344