]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Encoding.hs
e07c47ec24d07fd5434678b4ae3eba95402da309
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
1 {-# LANGUAGE RankNTypes, BangPatterns #-}
2 -- |
3
4 -- This module uses the stream decoding functions from the text-stream-decoding package
5 -- to define decoding functions and lenses.
6
7 module Pipes.Text.Encoding
8 (
9 -- * Lens type
10 -- $lenses
11 Codec
12 -- * Standard lenses for viewing Text in ByteString
13 -- $codecs
14 , utf8
15 , utf8Pure
16 , utf16LE
17 , utf16BE
18 , utf32LE
19 , utf32BE
20 -- * Non-lens decoding functions
21 , decodeUtf8
22 , decodeUtf8Pure
23 , decodeUtf16LE
24 , decodeUtf16BE
25 , decodeUtf32LE
26 , decodeUtf32BE
27 -- * Functions for latin and ascii text
28 -- $ascii
29 , encodeAscii
30 , decodeAscii
31 , encodeIso8859_1
32 , decodeIso8859_1
33 )
34 where
35
36 import Data.Char (ord)
37 import Data.ByteString as B
38 import Data.ByteString (ByteString)
39 import Data.ByteString.Char8 as B8
40 import Data.Text (Text)
41 import qualified Data.Text as T
42 import qualified Data.Text.Encoding as TE
43 import Data.Text.StreamDecoding
44 import Control.Monad (join)
45 import Data.Word (Word8)
46 import Pipes
47
48
49 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
50
51 {- $lenses
52 The 'Codec' type is just an aliased standard Prelude type. It just specializes
53 the @Lens'@ type synonymn used by the standard lens libraries, @lens@ and
54 @lens-families@ . You use them with
55 the @view@ or @(^.)@ and @zoom@ functions from those libraries.
56
57 Each codec lens looks into a byte stream that is understood to contain text.
58 The stream of text it 'sees' in the stream of bytes begins at its head; it ends
59 by reverting to (returning) the original byte stream
60 beginning at the point of decoding failure. Where there is no decoding failure,
61 it returns an empty byte stream with its return value.
62 -}
63
64 type Codec
65 = forall m r
66 . Monad m
67 => Lens' (Producer ByteString m r)
68 (Producer Text m (Producer ByteString m r))
69
70 decodeStream :: Monad m
71 => (B.ByteString -> DecodeResult)
72 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
73 decodeStream = loop where
74 loop dec0 p =
75 do x <- lift (next p)
76 case x of Left r -> return (return r)
77 Right (chunk, p') -> case dec0 chunk of
78 DecodeResultSuccess text dec -> do yield text
79 loop dec p'
80 DecodeResultFailure text bs -> do yield text
81 return (do yield bs
82 p')
83 {-# INLINABLE decodeStream#-}
84
85
86
87
88 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
89 decodeUtf8 = decodeStream streamUtf8
90 {-# INLINE decodeUtf8 #-}
91
92 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
93 decodeUtf8Pure = decodeStream streamUtf8Pure
94 {-# INLINE decodeUtf8Pure #-}
95
96 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
97 decodeUtf16LE = decodeStream streamUtf16LE
98 {-# INLINE decodeUtf16LE #-}
99
100 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
101 decodeUtf16BE = decodeStream streamUtf16BE
102 {-# INLINE decodeUtf16BE #-}
103
104 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
105 decodeUtf32LE = decodeStream streamUtf32LE
106 {-# INLINE decodeUtf32LE #-}
107
108 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
109 decodeUtf32BE = decodeStream streamUtf32BE
110 {-# INLINE decodeUtf32BE #-}
111
112 mkCodec :: (forall r m . Monad m =>
113 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
114 -> (Text -> ByteString)
115 -> Codec
116 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
117
118
119 {- $codecs
120
121 The particular \'Codec\' lenses are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc.
122
123 > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
124 > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
125
126 @zoom@ converts a Text parser into a ByteString parser:
127
128 > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
129 >
130 > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
131 > withNextByte = do char_ <- zoom utf8 Text.drawChar
132 > byte_ <- Bytes.peekByte
133 > return (char_, byte_)
134
135 @withNextByte@ will return the first valid Char in a ByteString,
136 and the first byte of the next character, if they exists. Because
137 we \'draw\' one and \'peek\' at the other, the parser as a whole only
138 advances one Char's length along the bytestring.
139
140 -}
141
142 utf8 :: Codec
143 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
144
145 utf8Pure :: Codec
146 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
147
148 utf16LE :: Codec
149 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
150
151 utf16BE :: Codec
152 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
153
154 utf32LE :: Codec
155 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
156
157 utf32BE :: Codec
158 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
159
160
161 {- $ascii
162 ascii and latin encodings only use a small number of the characters 'Text'
163 recognizes; thus we cannot use the pipes @Lens@ style to work with them.
164 Rather we simply define functions each way.
165 -}
166
167
168 -- 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream,
169 -- returning the rest of the 'Text' at the first non-ascii 'Char'
170
171 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
172 encodeAscii = go where
173 go p = do e <- lift (next p)
174 case e of
175 Left r -> return (return r)
176 Right (chunk, p') ->
177 if T.null chunk
178 then go p'
179 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
180 in do yield (B8.pack (T.unpack safe))
181 if T.null unsafe
182 then go p'
183 else return $ do yield unsafe
184 p'
185
186 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
187 returning the rest of the 'Text' upon hitting any non-latin 'Char'
188 -}
189 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
190 encodeIso8859_1 = go where
191 go p = do e <- lift (next p)
192 case e of
193 Left r -> return (return r)
194 Right (txt, p') ->
195 if T.null txt
196 then go p'
197 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
198 in do yield (B8.pack (T.unpack safe))
199 if T.null unsafe
200 then go p'
201 else return $ do yield unsafe
202 p'
203
204 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
205 unused 'ByteString' upon hitting an un-ascii byte.
206 -}
207 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
208 decodeAscii = go where
209 go p = do e <- lift (next p)
210 case e of
211 Left r -> return (return r)
212 Right (chunk, p') ->
213 if B.null chunk
214 then go p'
215 else let (safe, unsafe) = B.span (<= 0x7F) chunk
216 in do yield (T.pack (B8.unpack safe))
217 if B.null unsafe
218 then go p'
219 else return (do yield unsafe
220 p')
221
222 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
223 unused 'ByteString' upon hitting the rare un-latinizable byte.
224 -}
225 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
226 decodeIso8859_1 = go where
227 go p = do e <- lift (next p)
228 case e of
229 Left r -> return (return r)
230 Right (chunk, p') ->
231 if B.null chunk
232 then go p'
233 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
234 yield (T.pack (B8.unpack safe))
235 if B.null unsafe
236 then go p'
237 else return (do yield unsafe
238 p')
239
240
241