diff options
Diffstat (limited to 'Pipes/Text/Encoding.hs')
-rw-r--r-- | Pipes/Text/Encoding.hs | 166 |
1 files changed, 90 insertions, 76 deletions
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs index e6fc6bf..4311ad1 100644 --- a/Pipes/Text/Encoding.hs +++ b/Pipes/Text/Encoding.hs | |||
@@ -2,16 +2,18 @@ | |||
2 | 2 | ||
3 | -- | This module uses the stream decoding functions from Michael Snoyman's new | 3 | -- | This module uses the stream decoding functions from Michael Snoyman's new |
4 | -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode> | 4 | -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode> |
5 | -- package to define decoding functions and lenses. | 5 | -- package to define decoding functions and lenses. The exported names |
6 | -- conflict with names in @Data.Text.Encoding@ but the module can otherwise be | ||
7 | -- imported unqualified. | ||
6 | 8 | ||
7 | module Pipes.Text.Encoding | 9 | module Pipes.Text.Encoding |
8 | ( | 10 | ( |
9 | -- * The Lens or Codec type | 11 | -- * The Lens or Codec type |
10 | -- $lenses | 12 | -- $lenses |
11 | Codec | 13 | Codec |
14 | , decode | ||
12 | -- * Viewing the Text in a ByteString | 15 | -- * Viewing the Text in a ByteString |
13 | -- $codecs | 16 | -- $codecs |
14 | , decode | ||
15 | , utf8 | 17 | , utf8 |
16 | , utf8Pure | 18 | , utf8Pure |
17 | , utf16LE | 19 | , utf16LE |
@@ -66,8 +68,9 @@ type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) | |||
66 | 68 | ||
67 | > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) | 69 | > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
68 | 70 | ||
69 | is just an alias for an ordinary Prelude type. Thus you use any codec with | 71 | is just an alias for a Prelude type. Thus you use any particular codec with |
70 | the @view@ / @(^.)@ and @zoom@ functions from those libraries. | 72 | the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries; |
73 | we presuppose neither since we already have access to the types they require. | ||
71 | 74 | ||
72 | -} | 75 | -} |
73 | 76 | ||
@@ -76,11 +79,12 @@ type Codec | |||
76 | . Monad m | 79 | . Monad m |
77 | => Lens' (Producer ByteString m r) | 80 | => Lens' (Producer ByteString m r) |
78 | (Producer Text m (Producer ByteString m r)) | 81 | (Producer Text m (Producer ByteString m r)) |
79 | 82 | ||
80 | {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries; | 83 | {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries; |
81 | exported here for convience | 84 | exported here under a name appropriate to the material. All of these are |
85 | the same: | ||
82 | 86 | ||
83 | > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf | 87 | > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8 |
84 | 88 | ||
85 | -} | 89 | -} |
86 | 90 | ||
@@ -88,6 +92,85 @@ decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b | |||
88 | decode codec a = getConstant (codec Constant a) | 92 | decode codec a = getConstant (codec Constant a) |
89 | 93 | ||
90 | 94 | ||
95 | {- $codecs | ||
96 | |||
97 | Each Codec-lens looks into a byte stream that is supposed to contain text. | ||
98 | The particular \'Codec\' lenses are named in accordance with the expected | ||
99 | encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function, | ||
100 | use @view@ / @(^.)@ -- here also called 'decode': | ||
101 | |||
102 | > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
103 | > decode utf8 Bytes.stdin :: Producer Text IO (Producer ByteString IO r) | ||
104 | > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) | ||
105 | |||
106 | Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized | ||
107 | decoding functions exported here, e.g. | ||
108 | |||
109 | > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
110 | > decodeUtf8 Bytes.stdin :: Producer Text IO (Producer ByteString IO r) | ||
111 | |||
112 | The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head. | ||
113 | At any point of decoding failure, the stream of text ends and reverts to (returns) | ||
114 | the original byte stream. Thus if the first bytes are already | ||
115 | un-decodable, the whole ByteString producer will be returned, i.e. | ||
116 | |||
117 | > view utf8 bytestream | ||
118 | |||
119 | will just come to the same as | ||
120 | |||
121 | > return bytestream | ||
122 | |||
123 | Where there is no decoding failure, the return value of the text stream will be | ||
124 | an empty byte stream followed by its own return value. In all cases you must | ||
125 | deal with the fact that it is a /ByteString producer/ that is returned, even if | ||
126 | it can be thrown away with @Control.Monad.void@: | ||
127 | |||
128 | > void (Bytes.stdin ^. utf8) :: Producer Text IO () | ||
129 | |||
130 | @zoom@ converts a Text parser into a ByteString parser: | ||
131 | |||
132 | > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) | ||
133 | |||
134 | or, with the type synonym of @Pipes.Parse@: | ||
135 | |||
136 | > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char) | ||
137 | |||
138 | Thus we can define a ByteString parser like this: | ||
139 | |||
140 | > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8) | ||
141 | > withNextByte = do char_ <- zoom utf8 Text.drawChar | ||
142 | > byte_ <- Bytes.peekByte | ||
143 | > return (char_, byte_) | ||
144 | |||
145 | Though @withNextByte@ is partly defined with a Text parser, 'drawChar', | ||
146 | it is itself a ByteString parser; it will return the first valid utf8-encoded | ||
147 | Char in a ByteString, whatever its length, | ||
148 | and the first byte of the next character, if they exist. Because | ||
149 | we \'draw\' one and \'peek\' at the other, the parser as a whole only | ||
150 | advances one Char's length along the bytestring, whatever that length may be. | ||
151 | See the slightly more complex example \'decode.hs\' in the | ||
152 | <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall> | ||
153 | discussion of this type of byte stream parsing. | ||
154 | -} | ||
155 | |||
156 | utf8 :: Codec | ||
157 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 | ||
158 | |||
159 | utf8Pure :: Codec | ||
160 | utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 | ||
161 | |||
162 | utf16LE :: Codec | ||
163 | utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE | ||
164 | |||
165 | utf16BE :: Codec | ||
166 | utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE | ||
167 | |||
168 | utf32LE :: Codec | ||
169 | utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE | ||
170 | |||
171 | utf32BE :: Codec | ||
172 | utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE | ||
173 | |||
91 | decodeStream :: Monad m | 174 | decodeStream :: Monad m |
92 | => (B.ByteString -> DecodeResult) | 175 | => (B.ByteString -> DecodeResult) |
93 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 176 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
@@ -178,75 +261,6 @@ mkCodec :: (forall r m . Monad m => | |||
178 | mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) | 261 | mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) |
179 | 262 | ||
180 | 263 | ||
181 | {- $codecs | ||
182 | |||
183 | Each codec/lens looks into a byte stream that is supposed to contain text. | ||
184 | The particular \'Codec\' lenses are named in accordance with the expected | ||
185 | encoding, 'utf8', 'utf16LE' etc. @view@ / @(^.)@ -- here also called 'decode' -- | ||
186 | turns a Codec into a function: | ||
187 | |||
188 | > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
189 | > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) | ||
190 | > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) | ||
191 | |||
192 | Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized | ||
193 | decoding functions exported here, e.g. | ||
194 | |||
195 | > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
196 | > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) | ||
197 | |||
198 | The stream of text a @Codec@ \'sees\' in the stream of bytes begins at its head. | ||
199 | At any point of decoding failure, the stream of text ends and reverts to (returns) | ||
200 | the original byte stream. Thus if the first bytes are already | ||
201 | un-decodable, the whole ByteString producer will be returned, i.e. | ||
202 | |||
203 | > view utf8 bytestream | ||
204 | |||
205 | will just come to the same as | ||
206 | |||
207 | > return bytestream | ||
208 | |||
209 | Where there is no decoding failure, the return value of the text stream will be | ||
210 | an empty byte stream followed by its own return value. In all cases you must | ||
211 | deal with the fact that it is a ByteString producer that is returned, even if | ||
212 | it can be thrown away with @Control.Monad.void@ | ||
213 | |||
214 | > void (Bytes.stdin ^. utf8) :: Producer Text IO () | ||
215 | |||
216 | @zoom@ converts a Text parser into a ByteString parser: | ||
217 | |||
218 | > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) | ||
219 | > | ||
220 | > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8))) | ||
221 | > withNextByte = do char_ <- zoom utf8 Text.drawChar | ||
222 | > byte_ <- Bytes.peekByte | ||
223 | > return (char_, byte_) | ||
224 | |||
225 | @withNextByte@ will return the first valid Char in a ByteString, | ||
226 | and the first byte of the next character, if they exists. Because | ||
227 | we \'draw\' one and \'peek\' at the other, the parser as a whole only | ||
228 | advances one Char's length along the bytestring. | ||
229 | |||
230 | -} | ||
231 | |||
232 | utf8 :: Codec | ||
233 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 | ||
234 | |||
235 | utf8Pure :: Codec | ||
236 | utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 | ||
237 | |||
238 | utf16LE :: Codec | ||
239 | utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE | ||
240 | |||
241 | utf16BE :: Codec | ||
242 | utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE | ||
243 | |||
244 | utf32LE :: Codec | ||
245 | utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE | ||
246 | |||
247 | utf32BE :: Codec | ||
248 | utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE | ||
249 | |||
250 | 264 | ||
251 | {- $ascii | 265 | {- $ascii |
252 | ascii and latin encodings only use a small number of the characters 'Text' | 266 | ascii and latin encodings only use a small number of the characters 'Text' |