diff options
Diffstat (limited to 'Pipes/Text')
-rw-r--r-- | Pipes/Text/Encoding.hs | 203 |
1 files changed, 132 insertions, 71 deletions
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs index 97a9c23..b6aa709 100644 --- a/Pipes/Text/Encoding.hs +++ b/Pipes/Text/Encoding.hs | |||
@@ -1,19 +1,25 @@ | |||
1 | {-# LANGUAGE RankNTypes, BangPatterns #-} | 1 | {-# LANGUAGE RankNTypes, BangPatterns #-} |
2 | 2 | ||
3 | -- | This module uses the stream decoding functions from | 3 | -- | This module uses the stream decoding functions from |
4 | -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode> | 4 | -- <http://hackage.haskell.org/package/streaming-commons streaming-commons> |
5 | -- package to define decoding functions and lenses. The exported names | 5 | -- package to define decoding functions and lenses. The exported names |
6 | -- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@ | 6 | -- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@ |
7 | 7 | ||
8 | module Pipes.Text.Encoding | 8 | module Pipes.Text.Encoding |
9 | ( | 9 | ( |
10 | -- * The Lens or Codec type | 10 | -- * Decoding ByteStrings and Encoding Texts |
11 | -- ** Simple usage | ||
12 | -- $usage | ||
13 | |||
14 | -- ** Lens usage | ||
11 | -- $lenses | 15 | -- $lenses |
16 | |||
17 | |||
18 | -- * Basic lens operations | ||
12 | Codec | 19 | Codec |
13 | , decode | 20 | , decode |
14 | , eof | 21 | , eof |
15 | -- * \'Viewing\' the Text in a byte stream | 22 | -- * Decoding lenses |
16 | -- $codecs | ||
17 | , utf8 | 23 | , utf8 |
18 | , utf8Pure | 24 | , utf8Pure |
19 | , utf16LE | 25 | , utf16LE |
@@ -58,99 +64,83 @@ import Control.Monad (join, liftM) | |||
58 | import Data.Word (Word8) | 64 | import Data.Word (Word8) |
59 | import Pipes | 65 | import Pipes |
60 | 66 | ||
61 | type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) | ||
62 | 67 | ||
63 | {- $lenses | ||
64 | The 'Codec' type is a simple specializion of | ||
65 | the @Lens'@ type synonymn used by the standard lens libraries, | ||
66 | <http://hackage.haskell.org/package/lens lens> and | ||
67 | <http://hackage.haskell.org/package/lens-family lens-family>. That type, | ||
68 | |||
69 | > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) | ||
70 | 68 | ||
71 | is just an alias for a Prelude type. Thus you use any particular codec with | 69 | {- $usage |
72 | the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries; | 70 | Given |
73 | we presuppose neither library since we already have access to the types they require. | ||
74 | -} | ||
75 | 71 | ||
76 | type Codec | 72 | > text :: Producer Text IO () |
77 | = forall m r | ||
78 | . Monad m | ||
79 | => Lens' (Producer ByteString m r) | ||
80 | (Producer Text m (Producer ByteString m r)) | ||
81 | 73 | ||
82 | {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries; | 74 | we can encode it with @Data.Text.Encoding@ and ordinary pipe operations: |
83 | exported here under a name appropriate to the material. All of these are | ||
84 | the same: | ||
85 | 75 | ||
86 | > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8 | 76 | > text >-> P.map TE.encodeUtf8 :: Producer.ByteString IO () |
87 | 77 | ||
88 | -} | 78 | or, using this module, with |
89 | 79 | ||
80 | > for text encodeUtf8 :: Producer.ByteString IO () | ||
90 | 81 | ||
91 | decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b | 82 | Given |
92 | decode codec a = getConstant (codec Constant a) | 83 | |
84 | > bytes :: Producer ByteString Text IO () | ||
93 | 85 | ||
94 | {- | 'eof' tells you explicitly when decoding stops due to bad bytes or instead | 86 | we can apply a decoding function from this module: |
95 | reaches end-of-file happily. (Without it one just makes an explicit test | ||
96 | for emptiness of the resulting bytestring production using 'next') | ||
97 | Thus | ||
98 | 87 | ||
99 | > decode (utf8 . eof) p = view (utf8 . eof) p = p^.utf8.eof | 88 | > decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ()) |
100 | 89 | ||
101 | will be a text producer. If we hit undecodable bytes, the remaining | 90 | The Text producer ends wherever decoding first fails. Thus we can re-encode |
102 | bytestring producer will be returned as a 'Left' value; | 91 | as uft8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g. |
103 | in the happy case, a 'Right' value is returned with the anticipated | 92 | |
104 | return value for the original bytestring producer. | 93 | > for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ()) |
105 | ) | 94 | |
95 | The bytestring producer that is returned begins with where utf16BE decoding | ||
96 | failed; it it didn't fail the producer is empty. | ||
106 | 97 | ||
107 | -} | 98 | -} |
108 | 99 | ||
109 | eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r)) | 100 | {- $lenses |
110 | (Producer Text m (Either (Producer ByteString m r) r)) | 101 | We get a bit more flexibility, though, if we use a lens like @utf8@ or @utf16BE@ |
111 | eof k p = fmap fromEither (k (toEither p)) where | 102 | that looks for text in an appropriately encoded byte stream. |
112 | 103 | ||
113 | fromEither = liftM (either id return) | 104 | > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
114 | 105 | ||
115 | toEither pp = do p <- pp | 106 | is just an alias for a Prelude type. We abbreviate this further, for our use case, as |
116 | check p | ||
117 | 107 | ||
118 | check p = do e <- lift (next p) | 108 | > type Codec |
119 | case e of | 109 | > = forall m r . Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) |
120 | Left r -> return (Right r) | ||
121 | Right (bs,pb) -> if B.null bs | ||
122 | then check pb | ||
123 | else return (Left (do yield bs | ||
124 | pb)) | ||
125 | 110 | ||
111 | and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can | ||
112 | re-encode what they have decoded. Thus you use any particular codec with | ||
113 | the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries; | ||
114 | we presuppose neither <http://hackage.haskell.org/package/lens lens> | ||
115 | nor <http://hackage.haskell.org/package/lens-family lens-family> | ||
116 | since we already have access to the types they require. | ||
126 | 117 | ||
127 | {- $codecs | 118 | Each decoding lens looks into a byte stream that is supposed to contain text. |
128 | 119 | The particular lenses are named in accordance with the expected | |
129 | Each Codec-lens looks into a byte stream that is supposed to contain text. | 120 | encoding, 'utf8', 'utf16LE' etc. To turn a such a lens or @Codec@ |
130 | The particular \'Codec\' lenses are named in accordance with the expected | 121 | into an ordinary function, use @view@ / @(^.)@ -- here also called 'decode': |
131 | encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function, | ||
132 | use @view@ / @(^.)@ -- here also called 'decode': | ||
133 | 122 | ||
134 | > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 123 | > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
135 | > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) | 124 | > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) |
136 | > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) | 125 | > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) |
137 | 126 | ||
138 | Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized | 127 | These simple uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by |
139 | decoding functions exported here, e.g. | 128 | the specialized decoding functions exported here, e.g. |
140 | 129 | ||
141 | > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 130 | > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
142 | > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) | 131 | > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) |
143 | 132 | ||
144 | The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head. | 133 | As with these functions, the stream of text that a @Codec@ \'sees\' |
134 | in the stream of bytes begins at its head. | ||
145 | At any point of decoding failure, the stream of text ends and reverts to (returns) | 135 | At any point of decoding failure, the stream of text ends and reverts to (returns) |
146 | the original byte stream. Thus if the first bytes are already | 136 | the original byte stream. Thus if the first bytes are already |
147 | un-decodable, the whole ByteString producer will be returned, i.e. | 137 | un-decodable, the whole ByteString producer will be returned, i.e. |
148 | 138 | ||
149 | > view utf8 bytestream | 139 | > view utf8 bad_bytestream |
150 | 140 | ||
151 | will just come to the same as | 141 | will just come to the same as |
152 | 142 | ||
153 | > return bytestream | 143 | > return bad_bytestream |
154 | 144 | ||
155 | Where there is no decoding failure, the return value of the text stream will be | 145 | Where there is no decoding failure, the return value of the text stream will be |
156 | an empty byte stream followed by its own return value. In all cases you must | 146 | an empty byte stream followed by its own return value. In all cases you must |
@@ -158,7 +148,21 @@ eof k p = fmap fromEither (k (toEither p)) where | |||
158 | it can be thrown away with @Control.Monad.void@ | 148 | it can be thrown away with @Control.Monad.void@ |
159 | 149 | ||
160 | > void (Bytes.stdin ^. utf8) :: Producer Text IO () | 150 | > void (Bytes.stdin ^. utf8) :: Producer Text IO () |
151 | |||
152 | The @eof@ lens permits you to pattern match: if there is a Right value, | ||
153 | it is the leftover bytestring producer, if there is a Right value, it | ||
154 | is the return value of the original bytestring producer: | ||
155 | |||
156 | > Bytes.stdin ^. utf8 . eof :: Producer Text IO (Either (Producer ByteString IO IO) ()) | ||
161 | 157 | ||
158 | Thus for the stream of un-decodable bytes mentioned above, | ||
159 | |||
160 | > view (utf8 . eof) bad_bytestream | ||
161 | |||
162 | will be the same as | ||
163 | |||
164 | > return (Left bad_bytestream) | ||
165 | |||
162 | @zoom@ converts a Text parser into a ByteString parser: | 166 | @zoom@ converts a Text parser into a ByteString parser: |
163 | 167 | ||
164 | > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) | 168 | > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) |
@@ -167,24 +171,81 @@ eof k p = fmap fromEither (k (toEither p)) where | |||
167 | 171 | ||
168 | > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char) | 172 | > zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char) |
169 | 173 | ||
170 | Thus we can define a ByteString parser like this: | 174 | Thus we can define a ByteString parser (in the pipes-parse sense) like this: |
171 | 175 | ||
172 | > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8))) | 176 | > charPlusByte :: Parser ByteString m (Maybe Char, Maybe Word8))) |
173 | > withNextByte = do char_ <- zoom utf8 Text.drawChar | 177 | > charPlusByte = do char_ <- zoom utf8 Text.drawChar |
174 | > byte_ <- Bytes.peekByte | 178 | > byte_ <- Bytes.peekByte |
175 | > return (char_, byte_) | 179 | > return (char_, byte_) |
176 | 180 | ||
177 | Though @withNextByte@ is partly defined with a Text parser 'drawChar'; | 181 | Though @charPlusByte@ is partly defined with a Text parser 'drawChar'; |
178 | but it is a ByteString parser; it will return the first valid utf8-encoded | 182 | but it is a ByteString parser; it will return the first valid utf8-encoded |
179 | Char in a ByteString, whatever its length, | 183 | Char in a ByteString, whatever its byte-length, |
180 | and the first byte of the next character, if they exist. Because | 184 | and the first byte following, if both exist. Because |
181 | we \'draw\' one and \'peek\' at the other, the parser as a whole only | 185 | we \'draw\' one and \'peek\' at the other, the parser as a whole only |
182 | advances one Char's length along the bytestring, whatever that length may be. | 186 | advances one Char's length along the bytestring, whatever that length may be. |
183 | See the slightly more complex example \'decode.hs\' in the | 187 | See the slightly more complex example \'decode.hs\' in the |
184 | <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall> | 188 | <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall blog> |
185 | discussion of this type of byte stream parsing. | 189 | discussion of this type of byte stream parsing. |
186 | -} | 190 | -} |
187 | 191 | ||
192 | type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) | ||
193 | |||
194 | type Codec | ||
195 | = forall m r | ||
196 | . Monad m | ||
197 | => Lens' (Producer ByteString m r) | ||
198 | (Producer Text m (Producer ByteString m r)) | ||
199 | |||
200 | |||
201 | {- | @decode@ is just the ordinary @view@ or @(^.)@ of the lens libraries; | ||
202 | exported here under a name appropriate to the material. Thus | ||
203 | |||
204 | > decode utf8 bytes :: Producer Text IO (Producer ByteString IO ()) | ||
205 | |||
206 | All of these are thus the same: | ||
207 | |||
208 | > decode utf8 bytes = view utf8 bytes = bytes ^. utf8 = decodeUtf8 bytes | ||
209 | |||
210 | |||
211 | -} | ||
212 | |||
213 | decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b | ||
214 | decode codec a = getConstant (codec Constant a) | ||
215 | |||
216 | {- | @eof@ tells you explicitly when decoding stops due to bad bytes or | ||
217 | instead reaches end-of-file happily. (Without it one just makes an explicit | ||
218 | test for emptiness of the resulting bytestring production using next) Thus | ||
219 | |||
220 | > decode (utf8 . eof) bytes :: Producer T.Text IO (Either (Producer B.ByteString IO ()) ()) | ||
221 | |||
222 | If we hit undecodable bytes, the remaining bytestring producer will be | ||
223 | returned as a Left value; in the happy case, a Right value is returned | ||
224 | with the anticipated return value for the original bytestring producer. | ||
225 | |||
226 | Again, all of these are the same | ||
227 | |||
228 | > decode (utf8 . eof) bytes = view (utf8 . eof) p = p^.utf8.eof | ||
229 | |||
230 | -} | ||
231 | |||
232 | eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r)) | ||
233 | (Producer Text m (Either (Producer ByteString m r) r)) | ||
234 | eof k p = fmap fromEither (k (toEither p)) where | ||
235 | |||
236 | fromEither = liftM (either id return) | ||
237 | |||
238 | toEither pp = do p <- pp | ||
239 | check p | ||
240 | |||
241 | check p = do e <- lift (next p) | ||
242 | case e of | ||
243 | Left r -> return (Right r) | ||
244 | Right (bs,pb) -> if B.null bs | ||
245 | then check pb | ||
246 | else return (Left (do yield bs | ||
247 | pb)) | ||
248 | |||
188 | utf8 :: Codec | 249 | utf8 :: Codec |
189 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 | 250 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 |
190 | 251 | ||