diff options
Diffstat (limited to 'Pipes/Text')
-rw-r--r-- | Pipes/Text/Encoding.hs | 59 |
1 files changed, 41 insertions, 18 deletions
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs index 5a73aa9..e242411 100644 --- a/Pipes/Text/Encoding.hs +++ b/Pipes/Text/Encoding.hs | |||
@@ -65,27 +65,48 @@ import Pipes | |||
65 | 65 | ||
66 | 66 | ||
67 | {- $usage | 67 | {- $usage |
68 | Given | 68 | Encoding is of course simple. Given |
69 | 69 | ||
70 | > text :: Producer Text IO () | 70 | > text :: Producer Text IO () |
71 | 71 | ||
72 | we can encode it with @Data.Text.Encoding@ and ordinary pipe operations: | 72 | we can encode it with @Data.Text.Encoding.encodeUtf8@ |
73 | |||
74 | > TE.encodeUtf8 :: Text -> ByteString | ||
75 | |||
76 | and ordinary pipe operations: | ||
73 | 77 | ||
74 | > text >-> P.map TE.encodeUtf8 :: Producer ByteString IO () | 78 | > text >-> P.map TE.encodeUtf8 :: Producer ByteString IO () |
75 | 79 | ||
76 | or, using this module, with | 80 | or, equivalently |
81 | |||
82 | > for text (yield . TE.encodeUtf8) | ||
83 | |||
84 | But, using this module, we might use | ||
85 | |||
86 | > encodeUtf8 :: Text -> Producer ByteString m () | ||
87 | |||
88 | to write | ||
77 | 89 | ||
78 | > for text encodeUtf8 :: Producer ByteString IO () | 90 | > for text encodeUtf8 :: Producer ByteString IO () |
79 | 91 | ||
80 | Given | 92 | All of the above come to the same thing. |
93 | |||
94 | |||
95 | Given | ||
81 | 96 | ||
82 | > bytes :: Producer ByteString Text IO () | 97 | > bytes :: Producer ByteString IO () |
83 | 98 | ||
84 | we can apply a decoding function from this module: | 99 | we can apply a decoding function from this module: |
85 | 100 | ||
86 | > decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ()) | 101 | > decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ()) |
87 | 102 | ||
88 | The Text producer ends wherever decoding first fails. Thus we can re-encode | 103 | The Text producer ends wherever decoding first fails. The un-decoded |
104 | material is returned. If we are confident it is of no interest, we can | ||
105 | write: | ||
106 | |||
107 | > void $ decodeUtf8 bytes :: Producer Text IO () | ||
108 | |||
109 | Thus we can re-encode | ||
89 | as utf8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g. | 110 | as utf8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g. |
90 | 111 | ||
91 | > for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ()) | 112 | > for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ()) |
@@ -96,12 +117,13 @@ import Pipes | |||
96 | -} | 117 | -} |
97 | 118 | ||
98 | {- $lenses | 119 | {- $lenses |
99 | We get a bit more flexibility, though, if we use a lens like @utf8@ or @utf16BE@ | 120 | We get a bit more flexibility, particularly in the use of pipes-style "parsers", |
100 | that looks for text in an appropriately encoded byte stream. | 121 | if we use a lens like @utf8@ or @utf16BE@ |
122 | that focusses on the text in an appropriately encoded byte stream. | ||
101 | 123 | ||
102 | > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) | 124 | > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
103 | 125 | ||
104 | is just an alias for a Prelude type. We abbreviate this further, for our use case, as | 126 | is just an alias for a Prelude type. We abbreviate this further, for our use case, as |
105 | 127 | ||
106 | > type Codec | 128 | > type Codec |
107 | > = forall m r . Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) | 129 | > = forall m r . Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) |
@@ -109,9 +131,11 @@ import Pipes | |||
109 | and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can | 131 | and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can |
110 | re-encode what they have decoded. Thus you use any particular codec with | 132 | re-encode what they have decoded. Thus you use any particular codec with |
111 | the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries; | 133 | the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries; |
112 | we presuppose neither <http://hackage.haskell.org/package/lens lens> | 134 | <http://hackage.haskell.org/package/lens lens>, |
113 | nor <http://hackage.haskell.org/package/lens-family lens-family> | 135 | <http://hackage.haskell.org/package/lens-family lens-family>, |
114 | since we already have access to the types they require. | 136 | <http://hackage.haskell.org/package/lens-simple lens-simple>, and the |
137 | <http://hackage.haskell.org/package/microlens microlens> packages will all work | ||
138 | the same, since we already have access to the types they require. | ||
115 | 139 | ||
116 | Each decoding lens looks into a byte stream that is supposed to contain text. | 140 | Each decoding lens looks into a byte stream that is supposed to contain text. |
117 | The particular lenses are named in accordance with the expected | 141 | The particular lenses are named in accordance with the expected |
@@ -122,8 +146,7 @@ import Pipes | |||
122 | > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) | 146 | > decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) |
123 | > Byte.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) | 147 | > Byte.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) |
124 | 148 | ||
125 | These simple uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by | 149 | Of course, we could always do this with the specialized decoding functions, e.g. |
126 | the specialized decoding functions exported here, e.g. | ||
127 | 150 | ||
128 | > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 151 | > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
129 | > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) | 152 | > decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) |
@@ -161,7 +184,7 @@ import Pipes | |||
161 | 184 | ||
162 | > return (Left bad_bytestream) | 185 | > return (Left bad_bytestream) |
163 | 186 | ||
164 | @zoom@ converts a Text parser into a ByteString parser: | 187 | @zoom utf8@ converts a Text parser into a ByteString parser: |
165 | 188 | ||
166 | > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) | 189 | > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) |
167 | 190 | ||
@@ -178,7 +201,7 @@ import Pipes | |||
178 | 201 | ||
179 | Though @charPlusByte@ is partly defined with a Text parser 'drawChar', | 202 | Though @charPlusByte@ is partly defined with a Text parser 'drawChar', |
180 | it is a ByteString parser; it will return the first valid utf8-encoded | 203 | it is a ByteString parser; it will return the first valid utf8-encoded |
181 | Char in a ByteString, whatever its byte-length, | 204 | Char in a ByteString, /whatever its byte-length/, |
182 | and the first byte following, if both exist. Because | 205 | and the first byte following, if both exist. Because |
183 | we \'draw\' one and \'peek\' at the other, the parser as a whole only | 206 | we \'draw\' one and \'peek\' at the other, the parser as a whole only |
184 | advances one Char's length along the bytestring, whatever that length may be. | 207 | advances one Char's length along the bytestring, whatever that length may be. |
@@ -227,8 +250,8 @@ decode codec a = getConstant (codec Constant a) | |||
227 | 250 | ||
228 | -} | 251 | -} |
229 | 252 | ||
230 | eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r)) | 253 | eof :: (Monad m, Monad (t m), MonadTrans t) => Lens' (t m (Producer ByteString m r)) |
231 | (Producer Text m (Either (Producer ByteString m r) r)) | 254 | (t m (Either (Producer ByteString m r) r)) |
232 | eof k p0 = fmap fromEither (k (toEither p0)) where | 255 | eof k p0 = fmap fromEither (k (toEither p0)) where |
233 | 256 | ||
234 | fromEither = liftM (either id return) | 257 | fromEither = liftM (either id return) |