aboutsummaryrefslogtreecommitdiffhomepage
path: root/Pipes/Text/Encoding.hs
diff options
context:
space:
mode:
Diffstat (limited to 'Pipes/Text/Encoding.hs')
-rw-r--r--Pipes/Text/Encoding.hs166
1 files changed, 90 insertions, 76 deletions
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs
index e6fc6bf..4311ad1 100644
--- a/Pipes/Text/Encoding.hs
+++ b/Pipes/Text/Encoding.hs
@@ -2,16 +2,18 @@
2 2
3-- | This module uses the stream decoding functions from Michael Snoyman's new 3-- | This module uses the stream decoding functions from Michael Snoyman's new
4-- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode> 4-- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode>
5-- package to define decoding functions and lenses. 5-- package to define decoding functions and lenses. The exported names
6-- conflict with names in @Data.Text.Encoding@ but the module can otherwise be
7-- imported unqualified.
6 8
7module Pipes.Text.Encoding 9module Pipes.Text.Encoding
8 ( 10 (
9 -- * The Lens or Codec type 11 -- * The Lens or Codec type
10 -- $lenses 12 -- $lenses
11 Codec 13 Codec
14 , decode
12 -- * Viewing the Text in a ByteString 15 -- * Viewing the Text in a ByteString
13 -- $codecs 16 -- $codecs
14 , decode
15 , utf8 17 , utf8
16 , utf8Pure 18 , utf8Pure
17 , utf16LE 19 , utf16LE
@@ -66,8 +68,9 @@ type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
66 68
67> type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) 69> type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
68 70
69 is just an alias for an ordinary Prelude type. Thus you use any codec with 71 is just an alias for a Prelude type. Thus you use any particular codec with
70 the @view@ / @(^.)@ and @zoom@ functions from those libraries. 72 the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries;
73 we presuppose neither since we already have access to the types they require.
71 74
72 -} 75 -}
73 76
@@ -76,11 +79,12 @@ type Codec
76 . Monad m 79 . Monad m
77 => Lens' (Producer ByteString m r) 80 => Lens' (Producer ByteString m r)
78 (Producer Text m (Producer ByteString m r)) 81 (Producer Text m (Producer ByteString m r))
79 82
80{- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries; 83{- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries;
81 exported here for convience 84 exported here under a name appropriate to the material. All of these are
85 the same:
82 86
83> decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf 87> decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8
84 88
85-} 89-}
86 90
@@ -88,6 +92,85 @@ decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
88decode codec a = getConstant (codec Constant a) 92decode codec a = getConstant (codec Constant a)
89 93
90 94
95{- $codecs
96
97 Each Codec-lens looks into a byte stream that is supposed to contain text.
98 The particular \'Codec\' lenses are named in accordance with the expected
99 encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
100 use @view@ / @(^.)@ -- here also called 'decode':
101
102> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
103> decode utf8 Bytes.stdin :: Producer Text IO (Producer ByteString IO r)
104> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
105
106 Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
107 decoding functions exported here, e.g.
108
109> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
110> decodeUtf8 Bytes.stdin :: Producer Text IO (Producer ByteString IO r)
111
112 The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head.
113 At any point of decoding failure, the stream of text ends and reverts to (returns)
114 the original byte stream. Thus if the first bytes are already
115 un-decodable, the whole ByteString producer will be returned, i.e.
116
117> view utf8 bytestream
118
119 will just come to the same as
120
121> return bytestream
122
123 Where there is no decoding failure, the return value of the text stream will be
124 an empty byte stream followed by its own return value. In all cases you must
125 deal with the fact that it is a /ByteString producer/ that is returned, even if
126 it can be thrown away with @Control.Monad.void@
127
128> void (Bytes.stdin ^. utf8) :: Producer Text IO ()
129
130 @zoom@ converts a Text parser into a ByteString parser:
131
132> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
133
134 or, with the type synonym of @Pipes.Parse@:
135
136> zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
137
138 Thus we can define a ByteString parser like this:
139
140> withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)
141> withNextByte = do char_ <- zoom utf8 Text.drawChar
142> byte_ <- Bytes.peekByte
143> return (char_, byte_)
144
145 Though @withNextByte@ is partly defined with a Text parser, 'drawChar',
146 it is itself a ByteString parser; it will return the first valid utf8-encoded
147 Char in a ByteString, whatever its length,
148 and the first byte of the next character, if they exist. Because
149 we \'draw\' one and \'peek\' at the other, the parser as a whole only
150 advances one Char's length along the bytestring, whatever that length may be.
151 See the slightly more complex example \'decode.hs\' in the
152 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall>
153 discussion of this type of byte stream parsing.
154 -}
155
156utf8 :: Codec
157utf8 = mkCodec decodeUtf8 TE.encodeUtf8
158
159utf8Pure :: Codec
160utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
161
162utf16LE :: Codec
163utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
164
165utf16BE :: Codec
166utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
167
168utf32LE :: Codec
169utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
170
171utf32BE :: Codec
172utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
173
91decodeStream :: Monad m 174decodeStream :: Monad m
92 => (B.ByteString -> DecodeResult) 175 => (B.ByteString -> DecodeResult)
93 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) 176 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
@@ -178,75 +261,6 @@ mkCodec :: (forall r m . Monad m =>
178mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) 261mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
179 262
180 263
181{- $codecs
182
183 Each codec/lens looks into a byte stream that is supposed to contain text.
184 The particular \'Codec\' lenses are named in accordance with the expected
185 encoding, 'utf8', 'utf16LE' etc. @view@ / @(^.)@ -- here also called 'decode' --
186 turns a Codec into a function:
187
188> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
189> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
190> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
191
192 Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized
193 decoding functions exported here, e.g.
194
195> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
196> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
197
198 The stream of text a @Codec@ \'sees\' in the stream of bytes begins at its head.
199 At any point of decoding failure, the stream of text ends and reverts to (returns)
200 the original byte stream. Thus if the first bytes are already
201 un-decodable, the whole ByteString producer will be returned, i.e.
202
203> view utf8 bytestream
204
205 will just come to the same as
206
207> return bytestream
208
209 Where there is no decoding failure, the return value of the text stream will be
210 an empty byte stream followed by its own return value. In all cases you must
211 deal with the fact that it is a ByteString producer that is returned, even if
212 it can be thrown away with @Control.Monad.void@
213
214> void (Bytes.stdin ^. utf8) :: Producer Text IO ()
215
216 @zoom@ converts a Text parser into a ByteString parser:
217
218> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
219>
220> withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
221> withNextByte = do char_ <- zoom utf8 Text.drawChar
222> byte_ <- Bytes.peekByte
223> return (char_, byte_)
224
225 @withNextByte@ will return the first valid Char in a ByteString,
226 and the first byte of the next character, if they exists. Because
227 we \'draw\' one and \'peek\' at the other, the parser as a whole only
228 advances one Char's length along the bytestring.
229
230 -}
231
232utf8 :: Codec
233utf8 = mkCodec decodeUtf8 TE.encodeUtf8
234
235utf8Pure :: Codec
236utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
237
238utf16LE :: Codec
239utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
240
241utf16BE :: Codec
242utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
243
244utf32LE :: Codec
245utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
246
247utf32BE :: Codec
248utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
249
250 264
251{- $ascii 265{- $ascii
252 ascii and latin encodings only use a small number of the characters 'Text' 266 ascii and latin encodings only use a small number of the characters 'Text'