aboutsummaryrefslogtreecommitdiffhomepage
path: root/Pipes/Text
diff options
context:
space:
mode:
authormichaelt <what_is_it_to_do_anything@yahoo.com>2015-05-04 14:16:55 -0400
committermichaelt <what_is_it_to_do_anything@yahoo.com>2015-05-04 14:16:55 -0400
commitedd8726a468112705d247312c8d7edac5c5cd04e (patch)
tree9eeed7caea7467f6328428aebf21c6f653ad68d8 /Pipes/Text
parent5e387e52ab52117a1b245508232cd04d79fff47d (diff)
downloadtext-pipes-edd8726a468112705d247312c8d7edac5c5cd04e.tar.gz
text-pipes-edd8726a468112705d247312c8d7edac5c5cd04e.tar.zst
text-pipes-edd8726a468112705d247312c8d7edac5c5cd04e.zip
encoding documentation
Diffstat (limited to 'Pipes/Text')
-rw-r--r--Pipes/Text/Encoding.hs203
1 files changed, 132 insertions, 71 deletions
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs
index 97a9c23..b6aa709 100644
--- a/Pipes/Text/Encoding.hs
+++ b/Pipes/Text/Encoding.hs
@@ -1,19 +1,25 @@
1{-# LANGUAGE RankNTypes, BangPatterns #-} 1{-# LANGUAGE RankNTypes, BangPatterns #-}
2 2
3-- | This module uses the stream decoding functions from 3-- | This module uses the stream decoding functions from
4-- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode> 4-- <http://hackage.haskell.org/package/streaming-commons streaming-commons>
5-- package to define decoding functions and lenses. The exported names 5-- package to define decoding functions and lenses. The exported names
6-- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@ 6-- conflict with names in @Data.Text.Encoding@ but not with the @Prelude@
7 7
8module Pipes.Text.Encoding 8module Pipes.Text.Encoding
9 ( 9 (
10 -- * The Lens or Codec type 10 -- * Decoding ByteStrings and Encoding Texts
11 -- ** Simple usage
12 -- $usage
13
14 -- ** Lens usage
11 -- $lenses 15 -- $lenses
16
17
18 -- * Basic lens operations
12 Codec 19 Codec
13 , decode 20 , decode
14 , eof 21 , eof
15 -- * \'Viewing\' the Text in a byte stream 22 -- * Decoding lenses
16 -- $codecs
17 , utf8 23 , utf8
18 , utf8Pure 24 , utf8Pure
19 , utf16LE 25 , utf16LE
@@ -58,99 +64,83 @@ import Control.Monad (join, liftM)
58import Data.Word (Word8) 64import Data.Word (Word8)
59import Pipes 65import Pipes
60 66
61type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
62 67
63{- $lenses
64 The 'Codec' type is a simple specializion of
65 the @Lens'@ type synonymn used by the standard lens libraries,
66 <http://hackage.haskell.org/package/lens lens> and
67 <http://hackage.haskell.org/package/lens-family lens-family>. That type,
68
69> type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
70 68
71 is just an alias for a Prelude type. Thus you use any particular codec with 69{- $usage
72 the @view@ / @(^.)@ , @zoom@ and @over@ functions from either of those libraries; 70 Given
73 we presuppose neither library since we already have access to the types they require.
74 -}
75 71
76type Codec 72> text :: Producer Text IO ()
77 = forall m r
78 . Monad m
79 => Lens' (Producer ByteString m r)
80 (Producer Text m (Producer ByteString m r))
81 73
82{- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries; 74 we can encode it with @Data.Text.Encoding@ and ordinary pipe operations:
83 exported here under a name appropriate to the material. All of these are
84 the same:
85 75
86> decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8 76> text >-> P.map TE.encodeUtf8 :: Producer.ByteString IO ()
87 77
88-} 78 or, using this module, with
89 79
80> for text encodeUtf8 :: Producer.ByteString IO ()
90 81
91decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b 82 Given
92decode codec a = getConstant (codec Constant a) 83
84> bytes :: Producer ByteString Text IO ()
93 85
94{- | 'eof' tells you explicitly when decoding stops due to bad bytes or instead 86 we can apply a decoding function from this module:
95 reaches end-of-file happily. (Without it one just makes an explicit test
96 for emptiness of the resulting bytestring production using 'next')
97 Thus
98 87
99> decode (utf8 . eof) p = view (utf8 . eof) p = p^.utf8.eof 88> decodeUtf8 bytes :: Producer Text IO (Producer ByteString IO ())
100 89
101 will be a text producer. If we hit undecodable bytes, the remaining 90 The Text producer ends wherever decoding first fails. Thus we can re-encode
102 bytestring producer will be returned as a 'Left' value; 91 as uft8 as much of our byte stream as is decodeUtf16BE decodable, with, e.g.
103 in the happy case, a 'Right' value is returned with the anticipated 92
104 return value for the original bytestring producer. 93> for (decodeUtf16BE bytes) encodeUtf8 :: Producer ByteString IO (Producer ByteString IO ())
105 ) 94
95 The bytestring producer that is returned begins with where utf16BE decoding
96 failed; it it didn't fail the producer is empty.
106 97
107-} 98-}
108 99
109eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r)) 100{- $lenses
110 (Producer Text m (Either (Producer ByteString m r) r)) 101 We get a bit more flexibility, though, if we use a lens like @utf8@ or @utf16BE@
111eof k p = fmap fromEither (k (toEither p)) where 102 that looks for text in an appropriately encoded byte stream.
112 103
113 fromEither = liftM (either id return) 104> type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
114 105
115 toEither pp = do p <- pp 106 is just an alias for a Prelude type. We abbreviate this further, for our use case, as
116 check p
117 107
118 check p = do e <- lift (next p) 108> type Codec
119 case e of 109> = forall m r . Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r))
120 Left r -> return (Right r)
121 Right (bs,pb) -> if B.null bs
122 then check pb
123 else return (Left (do yield bs
124 pb))
125 110
111 and call the decoding lenses @utf8@, @utf16BE@ \"codecs\", since they can
112 re-encode what they have decoded. Thus you use any particular codec with
113 the @view@ / @(^.)@ , @zoom@ and @over@ functions from the standard lens libraries;
114 we presuppose neither <http://hackage.haskell.org/package/lens lens>
115 nor <http://hackage.haskell.org/package/lens-family lens-family>
116 since we already have access to the types they require.
126 117
127{- $codecs 118 Each decoding lens looks into a byte stream that is supposed to contain text.
128 119 The particular lenses are named in accordance with the expected
129 Each Codec-lens looks into a byte stream that is supposed to contain text. 120 encoding, 'utf8', 'utf16LE' etc. To turn a such a lens or @Codec@
130 The particular \'Codec\' lenses are named in accordance with the expected 121 into an ordinary function, use @view@ / @(^.)@ -- here also called 'decode':
131 encoding, 'utf8', 'utf16LE' etc. To turn a Codec into an ordinary function,
132 use @view@ / @(^.)@ -- here also called 'decode':
133 122
134> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) 123> view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
135> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) 124> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
136> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) 125> Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r)
137 126
138 Uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by the specialized 127 These simple uses of a codec with @view@ or @(^.)@ or 'decode' can always be replaced by
139 decoding functions exported here, e.g. 128 the specialized decoding functions exported here, e.g.
140 129
141> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) 130> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r)
142> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) 131> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r)
143 132
144 The stream of text that a @Codec@ \'sees\' in the stream of bytes begins at its head. 133 As with these functions, the stream of text that a @Codec@ \'sees\'
134 in the stream of bytes begins at its head.
145 At any point of decoding failure, the stream of text ends and reverts to (returns) 135 At any point of decoding failure, the stream of text ends and reverts to (returns)
146 the original byte stream. Thus if the first bytes are already 136 the original byte stream. Thus if the first bytes are already
147 un-decodable, the whole ByteString producer will be returned, i.e. 137 un-decodable, the whole ByteString producer will be returned, i.e.
148 138
149> view utf8 bytestream 139> view utf8 bad_bytestream
150 140
151 will just come to the same as 141 will just come to the same as
152 142
153> return bytestream 143> return bad_bytestream
154 144
155 Where there is no decoding failure, the return value of the text stream will be 145 Where there is no decoding failure, the return value of the text stream will be
156 an empty byte stream followed by its own return value. In all cases you must 146 an empty byte stream followed by its own return value. In all cases you must
@@ -158,7 +148,21 @@ eof k p = fmap fromEither (k (toEither p)) where
158 it can be thrown away with @Control.Monad.void@ 148 it can be thrown away with @Control.Monad.void@
159 149
160> void (Bytes.stdin ^. utf8) :: Producer Text IO () 150> void (Bytes.stdin ^. utf8) :: Producer Text IO ()
151
152 The @eof@ lens permits you to pattern match: if there is a Right value,
153 it is the leftover bytestring producer, if there is a Right value, it
154 is the return value of the original bytestring producer:
155
156> Bytes.stdin ^. utf8 . eof :: Producer Text IO (Either (Producer ByteString IO IO) ())
161 157
158 Thus for the stream of un-decodable bytes mentioned above,
159
160> view (utf8 . eof) bad_bytestream
161
162 will be the same as
163
164> return (Left bad_bytestream)
165
162 @zoom@ converts a Text parser into a ByteString parser: 166 @zoom@ converts a Text parser into a ByteString parser:
163 167
164> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) 168> zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char)
@@ -167,24 +171,81 @@ eof k p = fmap fromEither (k (toEither p)) where
167 171
168> zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char) 172> zoom utf8 drawChar :: Monad m => Parser ByteString m (Maybe Char)
169 173
170 Thus we can define a ByteString parser like this: 174 Thus we can define a ByteString parser (in the pipes-parse sense) like this:
171 175
172> withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8))) 176> charPlusByte :: Parser ByteString m (Maybe Char, Maybe Word8)))
173> withNextByte = do char_ <- zoom utf8 Text.drawChar 177> charPlusByte = do char_ <- zoom utf8 Text.drawChar
174> byte_ <- Bytes.peekByte 178> byte_ <- Bytes.peekByte
175> return (char_, byte_) 179> return (char_, byte_)
176 180
177 Though @withNextByte@ is partly defined with a Text parser 'drawChar'; 181 Though @charPlusByte@ is partly defined with a Text parser 'drawChar';
178 but it is a ByteString parser; it will return the first valid utf8-encoded 182 but it is a ByteString parser; it will return the first valid utf8-encoded
179 Char in a ByteString, whatever its length, 183 Char in a ByteString, whatever its byte-length,
180 and the first byte of the next character, if they exist. Because 184 and the first byte following, if both exist. Because
181 we \'draw\' one and \'peek\' at the other, the parser as a whole only 185 we \'draw\' one and \'peek\' at the other, the parser as a whole only
182 advances one Char's length along the bytestring, whatever that length may be. 186 advances one Char's length along the bytestring, whatever that length may be.
183 See the slightly more complex example \'decode.hs\' in the 187 See the slightly more complex example \'decode.hs\' in the
184 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall> 188 <http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html#batteries-included haskellforall blog>
185 discussion of this type of byte stream parsing. 189 discussion of this type of byte stream parsing.
186 -} 190 -}
187 191
192type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
193
194type Codec
195 = forall m r
196 . Monad m
197 => Lens' (Producer ByteString m r)
198 (Producer Text m (Producer ByteString m r))
199
200
201{- | @decode@ is just the ordinary @view@ or @(^.)@ of the lens libraries;
202 exported here under a name appropriate to the material. Thus
203
204> decode utf8 bytes :: Producer Text IO (Producer ByteString IO ())
205
206 All of these are thus the same:
207
208> decode utf8 bytes = view utf8 bytes = bytes ^. utf8 = decodeUtf8 bytes
209
210
211-}
212
213decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b
214decode codec a = getConstant (codec Constant a)
215
216{- | @eof@ tells you explicitly when decoding stops due to bad bytes or
217 instead reaches end-of-file happily. (Without it one just makes an explicit
218 test for emptiness of the resulting bytestring production using next) Thus
219
220> decode (utf8 . eof) bytes :: Producer T.Text IO (Either (Producer B.ByteString IO ()) ())
221
222 If we hit undecodable bytes, the remaining bytestring producer will be
223 returned as a Left value; in the happy case, a Right value is returned
224 with the anticipated return value for the original bytestring producer.
225
226 Again, all of these are the same
227
228> decode (utf8 . eof) bytes = view (utf8 . eof) p = p^.utf8.eof
229
230-}
231
232eof :: Monad m => Lens' (Producer Text m (Producer ByteString m r))
233 (Producer Text m (Either (Producer ByteString m r) r))
234eof k p = fmap fromEither (k (toEither p)) where
235
236 fromEither = liftM (either id return)
237
238 toEither pp = do p <- pp
239 check p
240
241 check p = do e <- lift (next p)
242 case e of
243 Left r -> return (Right r)
244 Right (bs,pb) -> if B.null bs
245 then check pb
246 else return (Left (do yield bs
247 pb))
248
188utf8 :: Codec 249utf8 :: Codec
189utf8 = mkCodec decodeUtf8 TE.encodeUtf8 250utf8 = mkCodec decodeUtf8 TE.encodeUtf8
190 251