diff options
Diffstat (limited to 'Pipes/Text/Encoding.hs')
-rw-r--r-- | Pipes/Text/Encoding.hs | 122 |
1 files changed, 106 insertions, 16 deletions
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs index e07c47e..a1a0113 100644 --- a/Pipes/Text/Encoding.hs +++ b/Pipes/Text/Encoding.hs | |||
@@ -1,16 +1,17 @@ | |||
1 | {-# LANGUAGE RankNTypes, BangPatterns #-} | 1 | {-# LANGUAGE RankNTypes, BangPatterns #-} |
2 | -- | | ||
3 | 2 | ||
4 | -- This module uses the stream decoding functions from the text-stream-decoding package | 3 | -- | This module uses the stream decoding functions from Michael Snoyman's new |
5 | -- to define decoding functions and lenses. | 4 | -- <http://hackage.haskell.org/package/text-stream-decode text-stream-decode> |
5 | -- package to define decoding functions and lenses. | ||
6 | 6 | ||
7 | module Pipes.Text.Encoding | 7 | module Pipes.Text.Encoding |
8 | ( | 8 | ( |
9 | -- * Lens type | 9 | -- * The Lens or Codec type |
10 | -- $lenses | 10 | -- $lenses |
11 | Codec | 11 | Codec |
12 | -- * Standard lenses for viewing Text in ByteString | 12 | -- * Viewing the Text in a ByteString |
13 | -- $codecs | 13 | -- $codecs |
14 | , decode | ||
14 | , utf8 | 15 | , utf8 |
15 | , utf8Pure | 16 | , utf8Pure |
16 | , utf16LE | 17 | , utf16LE |
@@ -18,12 +19,20 @@ module Pipes.Text.Encoding | |||
18 | , utf32LE | 19 | , utf32LE |
19 | , utf32BE | 20 | , utf32BE |
20 | -- * Non-lens decoding functions | 21 | -- * Non-lens decoding functions |
22 | -- $decoders | ||
21 | , decodeUtf8 | 23 | , decodeUtf8 |
22 | , decodeUtf8Pure | 24 | , decodeUtf8Pure |
23 | , decodeUtf16LE | 25 | , decodeUtf16LE |
24 | , decodeUtf16BE | 26 | , decodeUtf16BE |
25 | , decodeUtf32LE | 27 | , decodeUtf32LE |
26 | , decodeUtf32BE | 28 | , decodeUtf32BE |
29 | -- * Re-encoding functions | ||
30 | -- $encoders | ||
31 | , encodeUtf8 | ||
32 | , encodeUtf16LE | ||
33 | , encodeUtf16BE | ||
34 | , encodeUtf32LE | ||
35 | , encodeUtf32BE | ||
27 | -- * Functions for latin and ascii text | 36 | -- * Functions for latin and ascii text |
28 | -- $ascii | 37 | -- $ascii |
29 | , encodeAscii | 38 | , encodeAscii |
@@ -33,6 +42,7 @@ module Pipes.Text.Encoding | |||
33 | ) | 42 | ) |
34 | where | 43 | where |
35 | 44 | ||
45 | import Data.Functor.Constant (Constant(..)) | ||
36 | import Data.Char (ord) | 46 | import Data.Char (ord) |
37 | import Data.ByteString as B | 47 | import Data.ByteString as B |
38 | import Data.ByteString (ByteString) | 48 | import Data.ByteString (ByteString) |
@@ -49,16 +59,16 @@ import Pipes | |||
49 | type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) | 59 | type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
50 | 60 | ||
51 | {- $lenses | 61 | {- $lenses |
52 | The 'Codec' type is just an aliased standard Prelude type. It just specializes | 62 | The 'Codec' type is a simple specialization of |
53 | the @Lens'@ type synonym used by the standard lens libraries, @lens@ and | 63 | the @Lens'@ type synonym used by the standard lens libraries, |
54 | @lens-families@ . You use them with | 64 | <http://hackage.haskell.org/package/lens lens> and |
55 | the @view@ or @(^.)@ and @zoom@ functions from those libraries. | 65 | <http://hackage.haskell.org/package/lens-family lens-family>. That type, |
56 | 66 | ||
57 | Each codec lens looks into a byte stream that is understood to contain text. | 67 | > type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
58 | The stream of text it 'sees' in the stream of bytes begins at its head; it ends | 68 | |
59 | by reverting to (returning) the original byte stream | 69 | is just an alias for an ordinary Prelude type. Thus you use any codec with |
60 | beginning at the point of decoding failure. Where there is no decoding failure, | 70 | the @view@ / @(^.)@ and @zoom@ functions from those libraries. |
61 | it returns an empty byte stream with its return value. | 71 | |
62 | -} | 72 | -} |
63 | 73 | ||
64 | type Codec | 74 | type Codec |
@@ -66,6 +76,17 @@ type Codec | |||
66 | . Monad m | 76 | . Monad m |
67 | => Lens' (Producer ByteString m r) | 77 | => Lens' (Producer ByteString m r) |
68 | (Producer Text m (Producer ByteString m r)) | 78 | (Producer Text m (Producer ByteString m r)) |
79 | |||
80 | {- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries; | ||
81 | exported here for convenience | ||
82 | |||
83 | > decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf8 | ||
84 | |||
85 | -} | ||
86 | |||
87 | decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b | ||
88 | decode codec a = getConstant (codec Constant a) | ||
89 | |||
69 | 90 | ||
70 | decodeStream :: Monad m | 91 | decodeStream :: Monad m |
71 | => (B.ByteString -> DecodeResult) | 92 | => (B.ByteString -> DecodeResult) |
@@ -82,7 +103,20 @@ decodeStream = loop where | |||
82 | p') | 103 | p') |
83 | {-# INLINABLE decodeStream#-} | 104 | {-# INLINABLE decodeStream#-} |
84 | 105 | ||
106 | {- $decoders | ||
107 | These are functions with the simple type: | ||
108 | |||
109 | > decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
110 | |||
111 | Thus in general | ||
112 | |||
113 | > decodeUtf8 = view utf8 | ||
114 | > decodeUtf16LE = view utf16LE | ||
85 | 115 | ||
116 | and so forth, but these forms | ||
117 | may be more convenient (and give better type errors!) where lenses are | ||
118 | not desired. | ||
119 | -} | ||
86 | 120 | ||
87 | 121 | ||
88 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 122 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
@@ -109,6 +143,34 @@ decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer | |||
109 | decodeUtf32BE = decodeStream streamUtf32BE | 143 | decodeUtf32BE = decodeStream streamUtf32BE |
110 | {-# INLINE decodeUtf32BE #-} | 144 | {-# INLINE decodeUtf32BE #-} |
111 | 145 | ||
146 | |||
147 | {- $encoders | ||
148 | These are simply defined | ||
149 | |||
150 | > encodeUtf8 = yield . TE.encodeUtf8 | ||
151 | |||
152 | They are intended for use with 'for' | ||
153 | |||
154 | > for Text.stdin encodeUtf8 :: Producer ByteString IO () | ||
155 | |||
156 | which would have the effect of | ||
157 | |||
158 | > Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8) | ||
159 | |||
160 | using the encoding functions from Data.Text.Encoding | ||
161 | -} | ||
162 | |||
163 | encodeUtf8 :: Monad m => Text -> Producer ByteString m () | ||
164 | encodeUtf8 = yield . TE.encodeUtf8 | ||
165 | encodeUtf16LE :: Monad m => Text -> Producer ByteString m () | ||
166 | encodeUtf16LE = yield . TE.encodeUtf16LE | ||
167 | encodeUtf16BE :: Monad m => Text -> Producer ByteString m () | ||
168 | encodeUtf16BE = yield . TE.encodeUtf16BE | ||
169 | encodeUtf32LE :: Monad m => Text -> Producer ByteString m () | ||
170 | encodeUtf32LE = yield . TE.encodeUtf32LE | ||
171 | encodeUtf32BE :: Monad m => Text -> Producer ByteString m () | ||
172 | encodeUtf32BE = yield . TE.encodeUtf32BE | ||
173 | |||
112 | mkCodec :: (forall r m . Monad m => | 174 | mkCodec :: (forall r m . Monad m => |
113 | Producer ByteString m r -> Producer Text m (Producer ByteString m r )) | 175 | Producer ByteString m r -> Producer Text m (Producer ByteString m r )) |
114 | -> (Text -> ByteString) | 176 | -> (Text -> ByteString) |
@@ -118,11 +180,39 @@ mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) | |||
118 | 180 | ||
119 | {- $codecs | 181 | {- $codecs |
120 | 182 | ||
121 | The particular \'Codec\' lenses are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc. | 183 | Each codec/lens looks into a byte stream that is supposed to contain text. |
184 | The particular \'Codec\' lenses are named in accordance with the expected | ||
185 | encoding, 'utf8', 'utf16LE' etc. @view@ / @(^.)@ -- here also called 'decode' -- | ||
186 | turns a Codec into a function: | ||
122 | 187 | ||
123 | > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 188 | > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
189 | > decode utf8 Bytes.stdin :: Producer Text IO (Producer ByteString IO r) | ||
124 | > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) | 190 | > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) |
125 | 191 | ||
192 | Uses of a codec with @view@ / @(^.)@ / 'decode' can always be replaced by the specialized | ||
193 | decoding functions exported here, e.g. | ||
194 | |||
195 | > decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
196 | > decodeUtf8 Bytes.stdin :: Producer Text IO (Producer ByteString IO r) | ||
197 | |||
198 | The stream of text a @Codec@ \'sees\' in the stream of bytes begins at its head. | ||
199 | At any point of decoding failure, the stream of text ends and reverts to (returns) | ||
200 | the original byte stream. Thus if the first bytes are already | ||
201 | un-decodable, the whole ByteString producer will be returned, i.e. | ||
202 | |||
203 | > view utf8 bytestream | ||
204 | |||
205 | will just come to the same as | ||
206 | |||
207 | > return bytestream | ||
208 | |||
209 | Where there is no decoding failure, the return value of the text stream will be | ||
210 | an empty byte stream followed by its own return value. In all cases you must | ||
211 | deal with the fact that it is a ByteString producer that is returned, even if | ||
212 | it can be thrown away with @Control.Monad.void@ | ||
213 | |||
214 | > void (Bytes.stdin ^. utf8) :: Producer Text IO () | ||
215 | |||
126 | @zoom@ converts a Text parser into a ByteString parser: | 216 | @zoom@ converts a Text parser into a ByteString parser: |
127 | 217 | ||
128 | > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) | 218 | > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) |
@@ -165,7 +255,7 @@ utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE | |||
165 | -} | 255 | -} |
166 | 256 | ||
167 | 257 | ||
168 | -- 'encodeAscii' reduces as much of your stream of 'Text' as actually is ascii to a byte stream, | 258 | -- | 'encodeAscii' reduces as much of your stream of 'Text' as actually is ascii to a byte stream, |
169 | -- returning the rest of the 'Text' at the first non-ascii 'Char' | 259 | -- returning the rest of the 'Text' at the first non-ascii 'Char' |
170 | 260 | ||
171 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | 261 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) |