diff options
Diffstat (limited to 'Pipes')
-rw-r--r-- | Pipes/Text/Encoding.hs | 73 |
1 files changed, 56 insertions, 17 deletions
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs index 21269cf..e07c47e 100644 --- a/Pipes/Text/Encoding.hs +++ b/Pipes/Text/Encoding.hs | |||
@@ -1,24 +1,31 @@ | |||
1 | |||
2 | {-# LANGUAGE RankNTypes, BangPatterns #-} | 1 | {-# LANGUAGE RankNTypes, BangPatterns #-} |
3 | -- | | 2 | -- | |
4 | 3 | ||
5 | -- This module uses the stream decoding functions from the text-stream-decoding package | 4 | -- This module uses the stream decoding functions from the text-stream-decoding package |
6 | -- to define pipes decoding functions and lenses. | 5 | -- to define decoding functions and lenses. |
7 | 6 | ||
8 | module Pipes.Text.Encoding | 7 | module Pipes.Text.Encoding |
9 | ( Codec | 8 | ( |
9 | -- * Lens type | ||
10 | -- $lenses | ||
11 | Codec | ||
12 | -- * Standard lenses for viewing Text in ByteString | ||
13 | -- $codecs | ||
10 | , utf8 | 14 | , utf8 |
11 | , utf8Pure | 15 | , utf8Pure |
12 | , utf16LE | 16 | , utf16LE |
13 | , utf16BE | 17 | , utf16BE |
14 | , utf32LE | 18 | , utf32LE |
15 | , utf32BE | 19 | , utf32BE |
20 | -- * Non-lens decoding functions | ||
16 | , decodeUtf8 | 21 | , decodeUtf8 |
17 | , decodeUtf8Pure | 22 | , decodeUtf8Pure |
18 | , decodeUtf16LE | 23 | , decodeUtf16LE |
19 | , decodeUtf16BE | 24 | , decodeUtf16BE |
20 | , decodeUtf32LE | 25 | , decodeUtf32LE |
21 | , decodeUtf32BE | 26 | , decodeUtf32BE |
27 | -- * Functions for latin and ascii text | ||
28 | -- $ascii | ||
22 | , encodeAscii | 29 | , encodeAscii |
23 | , decodeAscii | 30 | , decodeAscii |
24 | , encodeIso8859_1 | 31 | , encodeIso8859_1 |
@@ -38,13 +45,22 @@ import Control.Monad (join) | |||
38 | import Data.Word (Word8) | 45 | import Data.Word (Word8) |
39 | import Pipes | 46 | import Pipes |
40 | 47 | ||
48 | |||
41 | type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) | 49 | type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
42 | 50 | ||
43 | {- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text. | 51 | {- $lenses |
44 | They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc. | 52 | The 'Codec' type is just an aliased standard Prelude type. It just specializes |
45 | The stream of text they 'see' in a bytestream ends by returning the original byte stream | 53 | the @Lens'@ type synonym used by the standard lens libraries, @lens@ and |
46 | beginning at the point of failure, or the empty bytestream with its return value. | 54 | @lens-families@ . You use them with |
47 | -} | 55 | the @view@ or @(^.)@ and @zoom@ functions from those libraries. |
56 | |||
57 | Each codec lens looks into a byte stream that is understood to contain text. | ||
58 | The stream of text it 'sees' in the stream of bytes begins at its head; it ends | ||
59 | by reverting to (returning) the original byte stream | ||
60 | beginning at the point of decoding failure. Where there is no decoding failure, | ||
61 | it returns an empty byte stream with its return value. | ||
62 | -} | ||
63 | |||
48 | type Codec | 64 | type Codec |
49 | = forall m r | 65 | = forall m r |
50 | . Monad m | 66 | . Monad m |
@@ -66,6 +82,9 @@ decodeStream = loop where | |||
66 | p') | 82 | p') |
67 | {-# INLINABLE decodeStream#-} | 83 | {-# INLINABLE decodeStream#-} |
68 | 84 | ||
85 | |||
86 | |||
87 | |||
69 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 88 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
70 | decodeUtf8 = decodeStream streamUtf8 | 89 | decodeUtf8 = decodeStream streamUtf8 |
71 | {-# INLINE decodeUtf8 #-} | 90 | {-# INLINE decodeUtf8 #-} |
@@ -97,10 +116,28 @@ mkCodec :: (forall r m . Monad m => | |||
97 | mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) | 116 | mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) |
98 | 117 | ||
99 | 118 | ||
100 | {- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated | 119 | {- $codecs |
101 | text stream ends by returning the original bytestream beginning at the point of failure, | 120 | |
102 | or the empty bytestring for a well-encoded text. | 121 | The particular \'Codec\' lenses are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc. |
103 | -} | 122 | |
123 | > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
124 | > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO ()) | ||
125 | |||
126 | @zoom@ converts a Text parser into a ByteString parser: | ||
127 | |||
128 | > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) | ||
129 | > | ||
130 | > withNextByte :: Parser ByteString m (Maybe Char, Maybe Word8) | ||
131 | > withNextByte = do char_ <- zoom utf8 Text.drawChar | ||
132 | > byte_ <- Bytes.peekByte | ||
133 | > return (char_, byte_) | ||
134 | |||
135 | @withNextByte@ will return the first valid Char in a ByteString, | ||
136 | and the first byte of the next character, if they exist. Because | ||
137 | we \'draw\' one and \'peek\' at the other, the parser as a whole only | ||
138 | advances one Char's length along the bytestring. | ||
139 | |||
140 | -} | ||
104 | 141 | ||
105 | utf8 :: Codec | 142 | utf8 :: Codec |
106 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 | 143 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 |
@@ -121,14 +158,16 @@ utf32BE :: Codec | |||
121 | utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE | 158 | utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE |
122 | 159 | ||
123 | 160 | ||
124 | {- | ascii and latin encodings only use a small number of the characters 'Text' | 161 | {- $ascii |
125 | recognizes; thus we cannot use the pipes 'Lens' style to work with them. | 162 | ascii and latin encodings only use a small number of the characters 'Text' |
163 | recognizes; thus we cannot use the pipes @Lens@ style to work with them. | ||
126 | Rather we simply define functions each way. | 164 | Rather we simply define functions each way. |
127 | |||
128 | 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream, | ||
129 | returning the rest of the 'Text' at the first non-ascii 'Char' | ||
130 | -} | 165 | -} |
131 | 166 | ||
167 | |||
168 | -- 'encodeAscii' reduces as much of your stream of 'Text' as is actually ascii to a byte stream, | ||
169 | -- returning the rest of the 'Text' at the first non-ascii 'Char' | ||
170 | |||
132 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | 171 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) |
133 | encodeAscii = go where | 172 | encodeAscii = go where |
134 | go p = do e <- lift (next p) | 173 | go p = do e <- lift (next p) |