diff options
Diffstat (limited to 'Pipes/Text/Encoding.hs')
-rw-r--r-- | Pipes/Text/Encoding.hs | 205 |
1 files changed, 205 insertions, 0 deletions
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs new file mode 100644 index 0000000..2bb5807 --- /dev/null +++ b/Pipes/Text/Encoding.hs | |||
@@ -0,0 +1,205 @@ | |||
1 | |||
2 | {-# LANGUAGE RankNTypes, BangPatterns #-} | ||
3 | -- | | ||
4 | -- Copyright: 2014 Michael Thompson | ||
5 | -- | ||
6 | -- This module uses the stream decoding functions from the text-stream-decoding package | ||
7 | -- to define pipes decoding functions and lenses. | ||
8 | |||
9 | module Pipes.Text.Encoding | ||
10 | ( DecodeResult (..) | ||
11 | , Codec | ||
12 | , decodeUtf8 | ||
13 | , decodeUtf8Pure | ||
14 | , decodeUtf16LE | ||
15 | , decodeUtf16BE | ||
16 | , decodeUtf32LE | ||
17 | , decodeUtf32BE | ||
18 | , utf8 | ||
19 | , utf8Pure | ||
20 | , utf16LE | ||
21 | , utf16BE | ||
22 | , utf32LE | ||
23 | , utf32BE | ||
24 | , encodeAscii | ||
25 | , decodeAscii | ||
26 | , encodeIso8859_1 | ||
27 | , decodeIso8859_1 | ||
28 | ) | ||
29 | where | ||
30 | |||
31 | import Data.Char (ord) | ||
32 | import Data.ByteString as B | ||
33 | import Data.ByteString (ByteString) | ||
34 | import Data.ByteString.Internal as B | ||
35 | import Data.ByteString.Char8 as B8 | ||
36 | import Data.Text (Text) | ||
37 | import qualified Data.Text as T | ||
38 | import qualified Data.Text.Encoding as TE | ||
39 | import Data.Text.StreamDecoding | ||
40 | import GHC.Word (Word8, Word32) | ||
41 | import Data.Word (Word8, Word16) | ||
42 | import Control.Monad | ||
43 | import Pipes | ||
44 | import Pipes.Core | ||
45 | |||
46 | |||
47 | |||
48 | {- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text. | ||
49 | They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc. | ||
50 | The stream of text they 'see' in a bytestream ends by returning the original byte stream | ||
51 | beginning at the point of failure, or the empty bytestream with its return value. | ||
52 | -} | ||
53 | type Codec = forall f m r . (Functor f , Monad m ) => | ||
54 | (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r))) | ||
55 | -> Producer ByteString m r -> f (Producer ByteString m r ) | ||
56 | |||
57 | decodeStream :: Monad m | ||
58 | => (B.ByteString -> DecodeResult) | ||
59 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
60 | decodeStream = loop where | ||
61 | loop dec0 p = | ||
62 | do x <- lift (next p) | ||
63 | case x of Left r -> return (return r) | ||
64 | Right (chunk, p') -> case dec0 chunk of | ||
65 | DecodeResultSuccess text dec -> do yield text | ||
66 | loop dec p' | ||
67 | DecodeResultFailure text bs -> do yield text | ||
68 | return (do yield bs | ||
69 | p') | ||
70 | {-# INLINABLE decodeStream#-} | ||
71 | |||
72 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
73 | decodeUtf8 = decodeStream streamUtf8 | ||
74 | {-# INLINE decodeUtf8 #-} | ||
75 | |||
76 | decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
77 | decodeUtf8Pure = decodeStream streamUtf8Pure | ||
78 | {-# INLINE decodeUtf8Pure #-} | ||
79 | |||
80 | decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
81 | decodeUtf16LE = decodeStream streamUtf16LE | ||
82 | {-# INLINE decodeUtf16LE #-} | ||
83 | |||
84 | decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
85 | decodeUtf16BE = decodeStream streamUtf16BE | ||
86 | {-# INLINE decodeUtf16BE #-} | ||
87 | |||
88 | decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
89 | decodeUtf32LE = decodeStream streamUtf32LE | ||
90 | {-# INLINE decodeUtf32LE #-} | ||
91 | |||
92 | decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
93 | decodeUtf32BE = decodeStream streamUtf32BE | ||
94 | {-# INLINE decodeUtf32BE #-} | ||
95 | |||
96 | mkCodec :: (forall r m . Monad m => | ||
97 | Producer ByteString m r -> Producer Text m (Producer ByteString m r )) | ||
98 | -> (Text -> ByteString) | ||
99 | -> Codec | ||
100 | mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) | ||
101 | |||
102 | |||
103 | {- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated | ||
104 | text stream ends by returning the original bytestream beginning at the point of failure, | ||
105 | or the empty bytestring for a well-encoded text. | ||
106 | -} | ||
107 | |||
108 | utf8 :: Codec | ||
109 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 | ||
110 | |||
111 | utf8Pure :: Codec | ||
112 | utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 | ||
113 | |||
114 | utf16LE :: Codec | ||
115 | utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE | ||
116 | |||
117 | utf16BE :: Codec | ||
118 | utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE | ||
119 | |||
120 | utf32LE :: Codec | ||
121 | utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE | ||
122 | |||
123 | utf32BE :: Codec | ||
124 | utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE | ||
125 | |||
126 | |||
127 | {- | ascii and latin encodings only use a small number of the characters 'Text' | ||
128 | recognizes; thus we cannot use the pipes 'Lens' style to work with them. | ||
129 | Rather we simply define functions each way. | ||
130 | |||
131 | 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream, | ||
132 | returning the rest of the 'Text' at the first non-ascii 'Char' | ||
133 | -} | ||
134 | |||
135 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | ||
136 | encodeAscii = go where | ||
137 | go p = do e <- lift (next p) | ||
138 | case e of | ||
139 | Left r -> return (return r) | ||
140 | Right (chunk, p') -> | ||
141 | if T.null chunk | ||
142 | then go p' | ||
143 | else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk | ||
144 | in do yield (B8.pack (T.unpack safe)) | ||
145 | if T.null unsafe | ||
146 | then go p' | ||
147 | else return $ do yield unsafe | ||
148 | p' | ||
149 | |||
150 | {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream, | ||
151 | returning the rest of the 'Text' upon hitting any non-latin 'Char' | ||
152 | -} | ||
153 | encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | ||
154 | encodeIso8859_1 = go where | ||
155 | go p = do e <- lift (next p) | ||
156 | case e of | ||
157 | Left r -> return (return r) | ||
158 | Right (txt, p') -> | ||
159 | if T.null txt | ||
160 | then go p' | ||
161 | else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt | ||
162 | in do yield (B8.pack (T.unpack safe)) | ||
163 | if T.null unsafe | ||
164 | then go p' | ||
165 | else return $ do yield unsafe | ||
166 | p' | ||
167 | |||
168 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | ||
169 | unused 'ByteString' upon hitting an un-ascii byte. | ||
170 | -} | ||
171 | decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
172 | decodeAscii = go where | ||
173 | go p = do e <- lift (next p) | ||
174 | case e of | ||
175 | Left r -> return (return r) | ||
176 | Right (chunk, p') -> | ||
177 | if B.null chunk | ||
178 | then go p' | ||
179 | else let (safe, unsafe) = B.span (<= 0x7F) chunk | ||
180 | in do yield (T.pack (B8.unpack safe)) | ||
181 | if B.null unsafe | ||
182 | then go p' | ||
183 | else return (do yield unsafe | ||
184 | p') | ||
185 | |||
186 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | ||
187 | unused 'ByteString' upon hitting the rare un-latinizable byte. | ||
188 | -} | ||
189 | decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
190 | decodeIso8859_1 = go where | ||
191 | go p = do e <- lift (next p) | ||
192 | case e of | ||
193 | Left r -> return (return r) | ||
194 | Right (chunk, p') -> | ||
195 | if B.null chunk | ||
196 | then go p' | ||
197 | else do let (safe, unsafe) = B.span (<= 0xFF) chunk | ||
198 | yield (T.pack (B8.unpack safe)) | ||
199 | if B.null unsafe | ||
200 | then go p' | ||
201 | else return (do yield unsafe | ||
202 | p') | ||
203 | |||
204 | |||
205 | |||