]>
Commit | Line | Data |
---|---|---|
bbdfd305 | 1 | |
2 | {-# LANGUAGE RankNTypes, BangPatterns #-} | |
3 | -- | | |
4 | -- Copyright: 2014 Michael Thompson | |
5 | -- | |
6 | -- This module uses the stream decoding functions from the text-stream-decoding package | |
7 | -- to define pipes decoding functions and lenses. | |
8 | ||
9 | module Pipes.Text.Encoding | |
10 | ( DecodeResult (..) | |
11 | , Codec | |
12 | , decodeUtf8 | |
13 | , decodeUtf8Pure | |
14 | , decodeUtf16LE | |
15 | , decodeUtf16BE | |
16 | , decodeUtf32LE | |
17 | , decodeUtf32BE | |
18 | , utf8 | |
19 | , utf8Pure | |
20 | , utf16LE | |
21 | , utf16BE | |
22 | , utf32LE | |
23 | , utf32BE | |
24 | , encodeAscii | |
25 | , decodeAscii | |
26 | , encodeIso8859_1 | |
27 | , decodeIso8859_1 | |
28 | ) | |
29 | where | |
30 | ||
31 | import Data.Char (ord) | |
32 | import Data.ByteString as B | |
33 | import Data.ByteString (ByteString) | |
bbdfd305 | 34 | import Data.ByteString.Char8 as B8 |
35 | import Data.Text (Text) | |
36 | import qualified Data.Text as T | |
37 | import qualified Data.Text.Encoding as TE | |
38 | import Data.Text.StreamDecoding | |
70125641 | 39 | import Control.Monad (join) |
bbdfd305 | 40 | import Pipes |
bbdfd305 | 41 | |
42 | ||
43 | ||
44 | {- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text. | |
45 | They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc. | |
46 | The stream of text they 'see' in a bytestream ends by returning the original byte stream | |
47 | beginning at the point of failure, or the empty bytestream with its return value. | |
48 | -} | |
49 | type Codec = forall f m r . (Functor f , Monad m ) => | |
50 | (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r))) | |
51 | -> Producer ByteString m r -> f (Producer ByteString m r ) | |
52 | ||
53 | decodeStream :: Monad m | |
54 | => (B.ByteString -> DecodeResult) | |
55 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
56 | decodeStream = loop where | |
57 | loop dec0 p = | |
58 | do x <- lift (next p) | |
59 | case x of Left r -> return (return r) | |
60 | Right (chunk, p') -> case dec0 chunk of | |
61 | DecodeResultSuccess text dec -> do yield text | |
62 | loop dec p' | |
63 | DecodeResultFailure text bs -> do yield text | |
64 | return (do yield bs | |
65 | p') | |
66 | {-# INLINABLE decodeStream#-} | |
67 | ||
68 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
69 | decodeUtf8 = decodeStream streamUtf8 | |
70 | {-# INLINE decodeUtf8 #-} | |
71 | ||
72 | decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
73 | decodeUtf8Pure = decodeStream streamUtf8Pure | |
74 | {-# INLINE decodeUtf8Pure #-} | |
75 | ||
76 | decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
77 | decodeUtf16LE = decodeStream streamUtf16LE | |
78 | {-# INLINE decodeUtf16LE #-} | |
79 | ||
80 | decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
81 | decodeUtf16BE = decodeStream streamUtf16BE | |
82 | {-# INLINE decodeUtf16BE #-} | |
83 | ||
84 | decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
85 | decodeUtf32LE = decodeStream streamUtf32LE | |
86 | {-# INLINE decodeUtf32LE #-} | |
87 | ||
88 | decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
89 | decodeUtf32BE = decodeStream streamUtf32BE | |
90 | {-# INLINE decodeUtf32BE #-} | |
91 | ||
92 | mkCodec :: (forall r m . Monad m => | |
93 | Producer ByteString m r -> Producer Text m (Producer ByteString m r )) | |
94 | -> (Text -> ByteString) | |
95 | -> Codec | |
96 | mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) | |
97 | ||
98 | ||
99 | {- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated | |
100 | text stream ends by returning the original bytestream beginning at the point of failure, | |
101 | or the empty bytestring for a well-encoded text. | |
102 | -} | |
103 | ||
104 | utf8 :: Codec | |
105 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 | |
106 | ||
107 | utf8Pure :: Codec | |
108 | utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 | |
109 | ||
110 | utf16LE :: Codec | |
111 | utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE | |
112 | ||
113 | utf16BE :: Codec | |
114 | utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE | |
115 | ||
116 | utf32LE :: Codec | |
117 | utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE | |
118 | ||
119 | utf32BE :: Codec | |
120 | utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE | |
121 | ||
122 | ||
123 | {- | ascii and latin encodings only use a small number of the characters 'Text' | |
124 | recognizes; thus we cannot use the pipes 'Lens' style to work with them. | |
125 | Rather we simply define functions each way. | |
126 | ||
127 | 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream, | |
128 | returning the rest of the 'Text' at the first non-ascii 'Char' | |
129 | -} | |
130 | ||
131 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | |
132 | encodeAscii = go where | |
133 | go p = do e <- lift (next p) | |
134 | case e of | |
135 | Left r -> return (return r) | |
136 | Right (chunk, p') -> | |
137 | if T.null chunk | |
138 | then go p' | |
139 | else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk | |
140 | in do yield (B8.pack (T.unpack safe)) | |
141 | if T.null unsafe | |
142 | then go p' | |
143 | else return $ do yield unsafe | |
144 | p' | |
145 | ||
146 | {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream, | |
147 | returning the rest of the 'Text' upon hitting any non-latin 'Char' | |
148 | -} | |
149 | encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | |
150 | encodeIso8859_1 = go where | |
151 | go p = do e <- lift (next p) | |
152 | case e of | |
153 | Left r -> return (return r) | |
154 | Right (txt, p') -> | |
155 | if T.null txt | |
156 | then go p' | |
157 | else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt | |
158 | in do yield (B8.pack (T.unpack safe)) | |
159 | if T.null unsafe | |
160 | then go p' | |
161 | else return $ do yield unsafe | |
162 | p' | |
163 | ||
164 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | |
165 | unused 'ByteString' upon hitting an un-ascii byte. | |
166 | -} | |
167 | decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
168 | decodeAscii = go where | |
169 | go p = do e <- lift (next p) | |
170 | case e of | |
171 | Left r -> return (return r) | |
172 | Right (chunk, p') -> | |
173 | if B.null chunk | |
174 | then go p' | |
175 | else let (safe, unsafe) = B.span (<= 0x7F) chunk | |
176 | in do yield (T.pack (B8.unpack safe)) | |
177 | if B.null unsafe | |
178 | then go p' | |
179 | else return (do yield unsafe | |
180 | p') | |
181 | ||
182 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | |
183 | unused 'ByteString' upon hitting the rare un-latinizable byte. | |
184 | -} | |
185 | decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
186 | decodeIso8859_1 = go where | |
187 | go p = do e <- lift (next p) | |
188 | case e of | |
189 | Left r -> return (return r) | |
190 | Right (chunk, p') -> | |
191 | if B.null chunk | |
192 | then go p' | |
193 | else do let (safe, unsafe) = B.span (<= 0xFF) chunk | |
194 | yield (T.pack (B8.unpack safe)) | |
195 | if B.null unsafe | |
196 | then go p' | |
197 | else return (do yield unsafe | |
198 | p') | |
199 | ||
200 | ||
201 |