]>
Commit | Line | Data |
---|---|---|
bbdfd305 | 1 | |
2 | {-# LANGUAGE RankNTypes, BangPatterns #-} | |
3 | -- | | |
89d80557 | 4 | |
bbdfd305 | 5 | -- This module uses the stream decoding functions from the text-stream-decoding package |
6 | -- to define pipes decoding functions and lenses. | |
7 | ||
8 | module Pipes.Text.Encoding | |
89d80557 | 9 | ( Codec |
bbdfd305 | 10 | , utf8 |
11 | , utf8Pure | |
12 | , utf16LE | |
13 | , utf16BE | |
14 | , utf32LE | |
15 | , utf32BE | |
89d80557 | 16 | , decodeUtf8 |
17 | , decodeUtf8Pure | |
18 | , decodeUtf16LE | |
19 | , decodeUtf16BE | |
20 | , decodeUtf32LE | |
21 | , decodeUtf32BE | |
bbdfd305 | 22 | , encodeAscii |
23 | , decodeAscii | |
24 | , encodeIso8859_1 | |
25 | , decodeIso8859_1 | |
26 | ) | |
27 | where | |
28 | ||
29 | import Data.Char (ord) | |
30 | import Data.ByteString as B | |
31 | import Data.ByteString (ByteString) | |
bbdfd305 | 32 | import Data.ByteString.Char8 as B8 |
33 | import Data.Text (Text) | |
34 | import qualified Data.Text as T | |
35 | import qualified Data.Text.Encoding as TE | |
36 | import Data.Text.StreamDecoding | |
70125641 | 37 | import Control.Monad (join) |
89d80557 | 38 | import Data.Word (Word8) |
bbdfd305 | 39 | import Pipes |
bbdfd305 | 40 | |
41 | ||
42 | ||
43 | {- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text. | |
44 | They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc. | |
45 | The stream of text they 'see' in a bytestream ends by returning the original byte stream | |
46 | beginning at the point of failure, or the empty bytestream with its return value. | |
47 | -} | |
48 | type Codec = forall f m r . (Functor f , Monad m ) => | |
49 | (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r))) | |
50 | -> Producer ByteString m r -> f (Producer ByteString m r ) | |
51 | ||
52 | decodeStream :: Monad m | |
53 | => (B.ByteString -> DecodeResult) | |
54 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
55 | decodeStream = loop where | |
56 | loop dec0 p = | |
57 | do x <- lift (next p) | |
58 | case x of Left r -> return (return r) | |
59 | Right (chunk, p') -> case dec0 chunk of | |
60 | DecodeResultSuccess text dec -> do yield text | |
61 | loop dec p' | |
62 | DecodeResultFailure text bs -> do yield text | |
63 | return (do yield bs | |
64 | p') | |
65 | {-# INLINABLE decodeStream#-} | |
66 | ||
67 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
68 | decodeUtf8 = decodeStream streamUtf8 | |
69 | {-# INLINE decodeUtf8 #-} | |
70 | ||
71 | decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
72 | decodeUtf8Pure = decodeStream streamUtf8Pure | |
73 | {-# INLINE decodeUtf8Pure #-} | |
74 | ||
75 | decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
76 | decodeUtf16LE = decodeStream streamUtf16LE | |
77 | {-# INLINE decodeUtf16LE #-} | |
78 | ||
79 | decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
80 | decodeUtf16BE = decodeStream streamUtf16BE | |
81 | {-# INLINE decodeUtf16BE #-} | |
82 | ||
83 | decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
84 | decodeUtf32LE = decodeStream streamUtf32LE | |
85 | {-# INLINE decodeUtf32LE #-} | |
86 | ||
87 | decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
88 | decodeUtf32BE = decodeStream streamUtf32BE | |
89 | {-# INLINE decodeUtf32BE #-} | |
90 | ||
91 | mkCodec :: (forall r m . Monad m => | |
92 | Producer ByteString m r -> Producer Text m (Producer ByteString m r )) | |
93 | -> (Text -> ByteString) | |
94 | -> Codec | |
95 | mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) | |
96 | ||
97 | ||
98 | {- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated | |
99 | text stream ends by returning the original bytestream beginning at the point of failure, | |
100 | or the empty bytestring for a well-encoded text. | |
101 | -} | |
102 | ||
103 | utf8 :: Codec | |
104 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 | |
105 | ||
106 | utf8Pure :: Codec | |
107 | utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 | |
108 | ||
109 | utf16LE :: Codec | |
110 | utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE | |
111 | ||
112 | utf16BE :: Codec | |
113 | utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE | |
114 | ||
115 | utf32LE :: Codec | |
116 | utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE | |
117 | ||
118 | utf32BE :: Codec | |
119 | utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE | |
120 | ||
121 | ||
122 | {- | ascii and latin encodings only use a small number of the characters 'Text' | |
123 | recognizes; thus we cannot use the pipes 'Lens' style to work with them. | |
124 | Rather we simply define functions each way. | |
125 | ||
126 | 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream, | |
127 | returning the rest of the 'Text' at the first non-ascii 'Char' | |
128 | -} | |
129 | ||
130 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | |
131 | encodeAscii = go where | |
132 | go p = do e <- lift (next p) | |
133 | case e of | |
134 | Left r -> return (return r) | |
135 | Right (chunk, p') -> | |
136 | if T.null chunk | |
137 | then go p' | |
138 | else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk | |
139 | in do yield (B8.pack (T.unpack safe)) | |
140 | if T.null unsafe | |
141 | then go p' | |
142 | else return $ do yield unsafe | |
143 | p' | |
144 | ||
145 | {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream, | |
146 | returning the rest of the 'Text' upon hitting any non-latin 'Char' | |
147 | -} | |
148 | encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | |
149 | encodeIso8859_1 = go where | |
150 | go p = do e <- lift (next p) | |
151 | case e of | |
152 | Left r -> return (return r) | |
153 | Right (txt, p') -> | |
154 | if T.null txt | |
155 | then go p' | |
156 | else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt | |
157 | in do yield (B8.pack (T.unpack safe)) | |
158 | if T.null unsafe | |
159 | then go p' | |
160 | else return $ do yield unsafe | |
161 | p' | |
162 | ||
163 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | |
164 | unused 'ByteString' upon hitting an un-ascii byte. | |
165 | -} | |
166 | decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
167 | decodeAscii = go where | |
168 | go p = do e <- lift (next p) | |
169 | case e of | |
170 | Left r -> return (return r) | |
171 | Right (chunk, p') -> | |
172 | if B.null chunk | |
173 | then go p' | |
174 | else let (safe, unsafe) = B.span (<= 0x7F) chunk | |
175 | in do yield (T.pack (B8.unpack safe)) | |
176 | if B.null unsafe | |
177 | then go p' | |
178 | else return (do yield unsafe | |
179 | p') | |
180 | ||
181 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | |
182 | unused 'ByteString' upon hitting the rare un-latinizable byte. | |
183 | -} | |
184 | decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | |
185 | decodeIso8859_1 = go where | |
186 | go p = do e <- lift (next p) | |
187 | case e of | |
188 | Left r -> return (return r) | |
189 | Right (chunk, p') -> | |
190 | if B.null chunk | |
191 | then go p' | |
192 | else do let (safe, unsafe) = B.span (<= 0xFF) chunk | |
193 | yield (T.pack (B8.unpack safe)) | |
194 | if B.null unsafe | |
195 | then go p' | |
196 | else return (do yield unsafe | |
197 | p') | |
198 | ||
199 | ||
200 |