]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Encoding.hs
Simplified `Codec` type synonym
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
1
2 {-# LANGUAGE RankNTypes, BangPatterns #-}
3 -- |
4
5 -- This module uses the stream decoding functions from the text-stream-decoding package
6 -- to define pipes decoding functions and lenses.
7
8 module Pipes.Text.Encoding
9 ( Codec
10 , utf8
11 , utf8Pure
12 , utf16LE
13 , utf16BE
14 , utf32LE
15 , utf32BE
16 , decodeUtf8
17 , decodeUtf8Pure
18 , decodeUtf16LE
19 , decodeUtf16BE
20 , decodeUtf32LE
21 , decodeUtf32BE
22 , encodeAscii
23 , decodeAscii
24 , encodeIso8859_1
25 , decodeIso8859_1
26 )
27 where
28
29 import Data.Char (ord)
30 import Data.ByteString as B
31 import Data.ByteString (ByteString)
32 import Data.ByteString.Char8 as B8
33 import Data.Text (Text)
34 import qualified Data.Text as T
35 import qualified Data.Text.Encoding as TE
36 import Data.Text.StreamDecoding
37 import Control.Monad (join)
38 import Data.Word (Word8)
39 import Pipes
40
41 type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
42
43 {- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text.
44 They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc.
45 The stream of text they 'see' in a bytestream ends by returning the original byte stream
46 beginning at the point of failure, or the empty bytestream with its return value.
47 -}
48 type Codec
49 = forall m r
50 . Monad m
51 => Lens' (Producer ByteString m r)
52 (Producer Text m (Producer ByteString m r))
53
54 decodeStream :: Monad m
55 => (B.ByteString -> DecodeResult)
56 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
57 decodeStream = loop where
58 loop dec0 p =
59 do x <- lift (next p)
60 case x of Left r -> return (return r)
61 Right (chunk, p') -> case dec0 chunk of
62 DecodeResultSuccess text dec -> do yield text
63 loop dec p'
64 DecodeResultFailure text bs -> do yield text
65 return (do yield bs
66 p')
67 {-# INLINABLE decodeStream#-}
68
69 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
70 decodeUtf8 = decodeStream streamUtf8
71 {-# INLINE decodeUtf8 #-}
72
73 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
74 decodeUtf8Pure = decodeStream streamUtf8Pure
75 {-# INLINE decodeUtf8Pure #-}
76
77 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
78 decodeUtf16LE = decodeStream streamUtf16LE
79 {-# INLINE decodeUtf16LE #-}
80
81 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
82 decodeUtf16BE = decodeStream streamUtf16BE
83 {-# INLINE decodeUtf16BE #-}
84
85 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
86 decodeUtf32LE = decodeStream streamUtf32LE
87 {-# INLINE decodeUtf32LE #-}
88
89 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
90 decodeUtf32BE = decodeStream streamUtf32BE
91 {-# INLINE decodeUtf32BE #-}
92
93 mkCodec :: (forall r m . Monad m =>
94 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
95 -> (Text -> ByteString)
96 -> Codec
97 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
98
99
100 {- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated
101 text stream ends by returning the original bytestream beginning at the point of failure,
102 or the empty bytestring for a well-encoded text.
103 -}
104
105 utf8 :: Codec
106 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
107
108 utf8Pure :: Codec
109 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
110
111 utf16LE :: Codec
112 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
113
114 utf16BE :: Codec
115 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
116
117 utf32LE :: Codec
118 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
119
120 utf32BE :: Codec
121 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
122
123
124 {- | ascii and latin encodings only use a small number of the characters 'Text'
125 recognizes; thus we cannot use the pipes 'Lens' style to work with them.
126 Rather we simply define functions each way.
127
128 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream,
129 returning the rest of the 'Text' at the first non-ascii 'Char'
130 -}
131
132 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
133 encodeAscii = go where
134 go p = do e <- lift (next p)
135 case e of
136 Left r -> return (return r)
137 Right (chunk, p') ->
138 if T.null chunk
139 then go p'
140 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
141 in do yield (B8.pack (T.unpack safe))
142 if T.null unsafe
143 then go p'
144 else return $ do yield unsafe
145 p'
146
147 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
148 returning the rest of the 'Text' upon hitting any non-latin 'Char'
149 -}
150 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
151 encodeIso8859_1 = go where
152 go p = do e <- lift (next p)
153 case e of
154 Left r -> return (return r)
155 Right (txt, p') ->
156 if T.null txt
157 then go p'
158 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
159 in do yield (B8.pack (T.unpack safe))
160 if T.null unsafe
161 then go p'
162 else return $ do yield unsafe
163 p'
164
165 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
166 unused 'ByteString' upon hitting an un-ascii byte.
167 -}
168 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
169 decodeAscii = go where
170 go p = do e <- lift (next p)
171 case e of
172 Left r -> return (return r)
173 Right (chunk, p') ->
174 if B.null chunk
175 then go p'
176 else let (safe, unsafe) = B.span (<= 0x7F) chunk
177 in do yield (T.pack (B8.unpack safe))
178 if B.null unsafe
179 then go p'
180 else return (do yield unsafe
181 p')
182
183 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
184 unused 'ByteString' upon hitting the rare un-latinizable byte.
185 -}
186 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
187 decodeIso8859_1 = go where
188 go p = do e <- lift (next p)
189 case e of
190 Left r -> return (return r)
191 Right (chunk, p') ->
192 if B.null chunk
193 then go p'
194 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
195 yield (T.pack (B8.unpack safe))
196 if B.null unsafe
197 then go p'
198 else return (do yield unsafe
199 p')
200
201
202