]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Encoding.hs
simplify imports
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
1
2 {-# LANGUAGE RankNTypes, BangPatterns #-}
3 -- |
4 -- Copyright: 2014 Michael Thompson
5 --
6 -- This module uses the stream decoding functions from the text-stream-decoding package
7 -- to define pipes decoding functions and lenses.
8
9 module Pipes.Text.Encoding
10 ( DecodeResult (..)
11 , Codec
12 , decodeUtf8
13 , decodeUtf8Pure
14 , decodeUtf16LE
15 , decodeUtf16BE
16 , decodeUtf32LE
17 , decodeUtf32BE
18 , utf8
19 , utf8Pure
20 , utf16LE
21 , utf16BE
22 , utf32LE
23 , utf32BE
24 , encodeAscii
25 , decodeAscii
26 , encodeIso8859_1
27 , decodeIso8859_1
28 )
29 where
30
31 import Data.Char (ord)
32 import Data.ByteString as B
33 import Data.ByteString (ByteString)
34 import Data.ByteString.Char8 as B8
35 import Data.Text (Text)
36 import qualified Data.Text as T
37 import qualified Data.Text.Encoding as TE
38 import Data.Text.StreamDecoding
39 import Control.Monad (join)
40 import Pipes
41
42
43
44 {- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text.
45 They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc.
46 The stream of text they 'see' in a bytestream ends by returning the original byte stream
47 beginning at the point of failure, or the empty bytestream with its return value.
48 -}
49 type Codec = forall f m r . (Functor f , Monad m ) =>
50 (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r)))
51 -> Producer ByteString m r -> f (Producer ByteString m r )
52
53 decodeStream :: Monad m
54 => (B.ByteString -> DecodeResult)
55 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
56 decodeStream = loop where
57 loop dec0 p =
58 do x <- lift (next p)
59 case x of Left r -> return (return r)
60 Right (chunk, p') -> case dec0 chunk of
61 DecodeResultSuccess text dec -> do yield text
62 loop dec p'
63 DecodeResultFailure text bs -> do yield text
64 return (do yield bs
65 p')
66 {-# INLINABLE decodeStream#-}
67
68 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
69 decodeUtf8 = decodeStream streamUtf8
70 {-# INLINE decodeUtf8 #-}
71
72 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
73 decodeUtf8Pure = decodeStream streamUtf8Pure
74 {-# INLINE decodeUtf8Pure #-}
75
76 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
77 decodeUtf16LE = decodeStream streamUtf16LE
78 {-# INLINE decodeUtf16LE #-}
79
80 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
81 decodeUtf16BE = decodeStream streamUtf16BE
82 {-# INLINE decodeUtf16BE #-}
83
84 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
85 decodeUtf32LE = decodeStream streamUtf32LE
86 {-# INLINE decodeUtf32LE #-}
87
88 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
89 decodeUtf32BE = decodeStream streamUtf32BE
90 {-# INLINE decodeUtf32BE #-}
91
92 mkCodec :: (forall r m . Monad m =>
93 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
94 -> (Text -> ByteString)
95 -> Codec
96 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
97
98
99 {- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated
100 text stream ends by returning the original bytestream beginning at the point of failure,
101 or the empty bytestring for a well-encoded text.
102 -}
103
104 utf8 :: Codec
105 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
106
107 utf8Pure :: Codec
108 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
109
110 utf16LE :: Codec
111 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
112
113 utf16BE :: Codec
114 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
115
116 utf32LE :: Codec
117 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
118
119 utf32BE :: Codec
120 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
121
122
123 {- | ascii and latin encodings only use a small number of the characters 'Text'
124 recognizes; thus we cannot use the pipes 'Lens' style to work with them.
125 Rather we simply define functions each way.
126
127 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream,
128 returning the rest of the 'Text' at the first non-ascii 'Char'
129 -}
130
131 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
132 encodeAscii = go where
133 go p = do e <- lift (next p)
134 case e of
135 Left r -> return (return r)
136 Right (chunk, p') ->
137 if T.null chunk
138 then go p'
139 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
140 in do yield (B8.pack (T.unpack safe))
141 if T.null unsafe
142 then go p'
143 else return $ do yield unsafe
144 p'
145
146 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
147 returning the rest of the 'Text' upon hitting any non-latin 'Char'
148 -}
149 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
150 encodeIso8859_1 = go where
151 go p = do e <- lift (next p)
152 case e of
153 Left r -> return (return r)
154 Right (txt, p') ->
155 if T.null txt
156 then go p'
157 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
158 in do yield (B8.pack (T.unpack safe))
159 if T.null unsafe
160 then go p'
161 else return $ do yield unsafe
162 p'
163
164 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
165 unused 'ByteString' upon hitting an un-ascii byte.
166 -}
167 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
168 decodeAscii = go where
169 go p = do e <- lift (next p)
170 case e of
171 Left r -> return (return r)
172 Right (chunk, p') ->
173 if B.null chunk
174 then go p'
175 else let (safe, unsafe) = B.span (<= 0x7F) chunk
176 in do yield (T.pack (B8.unpack safe))
177 if B.null unsafe
178 then go p'
179 else return (do yield unsafe
180 p')
181
182 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
183 unused 'ByteString' upon hitting the rare un-latinizable byte.
184 -}
185 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
186 decodeIso8859_1 = go where
187 go p = do e <- lift (next p)
188 case e of
189 Left r -> return (return r)
190 Right (chunk, p') ->
191 if B.null chunk
192 then go p'
193 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
194 yield (T.pack (B8.unpack safe))
195 if B.null unsafe
196 then go p'
197 else return (do yield unsafe
198 p')
199
200
201