]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Encoding.hs
use new text-stream-decoding
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
1
2 {-# LANGUAGE RankNTypes, BangPatterns #-}
3 -- |
4 -- Copyright: 2014 Michael Thompson
5 --
6 -- This module uses the stream decoding functions from the text-stream-decoding package
7 -- to define pipes decoding functions and lenses.
8
9 module Pipes.Text.Encoding
10 ( DecodeResult (..)
11 , Codec
12 , decodeUtf8
13 , decodeUtf8Pure
14 , decodeUtf16LE
15 , decodeUtf16BE
16 , decodeUtf32LE
17 , decodeUtf32BE
18 , utf8
19 , utf8Pure
20 , utf16LE
21 , utf16BE
22 , utf32LE
23 , utf32BE
24 , encodeAscii
25 , decodeAscii
26 , encodeIso8859_1
27 , decodeIso8859_1
28 )
29 where
30
31 import Data.Char (ord)
32 import Data.ByteString as B
33 import Data.ByteString (ByteString)
34 import Data.ByteString.Internal as B
35 import Data.ByteString.Char8 as B8
36 import Data.Text (Text)
37 import qualified Data.Text as T
38 import qualified Data.Text.Encoding as TE
39 import Data.Text.StreamDecoding
40 import GHC.Word (Word8, Word32)
41 import Data.Word (Word8, Word16)
42 import Control.Monad
43 import Pipes
44 import Pipes.Core
45
46
47
48 {- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text.
49 They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc.
50 The stream of text they 'see' in a bytestream ends by returning the original byte stream
51 beginning at the point of failure, or the empty bytestream with its return value.
52 -}
53 type Codec = forall f m r . (Functor f , Monad m ) =>
54 (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r)))
55 -> Producer ByteString m r -> f (Producer ByteString m r )
56
57 decodeStream :: Monad m
58 => (B.ByteString -> DecodeResult)
59 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
60 decodeStream = loop where
61 loop dec0 p =
62 do x <- lift (next p)
63 case x of Left r -> return (return r)
64 Right (chunk, p') -> case dec0 chunk of
65 DecodeResultSuccess text dec -> do yield text
66 loop dec p'
67 DecodeResultFailure text bs -> do yield text
68 return (do yield bs
69 p')
70 {-# INLINABLE decodeStream#-}
71
72 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
73 decodeUtf8 = decodeStream streamUtf8
74 {-# INLINE decodeUtf8 #-}
75
76 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
77 decodeUtf8Pure = decodeStream streamUtf8Pure
78 {-# INLINE decodeUtf8Pure #-}
79
80 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
81 decodeUtf16LE = decodeStream streamUtf16LE
82 {-# INLINE decodeUtf16LE #-}
83
84 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
85 decodeUtf16BE = decodeStream streamUtf16BE
86 {-# INLINE decodeUtf16BE #-}
87
88 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
89 decodeUtf32LE = decodeStream streamUtf32LE
90 {-# INLINE decodeUtf32LE #-}
91
92 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
93 decodeUtf32BE = decodeStream streamUtf32BE
94 {-# INLINE decodeUtf32BE #-}
95
96 mkCodec :: (forall r m . Monad m =>
97 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
98 -> (Text -> ByteString)
99 -> Codec
100 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
101
102
103 {- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated
104 text stream ends by returning the original bytestream beginning at the point of failure,
105 or the empty bytestring for a well-encoded text.
106 -}
107
108 utf8 :: Codec
109 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
110
111 utf8Pure :: Codec
112 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
113
114 utf16LE :: Codec
115 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
116
117 utf16BE :: Codec
118 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
119
120 utf32LE :: Codec
121 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
122
123 utf32BE :: Codec
124 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
125
126
127 {- | ascii and latin encodings only use a small number of the characters 'Text'
128 recognizes; thus we cannot use the pipes 'Lens' style to work with them.
129 Rather we simply define functions each way.
130
131 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream,
132 returning the rest of the 'Text' at the first non-ascii 'Char'
133 -}
134
135 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
136 encodeAscii = go where
137 go p = do e <- lift (next p)
138 case e of
139 Left r -> return (return r)
140 Right (chunk, p') ->
141 if T.null chunk
142 then go p'
143 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
144 in do yield (B8.pack (T.unpack safe))
145 if T.null unsafe
146 then go p'
147 else return $ do yield unsafe
148 p'
149
150 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
151 returning the rest of the 'Text' upon hitting any non-latin 'Char'
152 -}
153 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
154 encodeIso8859_1 = go where
155 go p = do e <- lift (next p)
156 case e of
157 Left r -> return (return r)
158 Right (txt, p') ->
159 if T.null txt
160 then go p'
161 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
162 in do yield (B8.pack (T.unpack safe))
163 if T.null unsafe
164 then go p'
165 else return $ do yield unsafe
166 p'
167
168 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
169 unused 'ByteString' upon hitting an un-ascii byte.
170 -}
171 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
172 decodeAscii = go where
173 go p = do e <- lift (next p)
174 case e of
175 Left r -> return (return r)
176 Right (chunk, p') ->
177 if B.null chunk
178 then go p'
179 else let (safe, unsafe) = B.span (<= 0x7F) chunk
180 in do yield (T.pack (B8.unpack safe))
181 if B.null unsafe
182 then go p'
183 else return (do yield unsafe
184 p')
185
186 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
187 unused 'ByteString' upon hitting the rare un-latinizable byte.
188 -}
189 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
190 decodeIso8859_1 = go where
191 go p = do e <- lift (next p)
192 case e of
193 Left r -> return (return r)
194 Right (chunk, p') ->
195 if B.null chunk
196 then go p'
197 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
198 yield (T.pack (B8.unpack safe))
199 if B.null unsafe
200 then go p'
201 else return (do yield unsafe
202 p')
203
204
205