aboutsummaryrefslogtreecommitdiffhomepage
path: root/Pipes/Text/Encoding.hs
diff options
context:
space:
mode:
Diffstat (limited to 'Pipes/Text/Encoding.hs')
-rw-r--r--Pipes/Text/Encoding.hs205
1 files changed, 205 insertions, 0 deletions
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs
new file mode 100644
index 0000000..2bb5807
--- /dev/null
+++ b/Pipes/Text/Encoding.hs
@@ -0,0 +1,205 @@
1
2{-# LANGUAGE RankNTypes, BangPatterns #-}
3-- |
4-- Copyright: 2014 Michael Thompson
5--
6-- This module uses the stream decoding functions from the text-stream-decoding package
7-- to define pipes decoding functions and lenses.
8
9module Pipes.Text.Encoding
10 ( DecodeResult (..)
11 , Codec
12 , decodeUtf8
13 , decodeUtf8Pure
14 , decodeUtf16LE
15 , decodeUtf16BE
16 , decodeUtf32LE
17 , decodeUtf32BE
18 , utf8
19 , utf8Pure
20 , utf16LE
21 , utf16BE
22 , utf32LE
23 , utf32BE
24 , encodeAscii
25 , decodeAscii
26 , encodeIso8859_1
27 , decodeIso8859_1
28 )
29 where
30
31import Data.Char (ord)
32import Data.ByteString as B
33import Data.ByteString (ByteString)
34import Data.ByteString.Internal as B
35import Data.ByteString.Char8 as B8
36import Data.Text (Text)
37import qualified Data.Text as T
38import qualified Data.Text.Encoding as TE
39import Data.Text.StreamDecoding
40import GHC.Word (Word8, Word32)
41import Data.Word (Word8, Word16)
42import Control.Monad
43import Pipes
44import Pipes.Core
45
46
47
48{- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text.
49 They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc.
50 The stream of text they 'see' in a bytestream ends by returning the original byte stream
51 beginning at the point of failure, or the empty bytestream with its return value.
52 -}
53type Codec = forall f m r . (Functor f , Monad m ) =>
54 (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r)))
55 -> Producer ByteString m r -> f (Producer ByteString m r )
56
57decodeStream :: Monad m
58 => (B.ByteString -> DecodeResult)
59 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
60decodeStream = loop where
61 loop dec0 p =
62 do x <- lift (next p)
63 case x of Left r -> return (return r)
64 Right (chunk, p') -> case dec0 chunk of
65 DecodeResultSuccess text dec -> do yield text
66 loop dec p'
67 DecodeResultFailure text bs -> do yield text
68 return (do yield bs
69 p')
70{-# INLINABLE decodeStream#-}
71
72decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
73decodeUtf8 = decodeStream streamUtf8
74{-# INLINE decodeUtf8 #-}
75
76decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
77decodeUtf8Pure = decodeStream streamUtf8Pure
78{-# INLINE decodeUtf8Pure #-}
79
80decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
81decodeUtf16LE = decodeStream streamUtf16LE
82{-# INLINE decodeUtf16LE #-}
83
84decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
85decodeUtf16BE = decodeStream streamUtf16BE
86{-# INLINE decodeUtf16BE #-}
87
88decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
89decodeUtf32LE = decodeStream streamUtf32LE
90{-# INLINE decodeUtf32LE #-}
91
92decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
93decodeUtf32BE = decodeStream streamUtf32BE
94{-# INLINE decodeUtf32BE #-}
95
96mkCodec :: (forall r m . Monad m =>
97 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
98 -> (Text -> ByteString)
99 -> Codec
100mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
101
102
103{- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated
104 text stream ends by returning the original bytestream beginning at the point of failure,
105 or the empty bytestring for a well-encoded text.
106 -}
107
108utf8 :: Codec
109utf8 = mkCodec decodeUtf8 TE.encodeUtf8
110
111utf8Pure :: Codec
112utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
113
114utf16LE :: Codec
115utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
116
117utf16BE :: Codec
118utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
119
120utf32LE :: Codec
121utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
122
123utf32BE :: Codec
124utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
125
126
127{- | ascii and latin encodings only use a small number of the characters 'Text'
128 recognizes; thus we cannot use the pipes 'Lens' style to work with them.
129 Rather we simply define functions each way.
130
131 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream,
132 returning the rest of the 'Text' at the first non-ascii 'Char'
133-}
134
135encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
136encodeAscii = go where
137 go p = do e <- lift (next p)
138 case e of
139 Left r -> return (return r)
140 Right (chunk, p') ->
141 if T.null chunk
142 then go p'
143 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
144 in do yield (B8.pack (T.unpack safe))
145 if T.null unsafe
146 then go p'
147 else return $ do yield unsafe
148 p'
149
150{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
151 returning the rest of the 'Text' upon hitting any non-latin 'Char'
152 -}
153encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
154encodeIso8859_1 = go where
155 go p = do e <- lift (next p)
156 case e of
157 Left r -> return (return r)
158 Right (txt, p') ->
159 if T.null txt
160 then go p'
161 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
162 in do yield (B8.pack (T.unpack safe))
163 if T.null unsafe
164 then go p'
165 else return $ do yield unsafe
166 p'
167
168{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
169 unused 'ByteString' upon hitting an un-ascii byte.
170 -}
171decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
172decodeAscii = go where
173 go p = do e <- lift (next p)
174 case e of
175 Left r -> return (return r)
176 Right (chunk, p') ->
177 if B.null chunk
178 then go p'
179 else let (safe, unsafe) = B.span (<= 0x7F) chunk
180 in do yield (T.pack (B8.unpack safe))
181 if B.null unsafe
182 then go p'
183 else return (do yield unsafe
184 p')
185
186{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
187 unused 'ByteString' upon hitting the rare un-latinizable byte.
188 -}
189decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
190decodeIso8859_1 = go where
191 go p = do e <- lift (next p)
192 case e of
193 Left r -> return (return r)
194 Right (chunk, p') ->
195 if B.null chunk
196 then go p'
197 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
198 yield (T.pack (B8.unpack safe))
199 if B.null unsafe
200 then go p'
201 else return (do yield unsafe
202 p')
203
204
205