1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
|
{-# LANGUAGE RankNTypes, BangPatterns #-}
-- |
-- This module uses the stream decoding functions from the text-stream-decoding package
-- to define pipes decoding functions and lenses.
module Pipes.Text.Encoding
( Codec
, utf8
, utf8Pure
, utf16LE
, utf16BE
, utf32LE
, utf32BE
, decodeUtf8
, decodeUtf8Pure
, decodeUtf16LE
, decodeUtf16BE
, decodeUtf32LE
, decodeUtf32BE
, encodeAscii
, decodeAscii
, encodeIso8859_1
, decodeIso8859_1
)
where
import Data.Char (ord)
import Data.ByteString as B
import Data.ByteString (ByteString)
import Data.ByteString.Char8 as B8
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import Data.Text.StreamDecoding
import Control.Monad (join)
import Data.Word (Word8)
import Pipes
type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
{- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text.
They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc.
The stream of text they 'see' in a bytestream ends by returning the original byte stream
beginning at the point of failure, or the empty bytestream with its return value.
-}
type Codec
= forall m r
. Monad m
=> Lens' (Producer ByteString m r)
(Producer Text m (Producer ByteString m r))
decodeStream :: Monad m
=> (B.ByteString -> DecodeResult)
-> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeStream = loop where
loop dec0 p =
do x <- lift (next p)
case x of Left r -> return (return r)
Right (chunk, p') -> case dec0 chunk of
DecodeResultSuccess text dec -> do yield text
loop dec p'
DecodeResultFailure text bs -> do yield text
return (do yield bs
p')
{-# INLINABLE decodeStream#-}
decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeUtf8 = decodeStream streamUtf8
{-# INLINE decodeUtf8 #-}
decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeUtf8Pure = decodeStream streamUtf8Pure
{-# INLINE decodeUtf8Pure #-}
decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeUtf16LE = decodeStream streamUtf16LE
{-# INLINE decodeUtf16LE #-}
decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeUtf16BE = decodeStream streamUtf16BE
{-# INLINE decodeUtf16BE #-}
decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeUtf32LE = decodeStream streamUtf32LE
{-# INLINE decodeUtf32LE #-}
decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeUtf32BE = decodeStream streamUtf32BE
{-# INLINE decodeUtf32BE #-}
mkCodec :: (forall r m . Monad m =>
Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
-> (Text -> ByteString)
-> Codec
mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
{- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated
text stream ends by returning the original bytestream beginning at the point of failure,
or the empty bytestring for a well-encoded text.
-}
utf8 :: Codec
utf8 = mkCodec decodeUtf8 TE.encodeUtf8
utf8Pure :: Codec
utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
utf16LE :: Codec
utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
utf16BE :: Codec
utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
utf32LE :: Codec
utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
utf32BE :: Codec
utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
{- | ascii and latin encodings only use a small number of the characters 'Text'
recognizes; thus we cannot use the pipes 'Lens' style to work with them.
Rather we simply define functions each way.
'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream,
returning the rest of the 'Text' at the first non-ascii 'Char'
-}
encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
encodeAscii = go where
go p = do e <- lift (next p)
case e of
Left r -> return (return r)
Right (chunk, p') ->
if T.null chunk
then go p'
else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
in do yield (B8.pack (T.unpack safe))
if T.null unsafe
then go p'
else return $ do yield unsafe
p'
{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
returning the rest of the 'Text' upon hitting any non-latin 'Char'
-}
encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
encodeIso8859_1 = go where
go p = do e <- lift (next p)
case e of
Left r -> return (return r)
Right (txt, p') ->
if T.null txt
then go p'
else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
in do yield (B8.pack (T.unpack safe))
if T.null unsafe
then go p'
else return $ do yield unsafe
p'
{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
unused 'ByteString' upon hitting an un-ascii byte.
-}
decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeAscii = go where
go p = do e <- lift (next p)
case e of
Left r -> return (return r)
Right (chunk, p') ->
if B.null chunk
then go p'
else let (safe, unsafe) = B.span (<= 0x7F) chunk
in do yield (T.pack (B8.unpack safe))
if B.null unsafe
then go p'
else return (do yield unsafe
p')
{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
unused 'ByteString' upon hitting the rare un-latinizable byte.
-}
decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeIso8859_1 = go where
go p = do e <- lift (next p)
case e of
Left r -> return (return r)
Right (chunk, p') ->
if B.null chunk
then go p'
else do let (safe, unsafe) = B.span (<= 0xFF) chunk
yield (T.pack (B8.unpack safe))
if B.null unsafe
then go p'
else return (do yield unsafe
p')
|