]> git.immae.eu Git - github/fretlink/text-pipes.git/blame - Pipes/Text/Encoding.hs
improve documentation Pipes.Text.IO
[github/fretlink/text-pipes.git] / Pipes / Text / Encoding.hs
CommitLineData
bbdfd305 1
2{-# LANGUAGE RankNTypes, BangPatterns #-}
3-- |
89d80557 4
bbdfd305 5-- This module uses the stream decoding functions from the text-stream-decoding package
6-- to define pipes decoding functions and lenses.
7
8module Pipes.Text.Encoding
89d80557 9 ( Codec
bbdfd305 10 , utf8
11 , utf8Pure
12 , utf16LE
13 , utf16BE
14 , utf32LE
15 , utf32BE
89d80557 16 , decodeUtf8
17 , decodeUtf8Pure
18 , decodeUtf16LE
19 , decodeUtf16BE
20 , decodeUtf32LE
21 , decodeUtf32BE
bbdfd305 22 , encodeAscii
23 , decodeAscii
24 , encodeIso8859_1
25 , decodeIso8859_1
26 )
27 where
28
29import Data.Char (ord)
30import Data.ByteString as B
31import Data.ByteString (ByteString)
bbdfd305 32import Data.ByteString.Char8 as B8
33import Data.Text (Text)
34import qualified Data.Text as T
35import qualified Data.Text.Encoding as TE
36import Data.Text.StreamDecoding
70125641 37import Control.Monad (join)
89d80557 38import Data.Word (Word8)
bbdfd305 39import Pipes
bbdfd305 40
41
42
43{- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text.
44 They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc.
45 The stream of text they 'see' in a bytestream ends by returning the original byte stream
46 beginning at the point of failure, or the empty bytestream with its return value.
47 -}
48type Codec = forall f m r . (Functor f , Monad m ) =>
49 (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r)))
50 -> Producer ByteString m r -> f (Producer ByteString m r )
51
52decodeStream :: Monad m
53 => (B.ByteString -> DecodeResult)
54 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
55decodeStream = loop where
56 loop dec0 p =
57 do x <- lift (next p)
58 case x of Left r -> return (return r)
59 Right (chunk, p') -> case dec0 chunk of
60 DecodeResultSuccess text dec -> do yield text
61 loop dec p'
62 DecodeResultFailure text bs -> do yield text
63 return (do yield bs
64 p')
65{-# INLINABLE decodeStream#-}
66
67decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
68decodeUtf8 = decodeStream streamUtf8
69{-# INLINE decodeUtf8 #-}
70
71decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
72decodeUtf8Pure = decodeStream streamUtf8Pure
73{-# INLINE decodeUtf8Pure #-}
74
75decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
76decodeUtf16LE = decodeStream streamUtf16LE
77{-# INLINE decodeUtf16LE #-}
78
79decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
80decodeUtf16BE = decodeStream streamUtf16BE
81{-# INLINE decodeUtf16BE #-}
82
83decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
84decodeUtf32LE = decodeStream streamUtf32LE
85{-# INLINE decodeUtf32LE #-}
86
87decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
88decodeUtf32BE = decodeStream streamUtf32BE
89{-# INLINE decodeUtf32BE #-}
90
91mkCodec :: (forall r m . Monad m =>
92 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
93 -> (Text -> ByteString)
94 -> Codec
95mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
96
97
98{- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated
99 text stream ends by returning the original bytestream beginning at the point of failure,
100 or the empty bytestring for a well-encoded text.
101 -}
102
103utf8 :: Codec
104utf8 = mkCodec decodeUtf8 TE.encodeUtf8
105
106utf8Pure :: Codec
107utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
108
109utf16LE :: Codec
110utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
111
112utf16BE :: Codec
113utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
114
115utf32LE :: Codec
116utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
117
118utf32BE :: Codec
119utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
120
121
122{- | ascii and latin encodings only use a small number of the characters 'Text'
123 recognizes; thus we cannot use the pipes 'Lens' style to work with them.
124 Rather we simply define functions each way.
125
126 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream,
127 returning the rest of the 'Text' at the first non-ascii 'Char'
128-}
129
130encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
131encodeAscii = go where
132 go p = do e <- lift (next p)
133 case e of
134 Left r -> return (return r)
135 Right (chunk, p') ->
136 if T.null chunk
137 then go p'
138 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
139 in do yield (B8.pack (T.unpack safe))
140 if T.null unsafe
141 then go p'
142 else return $ do yield unsafe
143 p'
144
145{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
146 returning the rest of the 'Text' upon hitting any non-latin 'Char'
147 -}
148encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
149encodeIso8859_1 = go where
150 go p = do e <- lift (next p)
151 case e of
152 Left r -> return (return r)
153 Right (txt, p') ->
154 if T.null txt
155 then go p'
156 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
157 in do yield (B8.pack (T.unpack safe))
158 if T.null unsafe
159 then go p'
160 else return $ do yield unsafe
161 p'
162
163{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
164 unused 'ByteString' upon hitting an un-ascii byte.
165 -}
166decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
167decodeAscii = go where
168 go p = do e <- lift (next p)
169 case e of
170 Left r -> return (return r)
171 Right (chunk, p') ->
172 if B.null chunk
173 then go p'
174 else let (safe, unsafe) = B.span (<= 0x7F) chunk
175 in do yield (T.pack (B8.unpack safe))
176 if B.null unsafe
177 then go p'
178 else return (do yield unsafe
179 p')
180
181{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
182 unused 'ByteString' upon hitting the rare un-latinizable byte.
183 -}
184decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
185decodeIso8859_1 = go where
186 go p = do e <- lift (next p)
187 case e of
188 Left r -> return (return r)
189 Right (chunk, p') ->
190 if B.null chunk
191 then go p'
192 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
193 yield (T.pack (B8.unpack safe))
194 if B.null unsafe
195 then go p'
196 else return (do yield unsafe
197 p')
198
199
200