2 {-# LANGUAGE RankNTypes, BangPatterns #-}
4 -- Copyright: 2014 Michael Thompson
6 -- This module uses the stream decoding functions from the text-stream-decoding package
7 -- to define pipes decoding functions and lenses.
9 module Pipes.Text.Encoding
31 import Data.Char (ord)
32 import Data.ByteString as B
33 import Data.ByteString (ByteString)
34 import Data.ByteString.Internal as B
35 import Data.ByteString.Char8 as B8
36 import Data.Text (Text)
37 import qualified Data.Text as T
38 import qualified Data.Text.Encoding as TE
39 import Data.Text.StreamDecoding
40 import GHC.Word (Word8, Word32)
41 import Data.Word (Word8, Word16)
48 {- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text.
49 They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc.
50 The stream of text they 'see' in a bytestream ends by returning the original byte stream
51 beginning at the point of failure, or the empty bytestream with its return value.
53 type Codec = forall f m r . (Functor f , Monad m ) =>
54 (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r)))
55 -> Producer ByteString m r -> f (Producer ByteString m r )
57 decodeStream :: Monad m
58 => (B.ByteString -> DecodeResult)
59 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
60 decodeStream = loop where
63 case x of Left r -> return (return r)
64 Right (chunk, p') -> case dec0 chunk of
65 DecodeResultSuccess text dec -> do yield text
67 DecodeResultFailure text bs -> do yield text
70 {-# INLINABLE decodeStream#-}
72 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
73 decodeUtf8 = decodeStream streamUtf8
74 {-# INLINE decodeUtf8 #-}
76 decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
77 decodeUtf8Pure = decodeStream streamUtf8Pure
78 {-# INLINE decodeUtf8Pure #-}
80 decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
81 decodeUtf16LE = decodeStream streamUtf16LE
82 {-# INLINE decodeUtf16LE #-}
84 decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
85 decodeUtf16BE = decodeStream streamUtf16BE
86 {-# INLINE decodeUtf16BE #-}
88 decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
89 decodeUtf32LE = decodeStream streamUtf32LE
90 {-# INLINE decodeUtf32LE #-}
92 decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
93 decodeUtf32BE = decodeStream streamUtf32BE
94 {-# INLINE decodeUtf32BE #-}
96 mkCodec :: (forall r m . Monad m =>
97 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
98 -> (Text -> ByteString)
100 mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
103 {- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated
104 text stream ends by returning the original bytestream beginning at the point of failure,
105 or the empty bytestring for a well-encoded text.
109 utf8 = mkCodec decodeUtf8 TE.encodeUtf8
112 utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
115 utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
118 utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
121 utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
124 utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
127 {- | ascii and latin encodings only use a small number of the characters 'Text'
128 recognizes; thus we cannot use the pipes 'Lens' style to work with them.
129 Rather we simply define functions each way.
131 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream,
132 returning the rest of the 'Text' at the first non-ascii 'Char'
135 encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
136 encodeAscii = go where
137 go p = do e <- lift (next p)
139 Left r -> return (return r)
143 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
144 in do yield (B8.pack (T.unpack safe))
147 else return $ do yield unsafe
150 {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
151 returning the rest of the 'Text' upon hitting any non-latin 'Char'
153 encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
154 encodeIso8859_1 = go where
155 go p = do e <- lift (next p)
157 Left r -> return (return r)
161 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
162 in do yield (B8.pack (T.unpack safe))
165 else return $ do yield unsafe
168 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
169 unused 'ByteString' upon hitting an un-ascii byte.
171 decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
172 decodeAscii = go where
173 go p = do e <- lift (next p)
175 Left r -> return (return r)
179 else let (safe, unsafe) = B.span (<= 0x7F) chunk
180 in do yield (T.pack (B8.unpack safe))
183 else return (do yield unsafe
186 {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
187 unused 'ByteString' upon hitting the rare un-latinizable byte.
189 decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
190 decodeIso8859_1 = go where
191 go p = do e <- lift (next p)
193 Left r -> return (return r)
197 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
198 yield (T.pack (B8.unpack safe))
201 else return (do yield unsafe