diff options
-rw-r--r-- | Pipes/Text.hs | 171 | ||||
-rw-r--r-- | Pipes/Text/Codec.hs | 51 |
2 files changed, 142 insertions, 80 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 9ed0d78..18ec8ec 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs | |||
@@ -123,8 +123,23 @@ module Pipes.Text ( | |||
123 | , group | 123 | , group |
124 | , word | 124 | , word |
125 | , line | 125 | , line |
126 | |||
127 | -- * Decoding Lenses | ||
126 | , decodeUtf8 | 128 | , decodeUtf8 |
127 | , decode | 129 | , codec |
130 | |||
131 | -- * Codecs | ||
132 | , utf8 | ||
133 | , utf16_le | ||
134 | , utf16_be | ||
135 | , utf32_le | ||
136 | , utf32_be | ||
137 | |||
138 | -- * Other Decoding/Encoding Functions | ||
139 | , decodeIso8859_1 | ||
140 | , decodeAscii | ||
141 | , encodeIso8859_1 | ||
142 | , encodeAscii | ||
128 | 143 | ||
129 | -- * FreeT Splitters | 144 | -- * FreeT Splitters |
130 | , chunksOf | 145 | , chunksOf |
@@ -170,6 +185,7 @@ import Data.Text.Lazy.Internal (foldrChunks, defaultChunkSize) | |||
170 | import Data.ByteString.Unsafe (unsafeTake, unsafeDrop) | 185 | import Data.ByteString.Unsafe (unsafeTake, unsafeDrop) |
171 | import Data.ByteString (ByteString) | 186 | import Data.ByteString (ByteString) |
172 | import qualified Data.ByteString as B | 187 | import qualified Data.ByteString as B |
188 | import qualified Data.ByteString.Char8 as B8 | ||
173 | import Data.Char (ord, isSpace) | 189 | import Data.Char (ord, isSpace) |
174 | import Data.Functor.Constant (Constant(Constant, getConstant)) | 190 | import Data.Functor.Constant (Constant(Constant, getConstant)) |
175 | import Data.Functor.Identity (Identity) | 191 | import Data.Functor.Identity (Identity) |
@@ -181,7 +197,7 @@ import qualified GHC.IO.Exception as G | |||
181 | import Pipes | 197 | import Pipes |
182 | import qualified Pipes.ByteString as PB | 198 | import qualified Pipes.ByteString as PB |
183 | import qualified Pipes.Text.Internal as PE | 199 | import qualified Pipes.Text.Internal as PE |
184 | import Pipes.Text.Codec (Codec(..)) | 200 | import Pipes.Text.Codec |
185 | import Pipes.Core (respond, Server') | 201 | import Pipes.Core (respond, Server') |
186 | import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) | 202 | import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) |
187 | import qualified Pipes.Group as PG | 203 | import qualified Pipes.Group as PG |
@@ -1073,43 +1089,114 @@ unwords = intercalate (yield $ T.singleton ' ') | |||
1073 | @Pipes.Parse@ re-exports 'input', 'concat', 'FreeT' (the type) and the 'Parse' synonym. | 1089 | @Pipes.Parse@ re-exports 'input', 'concat', 'FreeT' (the type) and the 'Parse' synonym. |
1074 | -} | 1090 | -} |
1075 | 1091 | ||
1076 | 1092 | codec :: Monad m => Codec -> Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) | |
1077 | 1093 | codec (Codec _ enc dec) k p0 = fmap (\p -> join (for p (yield . fst . enc))) | |
1078 | decode :: Monad m => PE.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 1094 | (k (decoder (dec B.empty) p0) ) where |
1079 | -- decode codec = go B.empty where | 1095 | decoder :: Monad m => PE.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
1080 | -- go extra p0 = | 1096 | decoder !d p0 = case d of |
1081 | -- do x <- lift (next p0) | 1097 | PE.Other txt bad -> do yield txt |
1082 | -- case x of Right (chunk, p) -> | 1098 | return (do yield bad |
1083 | -- do let (text, stuff) = codecDecode codec (B.append extra chunk) | 1099 | p0) |
1084 | -- yield text | 1100 | PE.Some txt extra dec -> do yield txt |
1085 | -- case stuff of Right extra' -> go extra' p | 1101 | x <- lift (next p0) |
1086 | -- Left (exc,bs) -> do yield text | 1102 | case x of Left r -> return (do yield extra |
1087 | -- return (do yield bs | 1103 | return r) |
1088 | -- p) | 1104 | Right (chunk,p1) -> decoder (dec chunk) p1 |
1089 | -- Left r -> return (do yield extra | 1105 | |
1090 | -- return r) | 1106 | -- decodeUtf8 k p0 = fmap (\p -> join (for p (yield . TE.encodeUtf8))) |
1091 | 1107 | -- (k (go B.empty PE.streamDecodeUtf8 p0)) where | |
1092 | decode d p0 = case d of | 1108 | |
1093 | PE.Other txt bad -> do yield txt | 1109 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) |
1094 | return (do yield bad | 1110 | encodeAscii = go where |
1095 | p0) | 1111 | go p = do echunk <- lift (next p) |
1096 | PE.Some txt extra dec -> do yield txt | 1112 | case echunk of |
1097 | x <- lift (next p0) | 1113 | Left r -> return (return r) |
1098 | case x of Left r -> return (do yield extra | 1114 | Right (chunk, p') -> |
1099 | return r) | 1115 | if T.null chunk |
1100 | Right (chunk,p1) -> decode (dec chunk) p1 | 1116 | then go p' |
1101 | 1117 | else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk | |
1102 | -- go !carry dec0 p = do | 1118 | in do yield (B8.pack (T.unpack safe)) |
1103 | -- x <- lift (next p) | 1119 | if T.null unsafe |
1104 | -- case x of Left r -> if B.null carry | 1120 | then go p' |
1105 | -- then return (return r) -- all bytestrinput was consumed | 1121 | else return $ do yield unsafe |
1106 | -- else return (do yield carry -- a potentially valid fragment remains | 1122 | p' |
1107 | -- return r) | 1123 | |
1108 | -- | 1124 | encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) |
1109 | -- Right (chunk, p') -> case dec0 chunk of | 1125 | encodeIso8859_1 = go where |
1110 | -- PE.Some text carry2 dec -> do yield text | 1126 | go p = do etxt <- lift (next p) |
1111 | -- go carry2 dec p' | 1127 | case etxt of |
1112 | -- PE.Other text bs -> do yield text | 1128 | Left r -> return (return r) |
1113 | -- return (do yield bs -- an invalid blob remains | 1129 | Right (txt, p') -> |
1114 | -- p') | 1130 | if T.null txt |
1115 | -- {-# INLINABLE decodeUtf8 #-} \ No newline at end of file | 1131 | then go p' |
1132 | else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt | ||
1133 | in do yield (B8.pack (T.unpack safe)) | ||
1134 | if T.null unsafe | ||
1135 | then go p' | ||
1136 | else return $ do yield unsafe | ||
1137 | p' | ||
1138 | |||
1139 | decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
1140 | decodeAscii = go where | ||
1141 | go p = do echunk <- lift (next p) | ||
1142 | case echunk of | ||
1143 | Left r -> return (return r) | ||
1144 | Right (chunk, p') -> | ||
1145 | if B.null chunk | ||
1146 | then go p' | ||
1147 | else let (safe, unsafe) = B.span (<= 0x7F) chunk | ||
1148 | in do yield (T.pack (B8.unpack safe)) | ||
1149 | if B.null unsafe | ||
1150 | then go p' | ||
1151 | else return $ do yield unsafe | ||
1152 | p' | ||
1153 | |||
1154 | |||
1155 | decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
1156 | decodeIso8859_1 = go where | ||
1157 | go p = do echunk <- lift (next p) | ||
1158 | case echunk of | ||
1159 | Left r -> return (return r) | ||
1160 | Right (chunk, p') -> | ||
1161 | if B.null chunk | ||
1162 | then go p' | ||
1163 | else let (safe, unsafe) = B.span (<= 0xFF) chunk | ||
1164 | in do yield (T.pack (B8.unpack safe)) | ||
1165 | if B.null unsafe | ||
1166 | then go p' | ||
1167 | else return $ do yield unsafe | ||
1168 | p' | ||
1169 | |||
1170 | |||
1171 | |||
1172 | {- | ||
1173 | ascii :: Codec | ||
1174 | ascii = Codec name enc (toDecoding dec) where | ||
1175 | name = T.pack "ASCII" | ||
1176 | enc text = (bytes, extra) where | ||
1177 | (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text | ||
1178 | bytes = B8.pack (T.unpack safe) | ||
1179 | extra = if T.null unsafe | ||
1180 | then Nothing | ||
1181 | else Just (EncodeException ascii (T.head unsafe), unsafe) | ||
1182 | |||
1183 | dec bytes = (text, extra) where | ||
1184 | (safe, unsafe) = B.span (<= 0x7F) bytes | ||
1185 | text = T.pack (B8.unpack safe) | ||
1186 | extra = if B.null unsafe | ||
1187 | then Right B.empty | ||
1188 | else Left (DecodeException ascii (B.head unsafe), unsafe) | ||
1189 | |||
1190 | iso8859_1 :: Codec | ||
1191 | iso8859_1 = Codec name enc (toDecoding dec) where | ||
1192 | name = T.pack "ISO-8859-1" | ||
1193 | enc text = (bytes, extra) where | ||
1194 | (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text | ||
1195 | bytes = B8.pack (T.unpack safe) | ||
1196 | extra = if T.null unsafe | ||
1197 | then Nothing | ||
1198 | else Just (EncodeException iso8859_1 (T.head unsafe), unsafe) | ||
1199 | |||
1200 | dec bytes = (T.pack (B8.unpack bytes), Right B.empty) | ||
1201 | -} | ||
1202 | \ No newline at end of file | ||
diff --git a/Pipes/Text/Codec.hs b/Pipes/Text/Codec.hs index e4357b9..070b0d9 100644 --- a/Pipes/Text/Codec.hs +++ b/Pipes/Text/Codec.hs | |||
@@ -1,11 +1,9 @@ | |||
1 | 1 | ||
2 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | 2 | {-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-} |
3 | -- | | 3 | -- | |
4 | -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin | 4 | -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin |
5 | -- License: MIT | 5 | -- License: MIT |
6 | -- | 6 | -- |
7 | -- Handle streams of text. | ||
8 | -- | ||
9 | -- Parts of this code were taken from enumerator and conduits, and adapted for pipes. | 7 | -- Parts of this code were taken from enumerator and conduits, and adapted for pipes. |
10 | 8 | ||
11 | module Pipes.Text.Codec | 9 | module Pipes.Text.Codec |
@@ -15,6 +13,10 @@ module Pipes.Text.Codec | |||
15 | , Codec(..) | 13 | , Codec(..) |
16 | , TextException(..) | 14 | , TextException(..) |
17 | , utf8 | 15 | , utf8 |
16 | , utf16_le | ||
17 | , utf16_be | ||
18 | , utf32_le | ||
19 | , utf32_be | ||
18 | ) where | 20 | ) where |
19 | 21 | ||
20 | import Data.Bits ((.&.)) | 22 | import Data.Bits ((.&.)) |
@@ -37,7 +39,7 @@ import Data.Typeable | |||
37 | import Control.Arrow (first) | 39 | import Control.Arrow (first) |
38 | import Data.Maybe (catMaybes) | 40 | import Data.Maybe (catMaybes) |
39 | import Pipes.Text.Internal | 41 | import Pipes.Text.Internal |
40 | 42 | import Pipes | |
41 | -- | A specific character encoding. | 43 | -- | A specific character encoding. |
42 | -- | 44 | -- |
43 | -- Since 0.3.0 | 45 | -- Since 0.3.0 |
@@ -62,10 +64,12 @@ instance Exc.Exception TextException | |||
62 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) | 64 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) |
63 | -> (ByteString -> Decoding) | 65 | -> (ByteString -> Decoding) |
64 | toDecoding op = loop B.empty where | 66 | toDecoding op = loop B.empty where |
65 | loop extra bs0 = case op (B.append extra bs0) of | 67 | loop !extra bs0 = case op (B.append extra bs0) of |
66 | (txt, Right bs) -> Some txt bs (loop bs) | 68 | (txt, Right bs) -> Some txt bs (loop bs) |
67 | (txt, Left (_,bs)) -> Other txt bs | 69 | (txt, Left (_,bs)) -> Other txt bs |
68 | 70 | -- To do: toDecoding should be inlined in each of the 'Codec' definitions | |
71 | -- or else Codec changed to the conduit/enumerator definition. We have | ||
72 | -- altered it to use 'streamDecodeUtf8' | ||
69 | 73 | ||
70 | splitSlowly :: (ByteString -> Text) | 74 | splitSlowly :: (ByteString -> Text) |
71 | -> ByteString | 75 | -> ByteString |
@@ -87,8 +91,7 @@ utf8 :: Codec | |||
87 | utf8 = Codec name enc (toDecoding dec) where | 91 | utf8 = Codec name enc (toDecoding dec) where |
88 | name = T.pack "UTF-8" | 92 | name = T.pack "UTF-8" |
89 | enc text = (TE.encodeUtf8 text, Nothing) | 93 | enc text = (TE.encodeUtf8 text, Nothing) |
90 | dec bytes = case decodeSomeUtf8 bytes of | 94 | dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b) |
91 | (t,b) -> (t, Right b) | ||
92 | 95 | ||
93 | -- -- Whether the given byte is a continuation byte. | 96 | -- -- Whether the given byte is a continuation byte. |
94 | -- isContinuation byte = byte .&. 0xC0 == 0x80 | 97 | -- isContinuation byte = byte .&. 0xC0 == 0x80 |
@@ -202,34 +205,6 @@ utf32SplitBytes dec bytes = split where | |||
202 | then (bytes, B.empty) | 205 | then (bytes, B.empty) |
203 | else B.splitAt lenToDecode bytes | 206 | else B.splitAt lenToDecode bytes |
204 | 207 | ||
205 | ascii :: Codec | ||
206 | ascii = Codec name enc (toDecoding dec) where | ||
207 | name = T.pack "ASCII" | ||
208 | enc text = (bytes, extra) where | ||
209 | (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text | ||
210 | bytes = B8.pack (T.unpack safe) | ||
211 | extra = if T.null unsafe | ||
212 | then Nothing | ||
213 | else Just (EncodeException ascii (T.head unsafe), unsafe) | ||
214 | |||
215 | dec bytes = (text, extra) where | ||
216 | (safe, unsafe) = B.span (<= 0x7F) bytes | ||
217 | text = T.pack (B8.unpack safe) | ||
218 | extra = if B.null unsafe | ||
219 | then Right B.empty | ||
220 | else Left (DecodeException ascii (B.head unsafe), unsafe) | ||
221 | |||
222 | iso8859_1 :: Codec | ||
223 | iso8859_1 = Codec name enc (toDecoding dec) where | ||
224 | name = T.pack "ISO-8859-1" | ||
225 | enc text = (bytes, extra) where | ||
226 | (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text | ||
227 | bytes = B8.pack (T.unpack safe) | ||
228 | extra = if T.null unsafe | ||
229 | then Nothing | ||
230 | else Just (EncodeException iso8859_1 (T.head unsafe), unsafe) | ||
231 | |||
232 | dec bytes = (T.pack (B8.unpack bytes), Right B.empty) | ||
233 | 208 | ||
234 | tryEvaluate :: a -> Either Exc.SomeException a | 209 | tryEvaluate :: a -> Either Exc.SomeException a |
235 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate | 210 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate |