diff options
author | michaelt <what_is_it_to_do_anything@yahoo.com> | 2014-02-04 18:29:15 -0500 |
---|---|---|
committer | michaelt <what_is_it_to_do_anything@yahoo.com> | 2014-02-04 18:29:15 -0500 |
commit | 9018941435a48aa5437981dfdb1377aa14b13159 (patch) | |
tree | de9f1a119a8a0ced8cc0ede97012f2ccc5bb3d30 /Pipes/Text | |
parent | 4cbc92cc93073d1a5b99a03ad802f710d0205994 (diff) | |
download | text-pipes-9018941435a48aa5437981dfdb1377aa14b13159.tar.gz text-pipes-9018941435a48aa5437981dfdb1377aa14b13159.tar.zst text-pipes-9018941435a48aa5437981dfdb1377aa14b13159.zip |
finished draft of codec lens, ascii and iso8859 are not codecs, so they are given special functions
Diffstat (limited to 'Pipes/Text')
-rw-r--r-- | Pipes/Text/Codec.hs | 51 |
1 files changed, 13 insertions, 38 deletions
diff --git a/Pipes/Text/Codec.hs b/Pipes/Text/Codec.hs index e4357b9..070b0d9 100644 --- a/Pipes/Text/Codec.hs +++ b/Pipes/Text/Codec.hs | |||
@@ -1,11 +1,9 @@ | |||
1 | 1 | ||
2 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | 2 | {-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-} |
3 | -- | | 3 | -- | |
4 | -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin | 4 | -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin |
5 | -- License: MIT | 5 | -- License: MIT |
6 | -- | 6 | -- |
7 | -- Handle streams of text. | ||
8 | -- | ||
9 | -- Parts of this code were taken from enumerator and conduits, and adapted for pipes. | 7 | -- Parts of this code were taken from enumerator and conduits, and adapted for pipes. |
10 | 8 | ||
11 | module Pipes.Text.Codec | 9 | module Pipes.Text.Codec |
@@ -15,6 +13,10 @@ module Pipes.Text.Codec | |||
15 | , Codec(..) | 13 | , Codec(..) |
16 | , TextException(..) | 14 | , TextException(..) |
17 | , utf8 | 15 | , utf8 |
16 | , utf16_le | ||
17 | , utf16_be | ||
18 | , utf32_le | ||
19 | , utf32_be | ||
18 | ) where | 20 | ) where |
19 | 21 | ||
20 | import Data.Bits ((.&.)) | 22 | import Data.Bits ((.&.)) |
@@ -37,7 +39,7 @@ import Data.Typeable | |||
37 | import Control.Arrow (first) | 39 | import Control.Arrow (first) |
38 | import Data.Maybe (catMaybes) | 40 | import Data.Maybe (catMaybes) |
39 | import Pipes.Text.Internal | 41 | import Pipes.Text.Internal |
40 | 42 | import Pipes | |
41 | -- | A specific character encoding. | 43 | -- | A specific character encoding. |
42 | -- | 44 | -- |
43 | -- Since 0.3.0 | 45 | -- Since 0.3.0 |
@@ -62,10 +64,12 @@ instance Exc.Exception TextException | |||
62 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) | 64 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) |
63 | -> (ByteString -> Decoding) | 65 | -> (ByteString -> Decoding) |
64 | toDecoding op = loop B.empty where | 66 | toDecoding op = loop B.empty where |
65 | loop extra bs0 = case op (B.append extra bs0) of | 67 | loop !extra bs0 = case op (B.append extra bs0) of |
66 | (txt, Right bs) -> Some txt bs (loop bs) | 68 | (txt, Right bs) -> Some txt bs (loop bs) |
67 | (txt, Left (_,bs)) -> Other txt bs | 69 | (txt, Left (_,bs)) -> Other txt bs |
68 | 70 | -- To do: toDecoding should be inlined in each of the 'Codec' definitions | |
71 | -- or else Codec changed to the conduit/enumerator definition. We have | ||
72 | -- altered it to use 'streamDecodeUtf8' | ||
69 | 73 | ||
70 | splitSlowly :: (ByteString -> Text) | 74 | splitSlowly :: (ByteString -> Text) |
71 | -> ByteString | 75 | -> ByteString |
@@ -87,8 +91,7 @@ utf8 :: Codec | |||
87 | utf8 = Codec name enc (toDecoding dec) where | 91 | utf8 = Codec name enc (toDecoding dec) where |
88 | name = T.pack "UTF-8" | 92 | name = T.pack "UTF-8" |
89 | enc text = (TE.encodeUtf8 text, Nothing) | 93 | enc text = (TE.encodeUtf8 text, Nothing) |
90 | dec bytes = case decodeSomeUtf8 bytes of | 94 | dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b) |
91 | (t,b) -> (t, Right b) | ||
92 | 95 | ||
93 | -- -- Whether the given byte is a continuation byte. | 96 | -- -- Whether the given byte is a continuation byte. |
94 | -- isContinuation byte = byte .&. 0xC0 == 0x80 | 97 | -- isContinuation byte = byte .&. 0xC0 == 0x80 |
@@ -202,34 +205,6 @@ utf32SplitBytes dec bytes = split where | |||
202 | then (bytes, B.empty) | 205 | then (bytes, B.empty) |
203 | else B.splitAt lenToDecode bytes | 206 | else B.splitAt lenToDecode bytes |
204 | 207 | ||
205 | ascii :: Codec | ||
206 | ascii = Codec name enc (toDecoding dec) where | ||
207 | name = T.pack "ASCII" | ||
208 | enc text = (bytes, extra) where | ||
209 | (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text | ||
210 | bytes = B8.pack (T.unpack safe) | ||
211 | extra = if T.null unsafe | ||
212 | then Nothing | ||
213 | else Just (EncodeException ascii (T.head unsafe), unsafe) | ||
214 | |||
215 | dec bytes = (text, extra) where | ||
216 | (safe, unsafe) = B.span (<= 0x7F) bytes | ||
217 | text = T.pack (B8.unpack safe) | ||
218 | extra = if B.null unsafe | ||
219 | then Right B.empty | ||
220 | else Left (DecodeException ascii (B.head unsafe), unsafe) | ||
221 | |||
222 | iso8859_1 :: Codec | ||
223 | iso8859_1 = Codec name enc (toDecoding dec) where | ||
224 | name = T.pack "ISO-8859-1" | ||
225 | enc text = (bytes, extra) where | ||
226 | (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text | ||
227 | bytes = B8.pack (T.unpack safe) | ||
228 | extra = if T.null unsafe | ||
229 | then Nothing | ||
230 | else Just (EncodeException iso8859_1 (T.head unsafe), unsafe) | ||
231 | |||
232 | dec bytes = (T.pack (B8.unpack bytes), Right B.empty) | ||
233 | 208 | ||
234 | tryEvaluate :: a -> Either Exc.SomeException a | 209 | tryEvaluate :: a -> Either Exc.SomeException a |
235 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate | 210 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate |