diff options
-rw-r--r-- | Pipes/Text.hs | 29 | ||||
-rw-r--r-- | Pipes/Text/Codec.hs | 240 | ||||
-rw-r--r-- | Pipes/Text/Internal.hs | 212 | ||||
-rw-r--r-- | pipes-text.cabal | 2 |
4 files changed, 255 insertions, 228 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 71b1316..9ed0d78 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs | |||
@@ -181,7 +181,7 @@ import qualified GHC.IO.Exception as G | |||
181 | import Pipes | 181 | import Pipes |
182 | import qualified Pipes.ByteString as PB | 182 | import qualified Pipes.ByteString as PB |
183 | import qualified Pipes.Text.Internal as PE | 183 | import qualified Pipes.Text.Internal as PE |
184 | import Pipes.Text.Internal (Codec(..)) | 184 | import Pipes.Text.Codec (Codec(..)) |
185 | import Pipes.Core (respond, Server') | 185 | import Pipes.Core (respond, Server') |
186 | import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) | 186 | import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) |
187 | import qualified Pipes.Group as PG | 187 | import qualified Pipes.Group as PG |
@@ -243,8 +243,8 @@ stdin = fromHandle IO.stdin | |||
243 | fromHandle :: MonadIO m => IO.Handle -> Producer Text m () | 243 | fromHandle :: MonadIO m => IO.Handle -> Producer Text m () |
244 | fromHandle h = go where | 244 | fromHandle h = go where |
245 | go = do txt <- liftIO (T.hGetChunk h) | 245 | go = do txt <- liftIO (T.hGetChunk h) |
246 | unless (T.null txt) $ do yield txt | 246 | unless (T.null txt) ( do yield txt |
247 | go | 247 | go ) |
248 | {-# INLINABLE fromHandle#-} | 248 | {-# INLINABLE fromHandle#-} |
249 | 249 | ||
250 | 250 | ||
@@ -258,7 +258,9 @@ readFile :: MonadSafe m => FilePath -> Producer Text m () | |||
258 | readFile file = Safe.withFile file IO.ReadMode fromHandle | 258 | readFile file = Safe.withFile file IO.ReadMode fromHandle |
259 | {-# INLINE readFile #-} | 259 | {-# INLINE readFile #-} |
260 | 260 | ||
261 | {-| Stream lines of text from stdin (for testing in ghci etc.) | 261 | {-| Crudely stream lines of input from stdin in the style of Pipes.Prelude. |
262 | This is for testing in ghci etc.; obviously it will be unsound if used to recieve | ||
263 | the contents of immense files with few newlines. | ||
262 | 264 | ||
263 | >>> let safely = runSafeT . runEffect | 265 | >>> let safely = runSafeT . runEffect |
264 | >>> safely $ for Text.stdinLn (lift . lift . print . T.length) | 266 | >>> safely $ for Text.stdinLn (lift . lift . print . T.length) |
@@ -282,8 +284,8 @@ stdinLn = go where | |||
282 | 284 | ||
283 | Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe. | 285 | Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe. |
284 | 286 | ||
285 | Note: For best performance, use @(for source (liftIO . putStr))@ instead of | 287 | Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@ |
286 | @(source >-> stdout)@ in suitable cases. | 288 | instead of @(source >-> stdout)@ . |
287 | -} | 289 | -} |
288 | stdout :: MonadIO m => Consumer' Text m () | 290 | stdout :: MonadIO m => Consumer' Text m () |
289 | stdout = go | 291 | stdout = go |
@@ -704,11 +706,8 @@ isEndOfChars = do | |||
704 | {-# INLINABLE isEndOfChars #-} | 706 | {-# INLINABLE isEndOfChars #-} |
705 | 707 | ||
706 | 708 | ||
707 | 709 | -- | An improper lens into a stream of 'ByteString' expected to be UTF-8 encoded; the associated | |
708 | 710 | -- stream of Text ends by returning a stream of ByteStrings beginning at the point of failure. | |
709 | |||
710 | -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded into a Pipe of Text | ||
711 | -- returning a Pipe of ByteStrings that begins at the point of failure. | ||
712 | 711 | ||
713 | decodeUtf8 :: Monad m => Lens' (Producer ByteString m r) | 712 | decodeUtf8 :: Monad m => Lens' (Producer ByteString m r) |
714 | (Producer Text m (Producer ByteString m r)) | 713 | (Producer Text m (Producer ByteString m r)) |
@@ -716,10 +715,10 @@ decodeUtf8 k p0 = fmap (\p -> join (for p (yield . TE.encodeUtf8))) | |||
716 | (k (go B.empty PE.streamDecodeUtf8 p0)) where | 715 | (k (go B.empty PE.streamDecodeUtf8 p0)) where |
717 | go !carry dec0 p = do | 716 | go !carry dec0 p = do |
718 | x <- lift (next p) | 717 | x <- lift (next p) |
719 | case x of Left r -> if B.null carry | 718 | case x of Left r -> return (if B.null carry |
720 | then return (return r) -- all bytestrinput was consumed | 719 | then return r -- all bytestring input was consumed |
721 | else return (do yield carry -- a potentially valid fragment remains | 720 | else (do yield carry -- a potentially valid fragment remains |
722 | return r) | 721 | return r)) |
723 | 722 | ||
724 | Right (chunk, p') -> case dec0 chunk of | 723 | Right (chunk, p') -> case dec0 chunk of |
725 | PE.Some text carry2 dec -> do yield text | 724 | PE.Some text carry2 dec -> do yield text |
diff --git a/Pipes/Text/Codec.hs b/Pipes/Text/Codec.hs new file mode 100644 index 0000000..e4357b9 --- /dev/null +++ b/Pipes/Text/Codec.hs | |||
@@ -0,0 +1,240 @@ | |||
1 | |||
2 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | ||
3 | -- | | ||
4 | -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin | ||
5 | -- License: MIT | ||
6 | -- | ||
7 | -- Handle streams of text. | ||
8 | -- | ||
9 | -- Parts of this code were taken from enumerator and conduits, and adapted for pipes. | ||
10 | |||
11 | module Pipes.Text.Codec | ||
12 | ( Decoding(..) | ||
13 | , streamDecodeUtf8 | ||
14 | , decodeSomeUtf8 | ||
15 | , Codec(..) | ||
16 | , TextException(..) | ||
17 | , utf8 | ||
18 | ) where | ||
19 | |||
20 | import Data.Bits ((.&.)) | ||
21 | import Data.Char (ord) | ||
22 | import Data.ByteString as B | ||
23 | import Data.ByteString (ByteString) | ||
24 | import Data.ByteString.Internal as B | ||
25 | import Data.ByteString.Char8 as B8 | ||
26 | import Data.Text (Text) | ||
27 | import qualified Data.Text as T | ||
28 | import qualified Data.Text.Encoding as TE | ||
29 | import Data.Text.Encoding.Error () | ||
30 | import GHC.Word (Word8, Word32) | ||
31 | import qualified Data.Text.Array as A | ||
32 | import Data.Word (Word8, Word16) | ||
33 | import System.IO.Unsafe (unsafePerformIO) | ||
34 | import qualified Control.Exception as Exc | ||
35 | import Data.Bits ((.&.), (.|.), shiftL) | ||
36 | import Data.Typeable | ||
37 | import Control.Arrow (first) | ||
38 | import Data.Maybe (catMaybes) | ||
39 | import Pipes.Text.Internal | ||
40 | |||
41 | -- | A specific character encoding. | ||
42 | -- | ||
43 | -- Since 0.3.0 | ||
44 | data Codec = Codec | ||
45 | { codecName :: Text | ||
46 | , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) | ||
47 | , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString) | ||
48 | } | ||
49 | |||
50 | instance Show Codec where | ||
51 | showsPrec d c = showParen (d > 10) $ | ||
52 | showString "Codec " . shows (codecName c) | ||
53 | |||
54 | data TextException = DecodeException Codec Word8 | ||
55 | | EncodeException Codec Char | ||
56 | | LengthExceeded Int | ||
57 | | TextException Exc.SomeException | ||
58 | deriving (Show, Typeable) | ||
59 | instance Exc.Exception TextException | ||
60 | |||
61 | |||
62 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) | ||
63 | -> (ByteString -> Decoding) | ||
64 | toDecoding op = loop B.empty where | ||
65 | loop extra bs0 = case op (B.append extra bs0) of | ||
66 | (txt, Right bs) -> Some txt bs (loop bs) | ||
67 | (txt, Left (_,bs)) -> Other txt bs | ||
68 | |||
69 | |||
70 | splitSlowly :: (ByteString -> Text) | ||
71 | -> ByteString | ||
72 | -> (Text, Either (TextException, ByteString) ByteString) | ||
73 | splitSlowly dec bytes = valid where | ||
74 | valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) | ||
75 | splits 0 = [(B.empty, bytes)] | ||
76 | splits n = B.splitAt n bytes : splits (n - 1) | ||
77 | decFirst (a, b) = case tryEvaluate (dec a) of | ||
78 | Left _ -> Nothing | ||
79 | Right text -> let trouble = case tryEvaluate (dec b) of | ||
80 | Left exc -> Left (TextException exc, b) | ||
81 | Right _ -> Right B.empty | ||
82 | in Just (text, trouble) -- this case shouldn't occur, | ||
83 | -- since splitSlowly is only called | ||
84 | -- when parsing failed somewhere | ||
85 | |||
86 | utf8 :: Codec | ||
87 | utf8 = Codec name enc (toDecoding dec) where | ||
88 | name = T.pack "UTF-8" | ||
89 | enc text = (TE.encodeUtf8 text, Nothing) | ||
90 | dec bytes = case decodeSomeUtf8 bytes of | ||
91 | (t,b) -> (t, Right b) | ||
92 | |||
93 | -- -- Whether the given byte is a continuation byte. | ||
94 | -- isContinuation byte = byte .&. 0xC0 == 0x80 | ||
95 | -- | ||
96 | -- -- The number of continuation bytes needed by the given | ||
97 | -- -- non-continuation byte. Returns -1 for an illegal UTF-8 | ||
98 | -- -- non-continuation byte and the whole split quickly must fail so | ||
99 | -- -- as the input is passed to TE.decodeUtf8, which will issue a | ||
100 | -- -- suitable error. | ||
101 | -- required x0 | ||
102 | -- | x0 .&. 0x80 == 0x00 = 0 | ||
103 | -- | x0 .&. 0xE0 == 0xC0 = 1 | ||
104 | -- | x0 .&. 0xF0 == 0xE0 = 2 | ||
105 | -- | x0 .&. 0xF8 == 0xF0 = 3 | ||
106 | -- | otherwise = -1 | ||
107 | -- | ||
108 | -- splitQuickly bytes | ||
109 | -- | B.null l || req == -1 = Nothing | ||
110 | -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) | ||
111 | -- | otherwise = Just (TE.decodeUtf8 l', r') | ||
112 | -- where | ||
113 | -- (l, r) = B.spanEnd isContinuation bytes | ||
114 | -- req = required (B.last l) | ||
115 | -- l' = B.init l | ||
116 | -- r' = B.cons (B.last l) r | ||
117 | |||
118 | |||
119 | utf16_le :: Codec | ||
120 | utf16_le = Codec name enc (toDecoding dec) where | ||
121 | name = T.pack "UTF-16-LE" | ||
122 | enc text = (TE.encodeUtf16LE text, Nothing) | ||
123 | dec bytes = case splitQuickly bytes of | ||
124 | Just (text, extra) -> (text, Right extra) | ||
125 | Nothing -> splitSlowly TE.decodeUtf16LE bytes | ||
126 | |||
127 | splitQuickly bytes = maybeDecode (loop 0) where | ||
128 | maxN = B.length bytes | ||
129 | |||
130 | loop n | n == maxN = decodeAll | ||
131 | | (n + 1) == maxN = decodeTo n | ||
132 | loop n = let | ||
133 | req = utf16Required | ||
134 | (B.index bytes n) | ||
135 | (B.index bytes (n + 1)) | ||
136 | decodeMore = loop $! n + req | ||
137 | in if n + req > maxN | ||
138 | then decodeTo n | ||
139 | else decodeMore | ||
140 | |||
141 | decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) | ||
142 | decodeAll = (TE.decodeUtf16LE bytes, B.empty) | ||
143 | |||
144 | utf16_be :: Codec | ||
145 | utf16_be = Codec name enc (toDecoding dec) where | ||
146 | name = T.pack "UTF-16-BE" | ||
147 | enc text = (TE.encodeUtf16BE text, Nothing) | ||
148 | dec bytes = case splitQuickly bytes of | ||
149 | Just (text, extra) -> (text, Right extra) | ||
150 | Nothing -> splitSlowly TE.decodeUtf16BE bytes | ||
151 | |||
152 | splitQuickly bytes = maybeDecode (loop 0) where | ||
153 | maxN = B.length bytes | ||
154 | |||
155 | loop n | n == maxN = decodeAll | ||
156 | | (n + 1) == maxN = decodeTo n | ||
157 | loop n = let | ||
158 | req = utf16Required | ||
159 | (B.index bytes (n + 1)) | ||
160 | (B.index bytes n) | ||
161 | decodeMore = loop $! n + req | ||
162 | in if n + req > maxN | ||
163 | then decodeTo n | ||
164 | else decodeMore | ||
165 | |||
166 | decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) | ||
167 | decodeAll = (TE.decodeUtf16BE bytes, B.empty) | ||
168 | |||
169 | utf16Required :: Word8 -> Word8 -> Int | ||
170 | utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where | ||
171 | x :: Word16 | ||
172 | x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 | ||
173 | |||
174 | |||
175 | utf32_le :: Codec | ||
176 | utf32_le = Codec name enc (toDecoding dec) where | ||
177 | name = T.pack "UTF-32-LE" | ||
178 | enc text = (TE.encodeUtf32LE text, Nothing) | ||
179 | dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of | ||
180 | Just (text, extra) -> (text, Right extra) | ||
181 | Nothing -> splitSlowly TE.decodeUtf32LE bs | ||
182 | |||
183 | |||
184 | utf32_be :: Codec | ||
185 | utf32_be = Codec name enc (toDecoding dec) where | ||
186 | name = T.pack "UTF-32-BE" | ||
187 | enc text = (TE.encodeUtf32BE text, Nothing) | ||
188 | dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of | ||
189 | Just (text, extra) -> (text, Right extra) | ||
190 | Nothing -> splitSlowly TE.decodeUtf32BE bs | ||
191 | |||
192 | utf32SplitBytes :: (ByteString -> Text) | ||
193 | -> ByteString | ||
194 | -> Maybe (Text, ByteString) | ||
195 | utf32SplitBytes dec bytes = split where | ||
196 | split = maybeDecode (dec toDecode, extra) | ||
197 | len = B.length bytes | ||
198 | lenExtra = mod len 4 | ||
199 | |||
200 | lenToDecode = len - lenExtra | ||
201 | (toDecode, extra) = if lenExtra == 0 | ||
202 | then (bytes, B.empty) | ||
203 | else B.splitAt lenToDecode bytes | ||
204 | |||
205 | ascii :: Codec | ||
206 | ascii = Codec name enc (toDecoding dec) where | ||
207 | name = T.pack "ASCII" | ||
208 | enc text = (bytes, extra) where | ||
209 | (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text | ||
210 | bytes = B8.pack (T.unpack safe) | ||
211 | extra = if T.null unsafe | ||
212 | then Nothing | ||
213 | else Just (EncodeException ascii (T.head unsafe), unsafe) | ||
214 | |||
215 | dec bytes = (text, extra) where | ||
216 | (safe, unsafe) = B.span (<= 0x7F) bytes | ||
217 | text = T.pack (B8.unpack safe) | ||
218 | extra = if B.null unsafe | ||
219 | then Right B.empty | ||
220 | else Left (DecodeException ascii (B.head unsafe), unsafe) | ||
221 | |||
222 | iso8859_1 :: Codec | ||
223 | iso8859_1 = Codec name enc (toDecoding dec) where | ||
224 | name = T.pack "ISO-8859-1" | ||
225 | enc text = (bytes, extra) where | ||
226 | (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text | ||
227 | bytes = B8.pack (T.unpack safe) | ||
228 | extra = if T.null unsafe | ||
229 | then Nothing | ||
230 | else Just (EncodeException iso8859_1 (T.head unsafe), unsafe) | ||
231 | |||
232 | dec bytes = (T.pack (B8.unpack bytes), Right B.empty) | ||
233 | |||
234 | tryEvaluate :: a -> Either Exc.SomeException a | ||
235 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate | ||
236 | |||
237 | maybeDecode :: (a, b) -> Maybe (a, b) | ||
238 | maybeDecode (a, b) = case tryEvaluate a of | ||
239 | Left _ -> Nothing | ||
240 | Right _ -> Just (a, b) | ||
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index 76c2f4f..bcee278 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs | |||
@@ -9,9 +9,6 @@ module Pipes.Text.Internal | |||
9 | ( Decoding(..) | 9 | ( Decoding(..) |
10 | , streamDecodeUtf8 | 10 | , streamDecodeUtf8 |
11 | , decodeSomeUtf8 | 11 | , decodeSomeUtf8 |
12 | , Codec(..) | ||
13 | , TextException(..) | ||
14 | , utf8 | ||
15 | ) where | 12 | ) where |
16 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | 13 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) |
17 | import Control.Monad.ST (ST, runST) | 14 | import Control.Monad.ST (ST, runST) |
@@ -43,215 +40,6 @@ import Data.Maybe (catMaybes) | |||
43 | #include "pipes_text_cbits.h" | 40 | #include "pipes_text_cbits.h" |
44 | 41 | ||
45 | 42 | ||
46 | -- | A specific character encoding. | ||
47 | -- | ||
48 | -- Since 0.3.0 | ||
49 | data Codec = Codec | ||
50 | { codecName :: Text | ||
51 | , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) | ||
52 | , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString) | ||
53 | } | ||
54 | |||
55 | instance Show Codec where | ||
56 | showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c) | ||
57 | |||
58 | -- Since 0.3.0 | ||
59 | data TextException = DecodeException Codec Word8 | ||
60 | | EncodeException Codec Char | ||
61 | | LengthExceeded Int | ||
62 | | TextException Exc.SomeException | ||
63 | deriving (Show, Typeable) | ||
64 | instance Exc.Exception TextException | ||
65 | |||
66 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) | ||
67 | -> (ByteString -> Decoding) | ||
68 | toDecoding op = loop B.empty where | ||
69 | loop extra bs0 = case op (B.append extra bs0) of | ||
70 | (txt, Right bs) -> Some txt bs (loop bs) | ||
71 | (txt, Left (_,bs)) -> Other txt bs | ||
72 | |||
73 | |||
74 | splitSlowly :: (ByteString -> Text) | ||
75 | -> ByteString | ||
76 | -> (Text, Either (TextException, ByteString) ByteString) | ||
77 | splitSlowly dec bytes = valid where | ||
78 | valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) | ||
79 | splits 0 = [(B.empty, bytes)] | ||
80 | splits n = B.splitAt n bytes : splits (n - 1) | ||
81 | decFirst (a, b) = case tryEvaluate (dec a) of | ||
82 | Left _ -> Nothing | ||
83 | Right text -> let trouble = case tryEvaluate (dec b) of | ||
84 | Left exc -> Left (TextException exc, b) | ||
85 | Right _ -> Right B.empty | ||
86 | in Just (text, trouble) | ||
87 | -- this case shouldn't occur, | ||
88 | -- since splitSlowly is only called | ||
89 | -- when parsing failed somewhere | ||
90 | |||
91 | utf8 :: Codec | ||
92 | utf8 = Codec name enc (toDecoding dec) where | ||
93 | name = T.pack "UTF-8" | ||
94 | enc text = (TE.encodeUtf8 text, Nothing) | ||
95 | dec bytes = case decodeSomeUtf8 bytes of | ||
96 | (t,b) -> (t, Right b) | ||
97 | |||
98 | -- -- Whether the given byte is a continuation byte. | ||
99 | -- isContinuation byte = byte .&. 0xC0 == 0x80 | ||
100 | -- | ||
101 | -- -- The number of continuation bytes needed by the given | ||
102 | -- -- non-continuation byte. Returns -1 for an illegal UTF-8 | ||
103 | -- -- non-continuation byte and the whole split quickly must fail so | ||
104 | -- -- as the input is passed to TE.decodeUtf8, which will issue a | ||
105 | -- -- suitable error. | ||
106 | -- required x0 | ||
107 | -- | x0 .&. 0x80 == 0x00 = 0 | ||
108 | -- | x0 .&. 0xE0 == 0xC0 = 1 | ||
109 | -- | x0 .&. 0xF0 == 0xE0 = 2 | ||
110 | -- | x0 .&. 0xF8 == 0xF0 = 3 | ||
111 | -- | otherwise = -1 | ||
112 | -- | ||
113 | -- splitQuickly bytes | ||
114 | -- | B.null l || req == -1 = Nothing | ||
115 | -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) | ||
116 | -- | otherwise = Just (TE.decodeUtf8 l', r') | ||
117 | -- where | ||
118 | -- (l, r) = B.spanEnd isContinuation bytes | ||
119 | -- req = required (B.last l) | ||
120 | -- l' = B.init l | ||
121 | -- r' = B.cons (B.last l) r | ||
122 | |||
123 | -- | | ||
124 | -- Since 0.3.0 | ||
125 | utf16_le :: Codec | ||
126 | utf16_le = Codec name enc (toDecoding dec) where | ||
127 | name = T.pack "UTF-16-LE" | ||
128 | enc text = (TE.encodeUtf16LE text, Nothing) | ||
129 | dec bytes = case splitQuickly bytes of | ||
130 | Just (text, extra) -> (text, Right extra) | ||
131 | Nothing -> splitSlowly TE.decodeUtf16LE bytes | ||
132 | |||
133 | splitQuickly bytes = maybeDecode (loop 0) where | ||
134 | maxN = B.length bytes | ||
135 | |||
136 | loop n | n == maxN = decodeAll | ||
137 | | (n + 1) == maxN = decodeTo n | ||
138 | loop n = let | ||
139 | req = utf16Required | ||
140 | (B.index bytes n) | ||
141 | (B.index bytes (n + 1)) | ||
142 | decodeMore = loop $! n + req | ||
143 | in if n + req > maxN | ||
144 | then decodeTo n | ||
145 | else decodeMore | ||
146 | |||
147 | decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) | ||
148 | decodeAll = (TE.decodeUtf16LE bytes, B.empty) | ||
149 | |||
150 | -- | | ||
151 | -- Since 0.3.0 | ||
152 | utf16_be :: Codec | ||
153 | utf16_be = Codec name enc (toDecoding dec) where | ||
154 | name = T.pack "UTF-16-BE" | ||
155 | enc text = (TE.encodeUtf16BE text, Nothing) | ||
156 | dec bytes = case splitQuickly bytes of | ||
157 | Just (text, extra) -> (text, Right extra) | ||
158 | Nothing -> splitSlowly TE.decodeUtf16BE bytes | ||
159 | |||
160 | splitQuickly bytes = maybeDecode (loop 0) where | ||
161 | maxN = B.length bytes | ||
162 | |||
163 | loop n | n == maxN = decodeAll | ||
164 | | (n + 1) == maxN = decodeTo n | ||
165 | loop n = let | ||
166 | req = utf16Required | ||
167 | (B.index bytes (n + 1)) | ||
168 | (B.index bytes n) | ||
169 | decodeMore = loop $! n + req | ||
170 | in if n + req > maxN | ||
171 | then decodeTo n | ||
172 | else decodeMore | ||
173 | |||
174 | decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) | ||
175 | decodeAll = (TE.decodeUtf16BE bytes, B.empty) | ||
176 | |||
177 | utf16Required :: Word8 -> Word8 -> Int | ||
178 | utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where | ||
179 | x :: Word16 | ||
180 | x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 | ||
181 | |||
182 | -- | | ||
183 | -- Since 0.3.0 | ||
184 | utf32_le :: Codec | ||
185 | utf32_le = Codec name enc (toDecoding dec) where | ||
186 | name = T.pack "UTF-32-LE" | ||
187 | enc text = (TE.encodeUtf32LE text, Nothing) | ||
188 | dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of | ||
189 | Just (text, extra) -> (text, Right extra) | ||
190 | Nothing -> splitSlowly TE.decodeUtf32LE bs | ||
191 | |||
192 | -- | | ||
193 | -- Since 0.3.0 | ||
194 | utf32_be :: Codec | ||
195 | utf32_be = Codec name enc (toDecoding dec) where | ||
196 | name = T.pack "UTF-32-BE" | ||
197 | enc text = (TE.encodeUtf32BE text, Nothing) | ||
198 | dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of | ||
199 | Just (text, extra) -> (text, Right extra) | ||
200 | Nothing -> splitSlowly TE.decodeUtf32BE bs | ||
201 | |||
202 | utf32SplitBytes :: (ByteString -> Text) | ||
203 | -> ByteString | ||
204 | -> Maybe (Text, ByteString) | ||
205 | utf32SplitBytes dec bytes = split where | ||
206 | split = maybeDecode (dec toDecode, extra) | ||
207 | len = B.length bytes | ||
208 | lenExtra = mod len 4 | ||
209 | |||
210 | lenToDecode = len - lenExtra | ||
211 | (toDecode, extra) = if lenExtra == 0 | ||
212 | then (bytes, B.empty) | ||
213 | else B.splitAt lenToDecode bytes | ||
214 | |||
215 | -- | | ||
216 | -- Since 0.3.0 | ||
217 | ascii :: Codec | ||
218 | ascii = Codec name enc (toDecoding dec) where | ||
219 | name = T.pack "ASCII" | ||
220 | enc text = (bytes, extra) where | ||
221 | (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text | ||
222 | bytes = B8.pack (T.unpack safe) | ||
223 | extra = if T.null unsafe | ||
224 | then Nothing | ||
225 | else Just (EncodeException ascii (T.head unsafe), unsafe) | ||
226 | |||
227 | dec bytes = (text, extra) where | ||
228 | (safe, unsafe) = B.span (<= 0x7F) bytes | ||
229 | text = T.pack (B8.unpack safe) | ||
230 | extra = if B.null unsafe | ||
231 | then Right B.empty | ||
232 | else Left (DecodeException ascii (B.head unsafe), unsafe) | ||
233 | |||
234 | -- | | ||
235 | -- Since 0.3.0 | ||
236 | iso8859_1 :: Codec | ||
237 | iso8859_1 = Codec name enc (toDecoding dec) where | ||
238 | name = T.pack "ISO-8859-1" | ||
239 | enc text = (bytes, extra) where | ||
240 | (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text | ||
241 | bytes = B8.pack (T.unpack safe) | ||
242 | extra = if T.null unsafe | ||
243 | then Nothing | ||
244 | else Just (EncodeException iso8859_1 (T.head unsafe), unsafe) | ||
245 | |||
246 | dec bytes = (T.pack (B8.unpack bytes), Right B.empty) | ||
247 | |||
248 | tryEvaluate :: a -> Either Exc.SomeException a | ||
249 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate | ||
250 | |||
251 | maybeDecode :: (a, b) -> Maybe (a, b) | ||
252 | maybeDecode (a, b) = case tryEvaluate a of | ||
253 | Left _ -> Nothing | ||
254 | Right _ -> Just (a, b) | ||
255 | 43 | ||
256 | -- | A stream oriented decoding result. | 44 | -- | A stream oriented decoding result. |
257 | data Decoding = Some Text ByteString (ByteString -> Decoding) | 45 | data Decoding = Some Text ByteString (ByteString -> Decoding) |
diff --git a/pipes-text.cabal b/pipes-text.cabal index 33cbab4..44bc551 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal | |||
@@ -14,7 +14,7 @@ cabal-version: >=1.10 | |||
14 | library | 14 | library |
15 | c-sources: cbits/cbits.c | 15 | c-sources: cbits/cbits.c |
16 | include-dirs: include | 16 | include-dirs: include |
17 | exposed-modules: Pipes.Text, Pipes.Text.Internal | 17 | exposed-modules: Pipes.Text, Pipes.Text.Internal, Pipes.Text.Codec |
18 | -- other-modules: | 18 | -- other-modules: |
19 | other-extensions: RankNTypes | 19 | other-extensions: RankNTypes |
20 | build-depends: base >= 4 && < 5 , | 20 | build-depends: base >= 4 && < 5 , |