diff options
Diffstat (limited to 'Pipes/Text/Internal.hs')
-rw-r--r-- | Pipes/Text/Internal.hs | 264 |
1 files changed, 230 insertions, 34 deletions
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index 7e5b044..76c2f4f 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs | |||
@@ -1,5 +1,7 @@ | |||
1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, | 1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} |
2 | UnliftedFFITypes #-} | 2 | {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} |
3 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | ||
4 | |||
3 | -- This module lifts assorted materials from Brian O'Sullivan's text package | 5 | -- This module lifts assorted materials from Brian O'Sullivan's text package |
4 | -- especially Data.Text.Encoding in order to define a pipes-appropriate | 6 | -- especially Data.Text.Encoding in order to define a pipes-appropriate |
5 | -- streamDecodeUtf8 | 7 | -- streamDecodeUtf8 |
@@ -7,13 +9,20 @@ module Pipes.Text.Internal | |||
7 | ( Decoding(..) | 9 | ( Decoding(..) |
8 | , streamDecodeUtf8 | 10 | , streamDecodeUtf8 |
9 | , decodeSomeUtf8 | 11 | , decodeSomeUtf8 |
12 | , Codec(..) | ||
13 | , TextException(..) | ||
14 | , utf8 | ||
10 | ) where | 15 | ) where |
11 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | 16 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) |
12 | import Control.Monad.ST (ST, runST) | 17 | import Control.Monad.ST (ST, runST) |
13 | import Data.Bits ((.&.)) | 18 | import Data.Bits ((.&.)) |
14 | import Data.ByteString as B | 19 | import Data.ByteString as B |
20 | import Data.ByteString (ByteString) | ||
15 | import Data.ByteString.Internal as B | 21 | import Data.ByteString.Internal as B |
16 | import qualified Data.Text as T (null) | 22 | import Data.ByteString.Char8 as B8 |
23 | import Data.Text (Text) | ||
24 | import qualified Data.Text as T | ||
25 | import qualified Data.Text.Encoding as TE | ||
17 | import Data.Text.Encoding.Error () | 26 | import Data.Text.Encoding.Error () |
18 | import Data.Text.Internal (Text, textP) | 27 | import Data.Text.Internal (Text, textP) |
19 | import Foreign.C.Types (CSize) | 28 | import Foreign.C.Types (CSize) |
@@ -24,9 +33,226 @@ import Foreign.Storable (Storable, peek, poke) | |||
24 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) | 33 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) |
25 | import GHC.Word (Word8, Word32) | 34 | import GHC.Word (Word8, Word32) |
26 | import qualified Data.Text.Array as A | 35 | import qualified Data.Text.Array as A |
27 | 36 | import Data.Word (Word8, Word16) | |
37 | import System.IO.Unsafe (unsafePerformIO) | ||
38 | import qualified Control.Exception as Exc | ||
39 | import Data.Bits ((.&.), (.|.), shiftL) | ||
40 | import Data.Typeable | ||
41 | import Control.Arrow (first) | ||
42 | import Data.Maybe (catMaybes) | ||
28 | #include "pipes_text_cbits.h" | 43 | #include "pipes_text_cbits.h" |
29 | 44 | ||
45 | |||
46 | -- | A specific character encoding. | ||
47 | -- | ||
48 | -- Since 0.3.0 | ||
49 | data Codec = Codec | ||
50 | { codecName :: Text | ||
51 | , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) | ||
52 | , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString) | ||
53 | } | ||
54 | |||
55 | instance Show Codec where | ||
56 | showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c) | ||
57 | |||
58 | -- Since 0.3.0 | ||
59 | data TextException = DecodeException Codec Word8 | ||
60 | | EncodeException Codec Char | ||
61 | | LengthExceeded Int | ||
62 | | TextException Exc.SomeException | ||
63 | deriving (Show, Typeable) | ||
64 | instance Exc.Exception TextException | ||
65 | |||
66 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) | ||
67 | -> (ByteString -> Decoding) | ||
68 | toDecoding op = loop B.empty where | ||
69 | loop extra bs0 = case op (B.append extra bs0) of | ||
70 | (txt, Right bs) -> Some txt bs (loop bs) | ||
71 | (txt, Left (_,bs)) -> Other txt bs | ||
72 | |||
73 | |||
74 | splitSlowly :: (ByteString -> Text) | ||
75 | -> ByteString | ||
76 | -> (Text, Either (TextException, ByteString) ByteString) | ||
77 | splitSlowly dec bytes = valid where | ||
78 | valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) | ||
79 | splits 0 = [(B.empty, bytes)] | ||
80 | splits n = B.splitAt n bytes : splits (n - 1) | ||
81 | decFirst (a, b) = case tryEvaluate (dec a) of | ||
82 | Left _ -> Nothing | ||
83 | Right text -> let trouble = case tryEvaluate (dec b) of | ||
84 | Left exc -> Left (TextException exc, b) | ||
85 | Right _ -> Right B.empty | ||
86 | in Just (text, trouble) | ||
87 | -- this case shouldn't occur, | ||
88 | -- since splitSlowly is only called | ||
89 | -- when parsing failed somewhere | ||
90 | |||
91 | utf8 :: Codec | ||
92 | utf8 = Codec name enc (toDecoding dec) where | ||
93 | name = T.pack "UTF-8" | ||
94 | enc text = (TE.encodeUtf8 text, Nothing) | ||
95 | dec bytes = case decodeSomeUtf8 bytes of | ||
96 | (t,b) -> (t, Right b) | ||
97 | |||
98 | -- -- Whether the given byte is a continuation byte. | ||
99 | -- isContinuation byte = byte .&. 0xC0 == 0x80 | ||
100 | -- | ||
101 | -- -- The number of continuation bytes needed by the given | ||
102 | -- -- non-continuation byte. Returns -1 for an illegal UTF-8 | ||
103 | -- -- non-continuation byte and the whole split quickly must fail so | ||
104 | -- -- as the input is passed to TE.decodeUtf8, which will issue a | ||
105 | -- -- suitable error. | ||
106 | -- required x0 | ||
107 | -- | x0 .&. 0x80 == 0x00 = 0 | ||
108 | -- | x0 .&. 0xE0 == 0xC0 = 1 | ||
109 | -- | x0 .&. 0xF0 == 0xE0 = 2 | ||
110 | -- | x0 .&. 0xF8 == 0xF0 = 3 | ||
111 | -- | otherwise = -1 | ||
112 | -- | ||
113 | -- splitQuickly bytes | ||
114 | -- | B.null l || req == -1 = Nothing | ||
115 | -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) | ||
116 | -- | otherwise = Just (TE.decodeUtf8 l', r') | ||
117 | -- where | ||
118 | -- (l, r) = B.spanEnd isContinuation bytes | ||
119 | -- req = required (B.last l) | ||
120 | -- l' = B.init l | ||
121 | -- r' = B.cons (B.last l) r | ||
122 | |||
123 | -- | | ||
124 | -- Since 0.3.0 | ||
125 | utf16_le :: Codec | ||
126 | utf16_le = Codec name enc (toDecoding dec) where | ||
127 | name = T.pack "UTF-16-LE" | ||
128 | enc text = (TE.encodeUtf16LE text, Nothing) | ||
129 | dec bytes = case splitQuickly bytes of | ||
130 | Just (text, extra) -> (text, Right extra) | ||
131 | Nothing -> splitSlowly TE.decodeUtf16LE bytes | ||
132 | |||
133 | splitQuickly bytes = maybeDecode (loop 0) where | ||
134 | maxN = B.length bytes | ||
135 | |||
136 | loop n | n == maxN = decodeAll | ||
137 | | (n + 1) == maxN = decodeTo n | ||
138 | loop n = let | ||
139 | req = utf16Required | ||
140 | (B.index bytes n) | ||
141 | (B.index bytes (n + 1)) | ||
142 | decodeMore = loop $! n + req | ||
143 | in if n + req > maxN | ||
144 | then decodeTo n | ||
145 | else decodeMore | ||
146 | |||
147 | decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) | ||
148 | decodeAll = (TE.decodeUtf16LE bytes, B.empty) | ||
149 | |||
150 | -- | | ||
151 | -- Since 0.3.0 | ||
152 | utf16_be :: Codec | ||
153 | utf16_be = Codec name enc (toDecoding dec) where | ||
154 | name = T.pack "UTF-16-BE" | ||
155 | enc text = (TE.encodeUtf16BE text, Nothing) | ||
156 | dec bytes = case splitQuickly bytes of | ||
157 | Just (text, extra) -> (text, Right extra) | ||
158 | Nothing -> splitSlowly TE.decodeUtf16BE bytes | ||
159 | |||
160 | splitQuickly bytes = maybeDecode (loop 0) where | ||
161 | maxN = B.length bytes | ||
162 | |||
163 | loop n | n == maxN = decodeAll | ||
164 | | (n + 1) == maxN = decodeTo n | ||
165 | loop n = let | ||
166 | req = utf16Required | ||
167 | (B.index bytes (n + 1)) | ||
168 | (B.index bytes n) | ||
169 | decodeMore = loop $! n + req | ||
170 | in if n + req > maxN | ||
171 | then decodeTo n | ||
172 | else decodeMore | ||
173 | |||
174 | decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) | ||
175 | decodeAll = (TE.decodeUtf16BE bytes, B.empty) | ||
176 | |||
177 | utf16Required :: Word8 -> Word8 -> Int | ||
178 | utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where | ||
179 | x :: Word16 | ||
180 | x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 | ||
181 | |||
182 | -- | | ||
183 | -- Since 0.3.0 | ||
184 | utf32_le :: Codec | ||
185 | utf32_le = Codec name enc (toDecoding dec) where | ||
186 | name = T.pack "UTF-32-LE" | ||
187 | enc text = (TE.encodeUtf32LE text, Nothing) | ||
188 | dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of | ||
189 | Just (text, extra) -> (text, Right extra) | ||
190 | Nothing -> splitSlowly TE.decodeUtf32LE bs | ||
191 | |||
192 | -- | | ||
193 | -- Since 0.3.0 | ||
194 | utf32_be :: Codec | ||
195 | utf32_be = Codec name enc (toDecoding dec) where | ||
196 | name = T.pack "UTF-32-BE" | ||
197 | enc text = (TE.encodeUtf32BE text, Nothing) | ||
198 | dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of | ||
199 | Just (text, extra) -> (text, Right extra) | ||
200 | Nothing -> splitSlowly TE.decodeUtf32BE bs | ||
201 | |||
202 | utf32SplitBytes :: (ByteString -> Text) | ||
203 | -> ByteString | ||
204 | -> Maybe (Text, ByteString) | ||
205 | utf32SplitBytes dec bytes = split where | ||
206 | split = maybeDecode (dec toDecode, extra) | ||
207 | len = B.length bytes | ||
208 | lenExtra = mod len 4 | ||
209 | |||
210 | lenToDecode = len - lenExtra | ||
211 | (toDecode, extra) = if lenExtra == 0 | ||
212 | then (bytes, B.empty) | ||
213 | else B.splitAt lenToDecode bytes | ||
214 | |||
215 | -- | | ||
216 | -- Since 0.3.0 | ||
217 | ascii :: Codec | ||
218 | ascii = Codec name enc (toDecoding dec) where | ||
219 | name = T.pack "ASCII" | ||
220 | enc text = (bytes, extra) where | ||
221 | (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text | ||
222 | bytes = B8.pack (T.unpack safe) | ||
223 | extra = if T.null unsafe | ||
224 | then Nothing | ||
225 | else Just (EncodeException ascii (T.head unsafe), unsafe) | ||
226 | |||
227 | dec bytes = (text, extra) where | ||
228 | (safe, unsafe) = B.span (<= 0x7F) bytes | ||
229 | text = T.pack (B8.unpack safe) | ||
230 | extra = if B.null unsafe | ||
231 | then Right B.empty | ||
232 | else Left (DecodeException ascii (B.head unsafe), unsafe) | ||
233 | |||
234 | -- | | ||
235 | -- Since 0.3.0 | ||
236 | iso8859_1 :: Codec | ||
237 | iso8859_1 = Codec name enc (toDecoding dec) where | ||
238 | name = T.pack "ISO-8859-1" | ||
239 | enc text = (bytes, extra) where | ||
240 | (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text | ||
241 | bytes = B8.pack (T.unpack safe) | ||
242 | extra = if T.null unsafe | ||
243 | then Nothing | ||
244 | else Just (EncodeException iso8859_1 (T.head unsafe), unsafe) | ||
245 | |||
246 | dec bytes = (T.pack (B8.unpack bytes), Right B.empty) | ||
247 | |||
248 | tryEvaluate :: a -> Either Exc.SomeException a | ||
249 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate | ||
250 | |||
251 | maybeDecode :: (a, b) -> Maybe (a, b) | ||
252 | maybeDecode (a, b) = case tryEvaluate a of | ||
253 | Left _ -> Nothing | ||
254 | Right _ -> Just (a, b) | ||
255 | |||
30 | -- | A stream oriented decoding result. | 256 | -- | A stream oriented decoding result. |
31 | data Decoding = Some Text ByteString (ByteString -> Decoding) | 257 | data Decoding = Some Text ByteString (ByteString -> Decoding) |
32 | | Other Text ByteString | 258 | | Other Text ByteString |
@@ -103,36 +329,6 @@ decodeSomeUtf8 bs@(PS fp off len) = runST $ do | |||
103 | return $! (chunkText, remaining) | 329 | return $! (chunkText, remaining) |
104 | {-# INLINE decodeSomeUtf8 #-} | 330 | {-# INLINE decodeSomeUtf8 #-} |
105 | 331 | ||
106 | -- decodeSomeUtf8 :: ByteString -> (Text, ByteString) | ||
107 | -- decodeSomeUtf8 bs@(PS fp off len) = | ||
108 | -- runST $ do marray <- A.new (len+1) | ||
109 | -- unsafeIOToST (decodeChunkToBuffer marray) | ||
110 | -- | ||
111 | -- where | ||
112 | -- decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString) | ||
113 | -- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
114 | -- with (0::CSize) $ \destOffPtr -> | ||
115 | -- with (0::CodePoint) $ \codepointPtr -> | ||
116 | -- with (0::DecoderState) $ \statePtr -> | ||
117 | -- with nullPtr $ \curPtrPtr -> | ||
118 | -- do let end = ptr `plusPtr` (off + len) | ||
119 | -- curPtr = ptr `plusPtr` off | ||
120 | -- poke curPtrPtr curPtr | ||
121 | -- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | ||
122 | -- state <- peek statePtr | ||
123 | -- lastPtr <- peek curPtrPtr | ||
124 | -- codepoint <- peek codepointPtr | ||
125 | -- n <- peek destOffPtr | ||
126 | -- chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
127 | -- return $! textP arr 0 (fromIntegral n) | ||
128 | -- let left = lastPtr `minusPtr` curPtr | ||
129 | -- remaining = B.drop left bs | ||
130 | -- return $! (chunkText, remaining) | ||
131 | -- {-# INLINE decodeChunkToBuffer #-} | ||
132 | -- {-# INLINE decodeSomeUtf8 #-} | ||
133 | |||
134 | |||
135 | |||
136 | mkText :: A.MArray s -> CSize -> IO Text | 332 | mkText :: A.MArray s -> CSize -> IO Text |
137 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | 333 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest |
138 | return $! textP arr 0 (fromIntegral n) | 334 | return $! textP arr 0 (fromIntegral n) |