]>
Commit | Line | Data |
---|---|---|
64e03122 | 1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} |
2 | {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} | |
3 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | |
4 | ||
c9d1c945 | 5 | -- This module lifts assorted materials from Brian O'Sullivan's text package |
8c482809 | 6 | -- especially Data.Text.Encoding in order to define a pipes-appropriate |
7 | -- streamDecodeUtf8 | |
8 | module Pipes.Text.Internal | |
9 | ( Decoding(..) | |
8c482809 | 10 | , streamDecodeUtf8 |
3694350a | 11 | , decodeSomeUtf8 |
64e03122 | 12 | , Codec(..) |
13 | , TextException(..) | |
14 | , utf8 | |
8c482809 | 15 | ) where |
8c482809 | 16 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) |
17 | import Control.Monad.ST (ST, runST) | |
8c482809 | 18 | import Data.Bits ((.&.)) |
3694350a | 19 | import Data.ByteString as B |
64e03122 | 20 | import Data.ByteString (ByteString) |
3694350a | 21 | import Data.ByteString.Internal as B |
64e03122 | 22 | import Data.ByteString.Char8 as B8 |
23 | import Data.Text (Text) | |
24 | import qualified Data.Text as T | |
25 | import qualified Data.Text.Encoding as TE | |
3694350a | 26 | import Data.Text.Encoding.Error () |
27 | import Data.Text.Internal (Text, textP) | |
8c482809 | 28 | import Foreign.C.Types (CSize) |
29 | import Foreign.ForeignPtr (withForeignPtr) | |
30 | import Foreign.Marshal.Utils (with) | |
31 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | |
32 | import Foreign.Storable (Storable, peek, poke) | |
3694350a | 33 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) |
34 | import GHC.Word (Word8, Word32) | |
8c482809 | 35 | import qualified Data.Text.Array as A |
64e03122 | 36 | import Data.Word (Word8, Word16) |
37 | import System.IO.Unsafe (unsafePerformIO) | |
38 | import qualified Control.Exception as Exc | |
39 | import Data.Bits ((.&.), (.|.), shiftL) | |
40 | import Data.Typeable | |
41 | import Control.Arrow (first) | |
42 | import Data.Maybe (catMaybes) | |
8c482809 | 43 | #include "pipes_text_cbits.h" |
44 | ||
64e03122 | 45 | |
46 | -- | A specific character encoding. | |
47 | -- | |
48 | -- Since 0.3.0 | |
49 | data Codec = Codec | |
50 | { codecName :: Text | |
51 | , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) | |
52 | , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString) | |
53 | } | |
54 | ||
55 | instance Show Codec where | |
56 | showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c) | |
57 | ||
58 | -- Since 0.3.0 | |
59 | data TextException = DecodeException Codec Word8 | |
60 | | EncodeException Codec Char | |
61 | | LengthExceeded Int | |
62 | | TextException Exc.SomeException | |
63 | deriving (Show, Typeable) | |
64 | instance Exc.Exception TextException | |
65 | ||
66 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) | |
67 | -> (ByteString -> Decoding) | |
68 | toDecoding op = loop B.empty where | |
69 | loop extra bs0 = case op (B.append extra bs0) of | |
70 | (txt, Right bs) -> Some txt bs (loop bs) | |
71 | (txt, Left (_,bs)) -> Other txt bs | |
72 | ||
73 | ||
74 | splitSlowly :: (ByteString -> Text) | |
75 | -> ByteString | |
76 | -> (Text, Either (TextException, ByteString) ByteString) | |
77 | splitSlowly dec bytes = valid where | |
78 | valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) | |
79 | splits 0 = [(B.empty, bytes)] | |
80 | splits n = B.splitAt n bytes : splits (n - 1) | |
81 | decFirst (a, b) = case tryEvaluate (dec a) of | |
82 | Left _ -> Nothing | |
83 | Right text -> let trouble = case tryEvaluate (dec b) of | |
84 | Left exc -> Left (TextException exc, b) | |
85 | Right _ -> Right B.empty | |
86 | in Just (text, trouble) | |
87 | -- this case shouldn't occur, | |
88 | -- since splitSlowly is only called | |
89 | -- when parsing failed somewhere | |
90 | ||
91 | utf8 :: Codec | |
92 | utf8 = Codec name enc (toDecoding dec) where | |
93 | name = T.pack "UTF-8" | |
94 | enc text = (TE.encodeUtf8 text, Nothing) | |
95 | dec bytes = case decodeSomeUtf8 bytes of | |
96 | (t,b) -> (t, Right b) | |
97 | ||
98 | -- -- Whether the given byte is a continuation byte. | |
99 | -- isContinuation byte = byte .&. 0xC0 == 0x80 | |
100 | -- | |
101 | -- -- The number of continuation bytes needed by the given | |
102 | -- -- non-continuation byte. Returns -1 for an illegal UTF-8 | |
103 | -- -- non-continuation byte and the whole split quickly must fail so | |
104 | -- -- as the input is passed to TE.decodeUtf8, which will issue a | |
105 | -- -- suitable error. | |
106 | -- required x0 | |
107 | -- | x0 .&. 0x80 == 0x00 = 0 | |
108 | -- | x0 .&. 0xE0 == 0xC0 = 1 | |
109 | -- | x0 .&. 0xF0 == 0xE0 = 2 | |
110 | -- | x0 .&. 0xF8 == 0xF0 = 3 | |
111 | -- | otherwise = -1 | |
112 | -- | |
113 | -- splitQuickly bytes | |
114 | -- | B.null l || req == -1 = Nothing | |
115 | -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) | |
116 | -- | otherwise = Just (TE.decodeUtf8 l', r') | |
117 | -- where | |
118 | -- (l, r) = B.spanEnd isContinuation bytes | |
119 | -- req = required (B.last l) | |
120 | -- l' = B.init l | |
121 | -- r' = B.cons (B.last l) r | |
122 | ||
123 | -- | | |
124 | -- Since 0.3.0 | |
125 | utf16_le :: Codec | |
126 | utf16_le = Codec name enc (toDecoding dec) where | |
127 | name = T.pack "UTF-16-LE" | |
128 | enc text = (TE.encodeUtf16LE text, Nothing) | |
129 | dec bytes = case splitQuickly bytes of | |
130 | Just (text, extra) -> (text, Right extra) | |
131 | Nothing -> splitSlowly TE.decodeUtf16LE bytes | |
132 | ||
133 | splitQuickly bytes = maybeDecode (loop 0) where | |
134 | maxN = B.length bytes | |
135 | ||
136 | loop n | n == maxN = decodeAll | |
137 | | (n + 1) == maxN = decodeTo n | |
138 | loop n = let | |
139 | req = utf16Required | |
140 | (B.index bytes n) | |
141 | (B.index bytes (n + 1)) | |
142 | decodeMore = loop $! n + req | |
143 | in if n + req > maxN | |
144 | then decodeTo n | |
145 | else decodeMore | |
146 | ||
147 | decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) | |
148 | decodeAll = (TE.decodeUtf16LE bytes, B.empty) | |
149 | ||
150 | -- | | |
151 | -- Since 0.3.0 | |
152 | utf16_be :: Codec | |
153 | utf16_be = Codec name enc (toDecoding dec) where | |
154 | name = T.pack "UTF-16-BE" | |
155 | enc text = (TE.encodeUtf16BE text, Nothing) | |
156 | dec bytes = case splitQuickly bytes of | |
157 | Just (text, extra) -> (text, Right extra) | |
158 | Nothing -> splitSlowly TE.decodeUtf16BE bytes | |
159 | ||
160 | splitQuickly bytes = maybeDecode (loop 0) where | |
161 | maxN = B.length bytes | |
162 | ||
163 | loop n | n == maxN = decodeAll | |
164 | | (n + 1) == maxN = decodeTo n | |
165 | loop n = let | |
166 | req = utf16Required | |
167 | (B.index bytes (n + 1)) | |
168 | (B.index bytes n) | |
169 | decodeMore = loop $! n + req | |
170 | in if n + req > maxN | |
171 | then decodeTo n | |
172 | else decodeMore | |
173 | ||
174 | decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) | |
175 | decodeAll = (TE.decodeUtf16BE bytes, B.empty) | |
176 | ||
177 | utf16Required :: Word8 -> Word8 -> Int | |
178 | utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where | |
179 | x :: Word16 | |
180 | x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 | |
181 | ||
182 | -- | | |
183 | -- Since 0.3.0 | |
184 | utf32_le :: Codec | |
185 | utf32_le = Codec name enc (toDecoding dec) where | |
186 | name = T.pack "UTF-32-LE" | |
187 | enc text = (TE.encodeUtf32LE text, Nothing) | |
188 | dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of | |
189 | Just (text, extra) -> (text, Right extra) | |
190 | Nothing -> splitSlowly TE.decodeUtf32LE bs | |
191 | ||
192 | -- | | |
193 | -- Since 0.3.0 | |
194 | utf32_be :: Codec | |
195 | utf32_be = Codec name enc (toDecoding dec) where | |
196 | name = T.pack "UTF-32-BE" | |
197 | enc text = (TE.encodeUtf32BE text, Nothing) | |
198 | dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of | |
199 | Just (text, extra) -> (text, Right extra) | |
200 | Nothing -> splitSlowly TE.decodeUtf32BE bs | |
201 | ||
202 | utf32SplitBytes :: (ByteString -> Text) | |
203 | -> ByteString | |
204 | -> Maybe (Text, ByteString) | |
205 | utf32SplitBytes dec bytes = split where | |
206 | split = maybeDecode (dec toDecode, extra) | |
207 | len = B.length bytes | |
208 | lenExtra = mod len 4 | |
209 | ||
210 | lenToDecode = len - lenExtra | |
211 | (toDecode, extra) = if lenExtra == 0 | |
212 | then (bytes, B.empty) | |
213 | else B.splitAt lenToDecode bytes | |
214 | ||
215 | -- | | |
216 | -- Since 0.3.0 | |
217 | ascii :: Codec | |
218 | ascii = Codec name enc (toDecoding dec) where | |
219 | name = T.pack "ASCII" | |
220 | enc text = (bytes, extra) where | |
221 | (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text | |
222 | bytes = B8.pack (T.unpack safe) | |
223 | extra = if T.null unsafe | |
224 | then Nothing | |
225 | else Just (EncodeException ascii (T.head unsafe), unsafe) | |
226 | ||
227 | dec bytes = (text, extra) where | |
228 | (safe, unsafe) = B.span (<= 0x7F) bytes | |
229 | text = T.pack (B8.unpack safe) | |
230 | extra = if B.null unsafe | |
231 | then Right B.empty | |
232 | else Left (DecodeException ascii (B.head unsafe), unsafe) | |
233 | ||
234 | -- | | |
235 | -- Since 0.3.0 | |
236 | iso8859_1 :: Codec | |
237 | iso8859_1 = Codec name enc (toDecoding dec) where | |
238 | name = T.pack "ISO-8859-1" | |
239 | enc text = (bytes, extra) where | |
240 | (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text | |
241 | bytes = B8.pack (T.unpack safe) | |
242 | extra = if T.null unsafe | |
243 | then Nothing | |
244 | else Just (EncodeException iso8859_1 (T.head unsafe), unsafe) | |
245 | ||
246 | dec bytes = (T.pack (B8.unpack bytes), Right B.empty) | |
247 | ||
248 | tryEvaluate :: a -> Either Exc.SomeException a | |
249 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate | |
250 | ||
251 | maybeDecode :: (a, b) -> Maybe (a, b) | |
252 | maybeDecode (a, b) = case tryEvaluate a of | |
253 | Left _ -> Nothing | |
254 | Right _ -> Just (a, b) | |
255 | ||
8c482809 | 256 | -- | A stream oriented decoding result. |
257 | data Decoding = Some Text ByteString (ByteString -> Decoding) | |
258 | | Other Text ByteString | |
259 | instance Show Decoding where | |
260 | showsPrec d (Some t bs _) = showParen (d > prec) $ | |
261 | showString "Some " . showsPrec prec' t . | |
262 | showChar ' ' . showsPrec prec' bs . | |
263 | showString " _" | |
264 | where prec = 10; prec' = prec + 1 | |
265 | showsPrec d (Other t bs) = showParen (d > prec) $ | |
266 | showString "Other " . showsPrec prec' t . | |
267 | showChar ' ' . showsPrec prec' bs . | |
268 | showString " _" | |
269 | where prec = 10; prec' = prec + 1 | |
3694350a | 270 | |
8c482809 | 271 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) |
272 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | |
273 | ||
8c482809 | 274 | streamDecodeUtf8 :: ByteString -> Decoding |
3694350a | 275 | streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 |
276 | where | |
277 | decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding | |
278 | decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = | |
279 | runST $ do marray <- A.new (len+1) | |
280 | unsafeIOToST (decodeChunkToBuffer marray) | |
281 | where | |
282 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | |
283 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | |
284 | with (0::CSize) $ \destOffPtr -> | |
285 | with codepoint0 $ \codepointPtr -> | |
286 | with state0 $ \statePtr -> | |
287 | with nullPtr $ \curPtrPtr -> | |
288 | do let end = ptr `plusPtr` (off + len) | |
289 | curPtr = ptr `plusPtr` off | |
290 | poke curPtrPtr curPtr | |
291 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | |
292 | state <- peek statePtr | |
293 | lastPtr <- peek curPtrPtr | |
294 | codepoint <- peek codepointPtr | |
295 | n <- peek destOffPtr | |
296 | chunkText <- mkText dest n | |
297 | let left = lastPtr `minusPtr` curPtr | |
298 | remaining = B.drop left bs | |
299 | accum = if T.null chunkText then B.append old remaining else remaining | |
300 | return $! case state of | |
301 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error | |
302 | _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) | |
303 | {-# INLINE decodeChunkToBuffer #-} | |
304 | {-# INLINE decodeChunkUtf8 #-} | |
305 | {-# INLINE streamDecodeUtf8 #-} | |
8c482809 | 306 | |
3694350a | 307 | decodeSomeUtf8 :: ByteString -> (Text, ByteString) |
308 | decodeSomeUtf8 bs@(PS fp off len) = runST $ do | |
309 | dest <- A.new (len+1) | |
310 | unsafeIOToST $ | |
311 | withForeignPtr fp $ \ptr -> | |
312 | with (0::CSize) $ \destOffPtr -> | |
313 | with (0::CodePoint) $ \codepointPtr -> | |
314 | with (0::DecoderState) $ \statePtr -> | |
315 | with nullPtr $ \curPtrPtr -> | |
c9d1c945 | 316 | do let end = ptr `plusPtr` (off + len) |
317 | curPtr = ptr `plusPtr` off | |
318 | poke curPtrPtr curPtr | |
3694350a | 319 | c_decode_utf8_with_state (A.maBA dest) destOffPtr |
320 | curPtrPtr end codepointPtr statePtr | |
c9d1c945 | 321 | state <- peek statePtr |
322 | lastPtr <- peek curPtrPtr | |
323 | codepoint <- peek codepointPtr | |
324 | n <- peek destOffPtr | |
3694350a | 325 | chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest |
326 | return $! textP arr 0 (fromIntegral n) | |
c9d1c945 | 327 | let left = lastPtr `minusPtr` curPtr |
328 | remaining = B.drop left bs | |
3694350a | 329 | return $! (chunkText, remaining) |
330 | {-# INLINE decodeSomeUtf8 #-} | |
331 | ||
c9d1c945 | 332 | mkText :: A.MArray s -> CSize -> IO Text |
333 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | |
334 | return $! textP arr 0 (fromIntegral n) | |
3694350a | 335 | {-# INLINE mkText #-} |
8c482809 | 336 | |
337 | ord :: Char -> Int | |
338 | ord (C# c#) = I# (ord# c#) | |
339 | {-# INLINE ord #-} | |
340 | ||
8c482809 | 341 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int |
342 | unsafeWrite marr i c | |
c9d1c945 | 343 | | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) |
344 | return 1 | |
345 | | otherwise = do A.unsafeWrite marr i lo | |
346 | A.unsafeWrite marr (i+1) hi | |
347 | return 2 | |
8c482809 | 348 | where n = ord c |
349 | m = n - 0x10000 | |
350 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | |
351 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | |
352 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | |
3694350a | 353 | {-# INLINE shiftR #-} |
8c482809 | 354 | {-# INLINE unsafeWrite #-} |
355 | ||
356 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | |
357 | :: MutableByteArray# s -> Ptr CSize | |
358 | -> Ptr (Ptr Word8) -> Ptr Word8 | |
359 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) |