diff options
author | michaelt <what_is_it_to_do_anything@yahoo.com> | 2013-12-25 22:25:07 -0500 |
---|---|---|
committer | michaelt <what_is_it_to_do_anything@yahoo.com> | 2013-12-25 22:25:07 -0500 |
commit | c9d1c945a4343d756533b85060c35c04be0c8b02 (patch) | |
tree | 0c31ae2e13002c1311ae3cc1e15608750c5a0a9c /Pipes/Text | |
parent | 8c48280926efffc0ca52a5d9ca796d639d053379 (diff) | |
download | text-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.tar.gz text-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.tar.zst text-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.zip |
scrap character replacement; simplify
Diffstat (limited to 'Pipes/Text')
-rw-r--r-- | Pipes/Text/Internal.hs | 118 |
1 files changed, 38 insertions, 80 deletions
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index 05d9887..73d6fa4 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs | |||
@@ -1,11 +1,10 @@ | |||
1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, | 1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, |
2 | UnliftedFFITypes #-} | 2 | UnliftedFFITypes #-} |
3 | -- This module lifts material from Brian O'Sullivan's text package | 3 | -- This module lifts assorted materials from Brian O'Sullivan's text package |
4 | -- especially Data.Text.Encoding in order to define a pipes-appropriate | 4 | -- especially Data.Text.Encoding in order to define a pipes-appropriate |
5 | -- streamDecodeUtf8 | 5 | -- streamDecodeUtf8 |
6 | module Pipes.Text.Internal | 6 | module Pipes.Text.Internal |
7 | ( Decoding(..) | 7 | ( Decoding(..) |
8 | , streamDecodeUtf8With | ||
9 | , streamDecodeUtf8 | 8 | , streamDecodeUtf8 |
10 | ) where | 9 | ) where |
11 | 10 | ||
@@ -20,6 +19,7 @@ import Data.Bits ((.&.)) | |||
20 | import Data.ByteString as B | 19 | import Data.ByteString as B |
21 | import Data.ByteString.Internal as B | 20 | import Data.ByteString.Internal as B |
22 | import Data.Text () | 21 | import Data.Text () |
22 | import qualified Data.Text as T | ||
23 | import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) | 23 | import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) |
24 | import Data.Text.Internal (Text(..), safe, textP) | 24 | import Data.Text.Internal (Text(..), safe, textP) |
25 | import Data.Word (Word8, Word32) | 25 | import Data.Word (Word8, Word32) |
@@ -56,94 +56,52 @@ instance Show Decoding where | |||
56 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | 56 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) |
57 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | 57 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) |
58 | 58 | ||
59 | -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 | ||
60 | -- encoded text that is known to be valid. | ||
61 | -- | ||
62 | -- If the input contains any invalid UTF-8 data, an exception will be | ||
63 | -- thrown (either by this function or a continuation) that cannot be | ||
64 | -- caught in pure code. For more control over the handling of invalid | ||
65 | -- data, use 'streamDecodeUtf8With'. | ||
66 | streamDecodeUtf8 :: ByteString -> Decoding | 59 | streamDecodeUtf8 :: ByteString -> Decoding |
67 | streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode) | 60 | streamDecodeUtf8 = decodeChunk B.empty 0 0 |
68 | 61 | ||
69 | -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 | 62 | decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding |
70 | -- encoded text. | 63 | decodeChunk old codepoint0 state0 bs@(PS fp off len) = |
71 | streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding | 64 | runST $ do marray <- A.new (len+1) |
72 | streamDecodeUtf8With mErr = case mErr of | 65 | unsafeIOToST (decodeChunkToBuffer marray) |
73 | Nothing -> decodeWith False strictDecode | ||
74 | Just onErr -> decodeWith True onErr | ||
75 | where | ||
76 | -- We create a slightly larger than necessary buffer to accommodate a | ||
77 | -- potential surrogate pair started in the last buffer | ||
78 | decodeWith replace onErr = decodeChunk 0 0 | ||
79 | where | ||
80 | decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding | ||
81 | decodeChunk codepoint0 state0 bs@(PS fp off len) = | ||
82 | runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1) | ||
83 | where | 66 | where |
84 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | 67 | decodeChunkToBuffer :: A.MArray s -> IO Decoding |
85 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | 68 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> |
86 | with (0::CSize) $ \destOffPtr -> | 69 | with (0::CSize) $ \destOffPtr -> |
87 | with codepoint0 $ \codepointPtr -> | 70 | with codepoint0 $ \codepointPtr -> |
88 | with state0 $ \statePtr -> | 71 | with state0 $ \statePtr -> |
89 | with nullPtr $ \curPtrPtr -> | 72 | with nullPtr $ \curPtrPtr -> |
90 | let end = ptr `plusPtr` (off + len) | 73 | do let end = ptr `plusPtr` (off + len) |
91 | loop curPtr = do | 74 | curPtr = ptr `plusPtr` off |
92 | poke curPtrPtr curPtr | 75 | poke curPtrPtr curPtr |
93 | curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr | 76 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr |
94 | curPtrPtr end codepointPtr statePtr | 77 | state <- peek statePtr |
95 | state <- peek statePtr | 78 | lastPtr <- peek curPtrPtr |
96 | case state of | 79 | codepoint <- peek codepointPtr |
97 | UTF8_REJECT -> | 80 | n <- peek destOffPtr |
98 | -- We encountered an encoding error | 81 | chunkText <- mkText dest n |
99 | if replace | 82 | let left = lastPtr `minusPtr` curPtr |
100 | then do | 83 | remaining = B.drop left bs |
101 | x <- peek curPtr' | 84 | accum = if T.null chunkText then B.append old remaining else remaining |
102 | case onErr desc (Just x) of | 85 | return $ case state of |
103 | Nothing -> loop $ curPtr' `plusPtr` 1 | 86 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error |
104 | Just c -> do | 87 | _ -> Some chunkText accum (decodeChunk accum codepoint state) |
105 | destOff <- peek destOffPtr | 88 | |
106 | w <- unsafeSTToIO $ | 89 | |
107 | unsafeWrite dest (fromIntegral destOff) (safe c) | 90 | mkText :: A.MArray s -> CSize -> IO Text |
108 | poke destOffPtr (destOff + fromIntegral w) | 91 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest |
109 | poke statePtr 0 | 92 | return $! textP arr 0 (fromIntegral n) |
110 | loop $ curPtr' `plusPtr` 1 | ||
111 | else do | ||
112 | n <- peek destOffPtr | ||
113 | chunkText <- unsafeSTToIO $ do | ||
114 | arr <- A.unsafeFreeze dest | ||
115 | return $! textP arr 0 (fromIntegral n) | ||
116 | lastPtr <- peek curPtrPtr | ||
117 | let left = lastPtr `minusPtr` curPtr | ||
118 | return $ Other chunkText (B.drop left bs) | ||
119 | _ -> do | ||
120 | -- We encountered the end of the buffer while decoding | ||
121 | n <- peek destOffPtr | ||
122 | codepoint <- peek codepointPtr | ||
123 | chunkText <- unsafeSTToIO $ do | ||
124 | arr <- A.unsafeFreeze dest | ||
125 | return $! textP arr 0 (fromIntegral n) | ||
126 | lastPtr <- peek curPtrPtr | ||
127 | let left = lastPtr `minusPtr` curPtr | ||
128 | return $ Some chunkText (B.drop left bs) | ||
129 | (decodeChunk codepoint state) | ||
130 | in loop (ptr `plusPtr` off) | ||
131 | desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream" | ||
132 | 93 | ||
133 | ord :: Char -> Int | 94 | ord :: Char -> Int |
134 | ord (C# c#) = I# (ord# c#) | 95 | ord (C# c#) = I# (ord# c#) |
135 | {-# INLINE ord #-} | 96 | {-# INLINE ord #-} |
136 | 97 | ||
137 | |||
138 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int | 98 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int |
139 | unsafeWrite marr i c | 99 | unsafeWrite marr i c |
140 | | n < 0x10000 = do | 100 | | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) |
141 | A.unsafeWrite marr i (fromIntegral n) | 101 | return 1 |
142 | return 1 | 102 | | otherwise = do A.unsafeWrite marr i lo |
143 | | otherwise = do | 103 | A.unsafeWrite marr (i+1) hi |
144 | A.unsafeWrite marr i lo | 104 | return 2 |
145 | A.unsafeWrite marr (i+1) hi | ||
146 | return 2 | ||
147 | where n = ord c | 105 | where n = ord c |
148 | m = n - 0x10000 | 106 | m = n - 0x10000 |
149 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | 107 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 |