aboutsummaryrefslogtreecommitdiffhomepage
path: root/Pipes/Text
diff options
context:
space:
mode:
authormichaelt <what_is_it_to_do_anything@yahoo.com>2013-12-25 22:25:07 -0500
committermichaelt <what_is_it_to_do_anything@yahoo.com>2013-12-25 22:25:07 -0500
commitc9d1c945a4343d756533b85060c35c04be0c8b02 (patch)
tree0c31ae2e13002c1311ae3cc1e15608750c5a0a9c /Pipes/Text
parent8c48280926efffc0ca52a5d9ca796d639d053379 (diff)
downloadtext-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.tar.gz
text-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.tar.zst
text-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.zip
scrap character replacement; simplify
Diffstat (limited to 'Pipes/Text')
-rw-r--r--Pipes/Text/Internal.hs118
1 files changed, 38 insertions, 80 deletions
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
index 05d9887..73d6fa4 100644
--- a/Pipes/Text/Internal.hs
+++ b/Pipes/Text/Internal.hs
@@ -1,11 +1,10 @@
1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, 1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
2 UnliftedFFITypes #-} 2 UnliftedFFITypes #-}
3-- This module lifts material from Brian O'Sullivan's text package 3-- This module lifts assorted materials from Brian O'Sullivan's text package
4-- especially Data.Text.Encoding in order to define a pipes-appropriate 4-- especially Data.Text.Encoding in order to define a pipes-appropriate
5-- streamDecodeUtf8 5-- streamDecodeUtf8
6module Pipes.Text.Internal 6module Pipes.Text.Internal
7 ( Decoding(..) 7 ( Decoding(..)
8 , streamDecodeUtf8With
9 , streamDecodeUtf8 8 , streamDecodeUtf8
10 ) where 9 ) where
11 10
@@ -20,6 +19,7 @@ import Data.Bits ((.&.))
20import Data.ByteString as B 19import Data.ByteString as B
21import Data.ByteString.Internal as B 20import Data.ByteString.Internal as B
22import Data.Text () 21import Data.Text ()
22import qualified Data.Text as T
23import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) 23import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
24import Data.Text.Internal (Text(..), safe, textP) 24import Data.Text.Internal (Text(..), safe, textP)
25import Data.Word (Word8, Word32) 25import Data.Word (Word8, Word32)
@@ -56,94 +56,52 @@ instance Show Decoding where
56newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) 56newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
57newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) 57newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
58 58
59-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
60-- encoded text that is known to be valid.
61--
62-- If the input contains any invalid UTF-8 data, an exception will be
63-- thrown (either by this function or a continuation) that cannot be
64-- caught in pure code. For more control over the handling of invalid
65-- data, use 'streamDecodeUtf8With'.
66streamDecodeUtf8 :: ByteString -> Decoding 59streamDecodeUtf8 :: ByteString -> Decoding
67streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode) 60streamDecodeUtf8 = decodeChunk B.empty 0 0
68 61
69-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 62decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
70-- encoded text. 63decodeChunk old codepoint0 state0 bs@(PS fp off len) =
71streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding 64 runST $ do marray <- A.new (len+1)
72streamDecodeUtf8With mErr = case mErr of 65 unsafeIOToST (decodeChunkToBuffer marray)
73 Nothing -> decodeWith False strictDecode
74 Just onErr -> decodeWith True onErr
75 where
76 -- We create a slightly larger than necessary buffer to accommodate a
77 -- potential surrogate pair started in the last buffer
78 decodeWith replace onErr = decodeChunk 0 0
79 where
80 decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
81 decodeChunk codepoint0 state0 bs@(PS fp off len) =
82 runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
83 where 66 where
84 decodeChunkToBuffer :: A.MArray s -> IO Decoding 67 decodeChunkToBuffer :: A.MArray s -> IO Decoding
85 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> 68 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
86 with (0::CSize) $ \destOffPtr -> 69 with (0::CSize) $ \destOffPtr ->
87 with codepoint0 $ \codepointPtr -> 70 with codepoint0 $ \codepointPtr ->
88 with state0 $ \statePtr -> 71 with state0 $ \statePtr ->
89 with nullPtr $ \curPtrPtr -> 72 with nullPtr $ \curPtrPtr ->
90 let end = ptr `plusPtr` (off + len) 73 do let end = ptr `plusPtr` (off + len)
91 loop curPtr = do 74 curPtr = ptr `plusPtr` off
92 poke curPtrPtr curPtr 75 poke curPtrPtr curPtr
93 curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr 76 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
94 curPtrPtr end codepointPtr statePtr 77 state <- peek statePtr
95 state <- peek statePtr 78 lastPtr <- peek curPtrPtr
96 case state of 79 codepoint <- peek codepointPtr
97 UTF8_REJECT -> 80 n <- peek destOffPtr
98 -- We encountered an encoding error 81 chunkText <- mkText dest n
99 if replace 82 let left = lastPtr `minusPtr` curPtr
100 then do 83 remaining = B.drop left bs
101 x <- peek curPtr' 84 accum = if T.null chunkText then B.append old remaining else remaining
102 case onErr desc (Just x) of 85 return $ case state of
103 Nothing -> loop $ curPtr' `plusPtr` 1 86 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
104 Just c -> do 87 _ -> Some chunkText accum (decodeChunk accum codepoint state)
105 destOff <- peek destOffPtr 88
106 w <- unsafeSTToIO $ 89
107 unsafeWrite dest (fromIntegral destOff) (safe c) 90mkText :: A.MArray s -> CSize -> IO Text
108 poke destOffPtr (destOff + fromIntegral w) 91mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
109 poke statePtr 0 92 return $! textP arr 0 (fromIntegral n)
110 loop $ curPtr' `plusPtr` 1
111 else do
112 n <- peek destOffPtr
113 chunkText <- unsafeSTToIO $ do
114 arr <- A.unsafeFreeze dest
115 return $! textP arr 0 (fromIntegral n)
116 lastPtr <- peek curPtrPtr
117 let left = lastPtr `minusPtr` curPtr
118 return $ Other chunkText (B.drop left bs)
119 _ -> do
120 -- We encountered the end of the buffer while decoding
121 n <- peek destOffPtr
122 codepoint <- peek codepointPtr
123 chunkText <- unsafeSTToIO $ do
124 arr <- A.unsafeFreeze dest
125 return $! textP arr 0 (fromIntegral n)
126 lastPtr <- peek curPtrPtr
127 let left = lastPtr `minusPtr` curPtr
128 return $ Some chunkText (B.drop left bs)
129 (decodeChunk codepoint state)
130 in loop (ptr `plusPtr` off)
131 desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
132 93
133ord :: Char -> Int 94ord :: Char -> Int
134ord (C# c#) = I# (ord# c#) 95ord (C# c#) = I# (ord# c#)
135{-# INLINE ord #-} 96{-# INLINE ord #-}
136 97
137
138unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int 98unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
139unsafeWrite marr i c 99unsafeWrite marr i c
140 | n < 0x10000 = do 100 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
141 A.unsafeWrite marr i (fromIntegral n) 101 return 1
142 return 1 102 | otherwise = do A.unsafeWrite marr i lo
143 | otherwise = do 103 A.unsafeWrite marr (i+1) hi
144 A.unsafeWrite marr i lo 104 return 2
145 A.unsafeWrite marr (i+1) hi
146 return 2
147 where n = ord c 105 where n = ord c
148 m = n - 0x10000 106 m = n - 0x10000
149 lo = fromIntegral $ (m `shiftR` 10) + 0xD800 107 lo = fromIntegral $ (m `shiftR` 10) + 0xD800