diff options
author | michaelt <what_is_it_to_do_anything@yahoo.com> | 2013-12-23 13:02:49 -0500 |
---|---|---|
committer | michaelt <what_is_it_to_do_anything@yahoo.com> | 2013-12-23 13:02:49 -0500 |
commit | 8c48280926efffc0ca52a5d9ca796d639d053379 (patch) | |
tree | 972ca8955b5581d634663424e973e56fa4487fe5 /Pipes/Text | |
parent | 8853a440e37523bae8cb46827d0d2d356bad5c46 (diff) | |
download | text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.gz text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.zst text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.zip |
variant using text internals in place of text streamDecodeUtf8
Diffstat (limited to 'Pipes/Text')
-rw-r--r-- | Pipes/Text/Internal.hs | 157 |
1 files changed, 157 insertions, 0 deletions
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs new file mode 100644 index 0000000..05d9887 --- /dev/null +++ b/Pipes/Text/Internal.hs | |||
@@ -0,0 +1,157 @@ | |||
1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, | ||
2 | UnliftedFFITypes #-} | ||
3 | -- This module lifts material from Brian O'Sullivan's text package | ||
4 | -- especially Data.Text.Encoding in order to define a pipes-appropriate | ||
5 | -- streamDecodeUtf8 | ||
6 | module Pipes.Text.Internal | ||
7 | ( Decoding(..) | ||
8 | , streamDecodeUtf8With | ||
9 | , streamDecodeUtf8 | ||
10 | ) where | ||
11 | |||
12 | import Control.Exception (evaluate, try) | ||
13 | #if __GLASGOW_HASKELL__ >= 702 | ||
14 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | ||
15 | import Control.Monad.ST (ST, runST) | ||
16 | #else | ||
17 | import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST) | ||
18 | #endif | ||
19 | import Data.Bits ((.&.)) | ||
20 | import Data.ByteString as B | ||
21 | import Data.ByteString.Internal as B | ||
22 | import Data.Text () | ||
23 | import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) | ||
24 | import Data.Text.Internal (Text(..), safe, textP) | ||
25 | import Data.Word (Word8, Word32) | ||
26 | import Foreign.C.Types (CSize) | ||
27 | import Foreign.ForeignPtr (withForeignPtr) | ||
28 | import Foreign.Marshal.Utils (with) | ||
29 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | ||
30 | import Foreign.Storable (Storable, peek, poke) | ||
31 | import GHC.Base hiding (ord) | ||
32 | import GHC.Word | ||
33 | import qualified Data.Text.Array as A | ||
34 | import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#) | ||
35 | import GHC.Word (Word8(..), Word16(..), Word32(..)) | ||
36 | |||
37 | import Data.Text.Unsafe (unsafeDupablePerformIO) | ||
38 | |||
39 | #include "pipes_text_cbits.h" | ||
40 | |||
41 | -- | A stream oriented decoding result. | ||
42 | data Decoding = Some Text ByteString (ByteString -> Decoding) | ||
43 | | Other Text ByteString | ||
44 | instance Show Decoding where | ||
45 | showsPrec d (Some t bs _) = showParen (d > prec) $ | ||
46 | showString "Some " . showsPrec prec' t . | ||
47 | showChar ' ' . showsPrec prec' bs . | ||
48 | showString " _" | ||
49 | where prec = 10; prec' = prec + 1 | ||
50 | showsPrec d (Other t bs) = showParen (d > prec) $ | ||
51 | showString "Other " . showsPrec prec' t . | ||
52 | showChar ' ' . showsPrec prec' bs . | ||
53 | showString " _" | ||
54 | where prec = 10; prec' = prec + 1 | ||
55 | |||
56 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | ||
57 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | ||
58 | |||
59 | -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 | ||
60 | -- encoded text that is known to be valid. | ||
61 | -- | ||
62 | -- If the input contains any invalid UTF-8 data, an exception will be | ||
63 | -- thrown (either by this function or a continuation) that cannot be | ||
64 | -- caught in pure code. For more control over the handling of invalid | ||
65 | -- data, use 'streamDecodeUtf8With'. | ||
66 | streamDecodeUtf8 :: ByteString -> Decoding | ||
67 | streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode) | ||
68 | |||
69 | -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 | ||
70 | -- encoded text. | ||
71 | streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding | ||
72 | streamDecodeUtf8With mErr = case mErr of | ||
73 | Nothing -> decodeWith False strictDecode | ||
74 | Just onErr -> decodeWith True onErr | ||
75 | where | ||
76 | -- We create a slightly larger than necessary buffer to accommodate a | ||
77 | -- potential surrogate pair started in the last buffer | ||
78 | decodeWith replace onErr = decodeChunk 0 0 | ||
79 | where | ||
80 | decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding | ||
81 | decodeChunk codepoint0 state0 bs@(PS fp off len) = | ||
82 | runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1) | ||
83 | where | ||
84 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | ||
85 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
86 | with (0::CSize) $ \destOffPtr -> | ||
87 | with codepoint0 $ \codepointPtr -> | ||
88 | with state0 $ \statePtr -> | ||
89 | with nullPtr $ \curPtrPtr -> | ||
90 | let end = ptr `plusPtr` (off + len) | ||
91 | loop curPtr = do | ||
92 | poke curPtrPtr curPtr | ||
93 | curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr | ||
94 | curPtrPtr end codepointPtr statePtr | ||
95 | state <- peek statePtr | ||
96 | case state of | ||
97 | UTF8_REJECT -> | ||
98 | -- We encountered an encoding error | ||
99 | if replace | ||
100 | then do | ||
101 | x <- peek curPtr' | ||
102 | case onErr desc (Just x) of | ||
103 | Nothing -> loop $ curPtr' `plusPtr` 1 | ||
104 | Just c -> do | ||
105 | destOff <- peek destOffPtr | ||
106 | w <- unsafeSTToIO $ | ||
107 | unsafeWrite dest (fromIntegral destOff) (safe c) | ||
108 | poke destOffPtr (destOff + fromIntegral w) | ||
109 | poke statePtr 0 | ||
110 | loop $ curPtr' `plusPtr` 1 | ||
111 | else do | ||
112 | n <- peek destOffPtr | ||
113 | chunkText <- unsafeSTToIO $ do | ||
114 | arr <- A.unsafeFreeze dest | ||
115 | return $! textP arr 0 (fromIntegral n) | ||
116 | lastPtr <- peek curPtrPtr | ||
117 | let left = lastPtr `minusPtr` curPtr | ||
118 | return $ Other chunkText (B.drop left bs) | ||
119 | _ -> do | ||
120 | -- We encountered the end of the buffer while decoding | ||
121 | n <- peek destOffPtr | ||
122 | codepoint <- peek codepointPtr | ||
123 | chunkText <- unsafeSTToIO $ do | ||
124 | arr <- A.unsafeFreeze dest | ||
125 | return $! textP arr 0 (fromIntegral n) | ||
126 | lastPtr <- peek curPtrPtr | ||
127 | let left = lastPtr `minusPtr` curPtr | ||
128 | return $ Some chunkText (B.drop left bs) | ||
129 | (decodeChunk codepoint state) | ||
130 | in loop (ptr `plusPtr` off) | ||
131 | desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream" | ||
132 | |||
133 | ord :: Char -> Int | ||
134 | ord (C# c#) = I# (ord# c#) | ||
135 | {-# INLINE ord #-} | ||
136 | |||
137 | |||
138 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int | ||
139 | unsafeWrite marr i c | ||
140 | | n < 0x10000 = do | ||
141 | A.unsafeWrite marr i (fromIntegral n) | ||
142 | return 1 | ||
143 | | otherwise = do | ||
144 | A.unsafeWrite marr i lo | ||
145 | A.unsafeWrite marr (i+1) hi | ||
146 | return 2 | ||
147 | where n = ord c | ||
148 | m = n - 0x10000 | ||
149 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | ||
150 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | ||
151 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | ||
152 | {-# INLINE unsafeWrite #-} | ||
153 | |||
154 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | ||
155 | :: MutableByteArray# s -> Ptr CSize | ||
156 | -> Ptr (Ptr Word8) -> Ptr Word8 | ||
157 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file | ||