diff options
author | michaelt <what_is_it_to_do_anything@yahoo.com> | 2014-01-14 22:17:25 -0500 |
---|---|---|
committer | michaelt <what_is_it_to_do_anything@yahoo.com> | 2014-01-14 22:17:25 -0500 |
commit | 7381c94f47c76833972565ee8d15d86216b214ce (patch) | |
tree | 38ddadda59a3808422fc432d37b886c456adcb1d /Pipes/Text | |
parent | ca6f90a05bee6471d6837d629ddaee9b0a75bd50 (diff) | |
parent | 3694350ac7b9c42fd64e0092a74cf0471a080058 (diff) | |
download | text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.gz text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.zst text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.zip |
merge home made decodeUtf8
Diffstat (limited to 'Pipes/Text')
-rw-r--r-- | Pipes/Text/Internal.hs | 163 |
1 files changed, 163 insertions, 0 deletions
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs new file mode 100644 index 0000000..7e5b044 --- /dev/null +++ b/Pipes/Text/Internal.hs | |||
@@ -0,0 +1,163 @@ | |||
1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, | ||
2 | UnliftedFFITypes #-} | ||
3 | -- This module lifts assorted materials from Brian O'Sullivan's text package | ||
4 | -- especially Data.Text.Encoding in order to define a pipes-appropriate | ||
5 | -- streamDecodeUtf8 | ||
6 | module Pipes.Text.Internal | ||
7 | ( Decoding(..) | ||
8 | , streamDecodeUtf8 | ||
9 | , decodeSomeUtf8 | ||
10 | ) where | ||
11 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | ||
12 | import Control.Monad.ST (ST, runST) | ||
13 | import Data.Bits ((.&.)) | ||
14 | import Data.ByteString as B | ||
15 | import Data.ByteString.Internal as B | ||
16 | import qualified Data.Text as T (null) | ||
17 | import Data.Text.Encoding.Error () | ||
18 | import Data.Text.Internal (Text, textP) | ||
19 | import Foreign.C.Types (CSize) | ||
20 | import Foreign.ForeignPtr (withForeignPtr) | ||
21 | import Foreign.Marshal.Utils (with) | ||
22 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | ||
23 | import Foreign.Storable (Storable, peek, poke) | ||
24 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) | ||
25 | import GHC.Word (Word8, Word32) | ||
26 | import qualified Data.Text.Array as A | ||
27 | |||
28 | #include "pipes_text_cbits.h" | ||
29 | |||
30 | -- | A stream oriented decoding result. | ||
31 | data Decoding = Some Text ByteString (ByteString -> Decoding) | ||
32 | | Other Text ByteString | ||
33 | instance Show Decoding where | ||
34 | showsPrec d (Some t bs _) = showParen (d > prec) $ | ||
35 | showString "Some " . showsPrec prec' t . | ||
36 | showChar ' ' . showsPrec prec' bs . | ||
37 | showString " _" | ||
38 | where prec = 10; prec' = prec + 1 | ||
39 | showsPrec d (Other t bs) = showParen (d > prec) $ | ||
40 | showString "Other " . showsPrec prec' t . | ||
41 | showChar ' ' . showsPrec prec' bs . | ||
42 | showString " _" | ||
43 | where prec = 10; prec' = prec + 1 | ||
44 | |||
45 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | ||
46 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | ||
47 | |||
48 | streamDecodeUtf8 :: ByteString -> Decoding | ||
49 | streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 | ||
50 | where | ||
51 | decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding | ||
52 | decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = | ||
53 | runST $ do marray <- A.new (len+1) | ||
54 | unsafeIOToST (decodeChunkToBuffer marray) | ||
55 | where | ||
56 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | ||
57 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
58 | with (0::CSize) $ \destOffPtr -> | ||
59 | with codepoint0 $ \codepointPtr -> | ||
60 | with state0 $ \statePtr -> | ||
61 | with nullPtr $ \curPtrPtr -> | ||
62 | do let end = ptr `plusPtr` (off + len) | ||
63 | curPtr = ptr `plusPtr` off | ||
64 | poke curPtrPtr curPtr | ||
65 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | ||
66 | state <- peek statePtr | ||
67 | lastPtr <- peek curPtrPtr | ||
68 | codepoint <- peek codepointPtr | ||
69 | n <- peek destOffPtr | ||
70 | chunkText <- mkText dest n | ||
71 | let left = lastPtr `minusPtr` curPtr | ||
72 | remaining = B.drop left bs | ||
73 | accum = if T.null chunkText then B.append old remaining else remaining | ||
74 | return $! case state of | ||
75 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error | ||
76 | _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) | ||
77 | {-# INLINE decodeChunkToBuffer #-} | ||
78 | {-# INLINE decodeChunkUtf8 #-} | ||
79 | {-# INLINE streamDecodeUtf8 #-} | ||
80 | |||
81 | decodeSomeUtf8 :: ByteString -> (Text, ByteString) | ||
82 | decodeSomeUtf8 bs@(PS fp off len) = runST $ do | ||
83 | dest <- A.new (len+1) | ||
84 | unsafeIOToST $ | ||
85 | withForeignPtr fp $ \ptr -> | ||
86 | with (0::CSize) $ \destOffPtr -> | ||
87 | with (0::CodePoint) $ \codepointPtr -> | ||
88 | with (0::DecoderState) $ \statePtr -> | ||
89 | with nullPtr $ \curPtrPtr -> | ||
90 | do let end = ptr `plusPtr` (off + len) | ||
91 | curPtr = ptr `plusPtr` off | ||
92 | poke curPtrPtr curPtr | ||
93 | c_decode_utf8_with_state (A.maBA dest) destOffPtr | ||
94 | curPtrPtr end codepointPtr statePtr | ||
95 | state <- peek statePtr | ||
96 | lastPtr <- peek curPtrPtr | ||
97 | codepoint <- peek codepointPtr | ||
98 | n <- peek destOffPtr | ||
99 | chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
100 | return $! textP arr 0 (fromIntegral n) | ||
101 | let left = lastPtr `minusPtr` curPtr | ||
102 | remaining = B.drop left bs | ||
103 | return $! (chunkText, remaining) | ||
104 | {-# INLINE decodeSomeUtf8 #-} | ||
105 | |||
106 | -- decodeSomeUtf8 :: ByteString -> (Text, ByteString) | ||
107 | -- decodeSomeUtf8 bs@(PS fp off len) = | ||
108 | -- runST $ do marray <- A.new (len+1) | ||
109 | -- unsafeIOToST (decodeChunkToBuffer marray) | ||
110 | -- | ||
111 | -- where | ||
112 | -- decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString) | ||
113 | -- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
114 | -- with (0::CSize) $ \destOffPtr -> | ||
115 | -- with (0::CodePoint) $ \codepointPtr -> | ||
116 | -- with (0::DecoderState) $ \statePtr -> | ||
117 | -- with nullPtr $ \curPtrPtr -> | ||
118 | -- do let end = ptr `plusPtr` (off + len) | ||
119 | -- curPtr = ptr `plusPtr` off | ||
120 | -- poke curPtrPtr curPtr | ||
121 | -- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | ||
122 | -- state <- peek statePtr | ||
123 | -- lastPtr <- peek curPtrPtr | ||
124 | -- codepoint <- peek codepointPtr | ||
125 | -- n <- peek destOffPtr | ||
126 | -- chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
127 | -- return $! textP arr 0 (fromIntegral n) | ||
128 | -- let left = lastPtr `minusPtr` curPtr | ||
129 | -- remaining = B.drop left bs | ||
130 | -- return $! (chunkText, remaining) | ||
131 | -- {-# INLINE decodeChunkToBuffer #-} | ||
132 | -- {-# INLINE decodeSomeUtf8 #-} | ||
133 | |||
134 | |||
135 | |||
136 | mkText :: A.MArray s -> CSize -> IO Text | ||
137 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
138 | return $! textP arr 0 (fromIntegral n) | ||
139 | {-# INLINE mkText #-} | ||
140 | |||
141 | ord :: Char -> Int | ||
142 | ord (C# c#) = I# (ord# c#) | ||
143 | {-# INLINE ord #-} | ||
144 | |||
145 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int | ||
146 | unsafeWrite marr i c | ||
147 | | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) | ||
148 | return 1 | ||
149 | | otherwise = do A.unsafeWrite marr i lo | ||
150 | A.unsafeWrite marr (i+1) hi | ||
151 | return 2 | ||
152 | where n = ord c | ||
153 | m = n - 0x10000 | ||
154 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | ||
155 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | ||
156 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | ||
157 | {-# INLINE shiftR #-} | ||
158 | {-# INLINE unsafeWrite #-} | ||
159 | |||
160 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | ||
161 | :: MutableByteArray# s -> Ptr CSize | ||
162 | -> Ptr (Ptr Word8) -> Ptr Word8 | ||
163 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file | ||