diff options
Diffstat (limited to 'Pipes/Text/Internal.hs')
-rw-r--r-- | Pipes/Text/Internal.hs | 152 |
1 files changed, 10 insertions, 142 deletions
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index bcee278..2530b23 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs | |||
@@ -1,147 +1,15 @@ | |||
1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} | 1 | module Pipes.Text.Internal |
2 | {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} | ||
3 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | ||
4 | |||
5 | -- This module lifts assorted materials from Brian O'Sullivan's text package | ||
6 | -- especially Data.Text.Encoding in order to define a pipes-appropriate | ||
7 | -- streamDecodeUtf8 | ||
8 | module Pipes.Text.Internal | ||
9 | ( Decoding(..) | 2 | ( Decoding(..) |
10 | , streamDecodeUtf8 | 3 | , streamDecodeUtf8 |
11 | , decodeSomeUtf8 | 4 | , decodeSomeUtf8 |
5 | , Codec(..) | ||
6 | , TextException(..) | ||
7 | , utf8 | ||
8 | , utf16_le | ||
9 | , utf16_be | ||
10 | , utf32_le | ||
11 | , utf32_be | ||
12 | ) where | 12 | ) where |
13 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | ||
14 | import Control.Monad.ST (ST, runST) | ||
15 | import Data.Bits ((.&.)) | ||
16 | import Data.ByteString as B | ||
17 | import Data.ByteString (ByteString) | ||
18 | import Data.ByteString.Internal as B | ||
19 | import Data.ByteString.Char8 as B8 | ||
20 | import Data.Text (Text) | ||
21 | import qualified Data.Text as T | ||
22 | import qualified Data.Text.Encoding as TE | ||
23 | import Data.Text.Encoding.Error () | ||
24 | import Data.Text.Internal (Text, textP) | ||
25 | import Foreign.C.Types (CSize) | ||
26 | import Foreign.ForeignPtr (withForeignPtr) | ||
27 | import Foreign.Marshal.Utils (with) | ||
28 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | ||
29 | import Foreign.Storable (Storable, peek, poke) | ||
30 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) | ||
31 | import GHC.Word (Word8, Word32) | ||
32 | import qualified Data.Text.Array as A | ||
33 | import Data.Word (Word8, Word16) | ||
34 | import System.IO.Unsafe (unsafePerformIO) | ||
35 | import qualified Control.Exception as Exc | ||
36 | import Data.Bits ((.&.), (.|.), shiftL) | ||
37 | import Data.Typeable | ||
38 | import Control.Arrow (first) | ||
39 | import Data.Maybe (catMaybes) | ||
40 | #include "pipes_text_cbits.h" | ||
41 | |||
42 | |||
43 | |||
44 | -- | A stream oriented decoding result. | ||
45 | data Decoding = Some Text ByteString (ByteString -> Decoding) | ||
46 | | Other Text ByteString | ||
47 | instance Show Decoding where | ||
48 | showsPrec d (Some t bs _) = showParen (d > prec) $ | ||
49 | showString "Some " . showsPrec prec' t . | ||
50 | showChar ' ' . showsPrec prec' bs . | ||
51 | showString " _" | ||
52 | where prec = 10; prec' = prec + 1 | ||
53 | showsPrec d (Other t bs) = showParen (d > prec) $ | ||
54 | showString "Other " . showsPrec prec' t . | ||
55 | showChar ' ' . showsPrec prec' bs . | ||
56 | showString " _" | ||
57 | where prec = 10; prec' = prec + 1 | ||
58 | |||
59 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | ||
60 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | ||
61 | |||
62 | streamDecodeUtf8 :: ByteString -> Decoding | ||
63 | streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 | ||
64 | where | ||
65 | decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding | ||
66 | decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = | ||
67 | runST $ do marray <- A.new (len+1) | ||
68 | unsafeIOToST (decodeChunkToBuffer marray) | ||
69 | where | ||
70 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | ||
71 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
72 | with (0::CSize) $ \destOffPtr -> | ||
73 | with codepoint0 $ \codepointPtr -> | ||
74 | with state0 $ \statePtr -> | ||
75 | with nullPtr $ \curPtrPtr -> | ||
76 | do let end = ptr `plusPtr` (off + len) | ||
77 | curPtr = ptr `plusPtr` off | ||
78 | poke curPtrPtr curPtr | ||
79 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | ||
80 | state <- peek statePtr | ||
81 | lastPtr <- peek curPtrPtr | ||
82 | codepoint <- peek codepointPtr | ||
83 | n <- peek destOffPtr | ||
84 | chunkText <- mkText dest n | ||
85 | let left = lastPtr `minusPtr` curPtr | ||
86 | remaining = B.drop left bs | ||
87 | accum = if T.null chunkText then B.append old remaining else remaining | ||
88 | return $! case state of | ||
89 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error | ||
90 | _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) | ||
91 | {-# INLINE decodeChunkToBuffer #-} | ||
92 | {-# INLINE decodeChunkUtf8 #-} | ||
93 | {-# INLINE streamDecodeUtf8 #-} | ||
94 | |||
95 | decodeSomeUtf8 :: ByteString -> (Text, ByteString) | ||
96 | decodeSomeUtf8 bs@(PS fp off len) = runST $ do | ||
97 | dest <- A.new (len+1) | ||
98 | unsafeIOToST $ | ||
99 | withForeignPtr fp $ \ptr -> | ||
100 | with (0::CSize) $ \destOffPtr -> | ||
101 | with (0::CodePoint) $ \codepointPtr -> | ||
102 | with (0::DecoderState) $ \statePtr -> | ||
103 | with nullPtr $ \curPtrPtr -> | ||
104 | do let end = ptr `plusPtr` (off + len) | ||
105 | curPtr = ptr `plusPtr` off | ||
106 | poke curPtrPtr curPtr | ||
107 | c_decode_utf8_with_state (A.maBA dest) destOffPtr | ||
108 | curPtrPtr end codepointPtr statePtr | ||
109 | state <- peek statePtr | ||
110 | lastPtr <- peek curPtrPtr | ||
111 | codepoint <- peek codepointPtr | ||
112 | n <- peek destOffPtr | ||
113 | chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
114 | return $! textP arr 0 (fromIntegral n) | ||
115 | let left = lastPtr `minusPtr` curPtr | ||
116 | remaining = B.drop left bs | ||
117 | return $! (chunkText, remaining) | ||
118 | {-# INLINE decodeSomeUtf8 #-} | ||
119 | |||
120 | mkText :: A.MArray s -> CSize -> IO Text | ||
121 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
122 | return $! textP arr 0 (fromIntegral n) | ||
123 | {-# INLINE mkText #-} | ||
124 | |||
125 | ord :: Char -> Int | ||
126 | ord (C# c#) = I# (ord# c#) | ||
127 | {-# INLINE ord #-} | ||
128 | |||
129 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int | ||
130 | unsafeWrite marr i c | ||
131 | | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) | ||
132 | return 1 | ||
133 | | otherwise = do A.unsafeWrite marr i lo | ||
134 | A.unsafeWrite marr (i+1) hi | ||
135 | return 2 | ||
136 | where n = ord c | ||
137 | m = n - 0x10000 | ||
138 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | ||
139 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | ||
140 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | ||
141 | {-# INLINE shiftR #-} | ||
142 | {-# INLINE unsafeWrite #-} | ||
143 | 13 | ||
144 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | 14 | import Pipes.Text.Internal.Decoding |
145 | :: MutableByteArray# s -> Ptr CSize | 15 | import Pipes.Text.Internal.Codec \ No newline at end of file |
146 | -> Ptr (Ptr Word8) -> Ptr Word8 | ||
147 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file | ||