]>
Commit | Line | Data |
---|---|---|
409759e8 | 1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} |
2 | {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} | |
3 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | |
4 | ||
b23136b1 | 5 | -- This module lifts assorted materials from Brian O'Sullivan's text package |
6 | -- especially @Data.Text.Encoding@ in order to define a pipes-appropriate | |
7 | -- 'streamDecodeUtf8' | |
8 | ||
409759e8 | 9 | module Pipes.Text.Internal.Decoding |
10 | ( Decoding(..) | |
11 | , streamDecodeUtf8 | |
12 | , decodeSomeUtf8 | |
13 | ) where | |
14 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | |
15 | import Control.Monad.ST (ST, runST) | |
16 | import Data.Bits ((.&.)) | |
17 | import Data.ByteString as B | |
18 | import Data.ByteString (ByteString) | |
19 | import Data.ByteString.Internal as B | |
20 | import Data.ByteString.Char8 as B8 | |
21 | import Data.Text (Text) | |
22 | import qualified Data.Text as T | |
23 | import qualified Data.Text.Encoding as TE | |
24 | import Data.Text.Encoding.Error () | |
25 | import Data.Text.Internal (Text, textP) | |
26 | import Foreign.C.Types (CSize) | |
27 | import Foreign.ForeignPtr (withForeignPtr) | |
28 | import Foreign.Marshal.Utils (with) | |
29 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | |
30 | import Foreign.Storable (Storable, peek, poke) | |
31 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) | |
32 | import GHC.Word (Word8, Word32) | |
33 | import qualified Data.Text.Array as A | |
34 | import Data.Word (Word8, Word16) | |
35 | import System.IO.Unsafe (unsafePerformIO) | |
36 | import qualified Control.Exception as Exc | |
37 | import Data.Bits ((.&.), (.|.), shiftL) | |
38 | import Data.Typeable | |
39 | import Control.Arrow (first) | |
40 | import Data.Maybe (catMaybes) | |
41 | #include "pipes_text_cbits.h" | |
42 | ||
43 | ||
44 | ||
b23136b1 | 45 | -- A stream oriented decoding result. Distinct from the similar type in Data.Text.Encoding |
46 | ||
47 | data Decoding = Some Text ByteString (ByteString -> Decoding) | |
48 | -- Text, continuation and any undecoded fragment. | |
49 | | Other Text ByteString | |
50 | -- Text followed by an undecodable ByteString | |
51 | ||
409759e8 | 52 | instance Show Decoding where |
53 | showsPrec d (Some t bs _) = showParen (d > prec) $ | |
54 | showString "Some " . showsPrec prec' t . | |
55 | showChar ' ' . showsPrec prec' bs . | |
56 | showString " _" | |
57 | where prec = 10; prec' = prec + 1 | |
58 | showsPrec d (Other t bs) = showParen (d > prec) $ | |
59 | showString "Other " . showsPrec prec' t . | |
60 | showChar ' ' . showsPrec prec' bs . | |
61 | showString " _" | |
62 | where prec = 10; prec' = prec + 1 | |
63 | ||
64 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | |
65 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | |
66 | ||
b23136b1 | 67 | -- Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'. |
409759e8 | 68 | streamDecodeUtf8 :: ByteString -> Decoding |
69 | streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 | |
70 | where | |
71 | decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding | |
72 | decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = | |
73 | runST $ do marray <- A.new (len+1) | |
74 | unsafeIOToST (decodeChunkToBuffer marray) | |
75 | where | |
76 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | |
77 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | |
78 | with (0::CSize) $ \destOffPtr -> | |
79 | with codepoint0 $ \codepointPtr -> | |
80 | with state0 $ \statePtr -> | |
81 | with nullPtr $ \curPtrPtr -> | |
82 | do let end = ptr `plusPtr` (off + len) | |
83 | curPtr = ptr `plusPtr` off | |
84 | poke curPtrPtr curPtr | |
85 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | |
86 | state <- peek statePtr | |
87 | lastPtr <- peek curPtrPtr | |
88 | codepoint <- peek codepointPtr | |
89 | n <- peek destOffPtr | |
90 | chunkText <- mkText dest n | |
91 | let left = lastPtr `minusPtr` curPtr | |
92 | remaining = B.drop left bs | |
93 | accum = if T.null chunkText then B.append old remaining else remaining | |
94 | return $! case state of | |
95 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error | |
96 | _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) | |
97 | {-# INLINE decodeChunkToBuffer #-} | |
98 | {-# INLINE decodeChunkUtf8 #-} | |
99 | {-# INLINE streamDecodeUtf8 #-} | |
100 | ||
b23136b1 | 101 | -- Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble |
409759e8 | 102 | decodeSomeUtf8 :: ByteString -> (Text, ByteString) |
103 | decodeSomeUtf8 bs@(PS fp off len) = runST $ do | |
104 | dest <- A.new (len+1) | |
105 | unsafeIOToST $ | |
106 | withForeignPtr fp $ \ptr -> | |
107 | with (0::CSize) $ \destOffPtr -> | |
108 | with (0::CodePoint) $ \codepointPtr -> | |
109 | with (0::DecoderState) $ \statePtr -> | |
110 | with nullPtr $ \curPtrPtr -> | |
111 | do let end = ptr `plusPtr` (off + len) | |
112 | curPtr = ptr `plusPtr` off | |
113 | poke curPtrPtr curPtr | |
114 | c_decode_utf8_with_state (A.maBA dest) destOffPtr | |
115 | curPtrPtr end codepointPtr statePtr | |
116 | state <- peek statePtr | |
117 | lastPtr <- peek curPtrPtr | |
118 | codepoint <- peek codepointPtr | |
119 | n <- peek destOffPtr | |
120 | chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | |
121 | return $! textP arr 0 (fromIntegral n) | |
122 | let left = lastPtr `minusPtr` curPtr | |
123 | remaining = B.drop left bs | |
124 | return $! (chunkText, remaining) | |
125 | {-# INLINE decodeSomeUtf8 #-} | |
126 | ||
127 | mkText :: A.MArray s -> CSize -> IO Text | |
128 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | |
129 | return $! textP arr 0 (fromIntegral n) | |
130 | {-# INLINE mkText #-} | |
131 | ||
132 | ord :: Char -> Int | |
133 | ord (C# c#) = I# (ord# c#) | |
134 | {-# INLINE ord #-} | |
135 | ||
136 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int | |
137 | unsafeWrite marr i c | |
138 | | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) | |
139 | return 1 | |
140 | | otherwise = do A.unsafeWrite marr i lo | |
141 | A.unsafeWrite marr (i+1) hi | |
142 | return 2 | |
143 | where n = ord c | |
144 | m = n - 0x10000 | |
145 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | |
146 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | |
147 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | |
148 | {-# INLINE shiftR #-} | |
149 | {-# INLINE unsafeWrite #-} | |
150 | ||
151 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | |
152 | :: MutableByteArray# s -> Ptr CSize | |
153 | -> Ptr (Ptr Word8) -> Ptr Word8 | |
154 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) |