]>
Commit | Line | Data |
---|---|---|
409759e8 | 1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} |
2 | {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} | |
3 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | |
4 | ||
4da91c37 | 5 | {- | |
6 | This module lifts assorted materials from Brian O'Sullivan's text package | |
7 | especially @Data.Text.Encoding@ in order to define a pipes-appropriate | |
8 | 'streamDecodeUtf8' | |
9 | -} | |
409759e8 | 10 | module Pipes.Text.Internal.Decoding |
11 | ( Decoding(..) | |
12 | , streamDecodeUtf8 | |
13 | , decodeSomeUtf8 | |
14 | ) where | |
15 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | |
16 | import Control.Monad.ST (ST, runST) | |
17 | import Data.Bits ((.&.)) | |
18 | import Data.ByteString as B | |
19 | import Data.ByteString (ByteString) | |
20 | import Data.ByteString.Internal as B | |
21 | import Data.ByteString.Char8 as B8 | |
22 | import Data.Text (Text) | |
23 | import qualified Data.Text as T | |
24 | import qualified Data.Text.Encoding as TE | |
25 | import Data.Text.Encoding.Error () | |
26 | import Data.Text.Internal (Text, textP) | |
27 | import Foreign.C.Types (CSize) | |
28 | import Foreign.ForeignPtr (withForeignPtr) | |
29 | import Foreign.Marshal.Utils (with) | |
30 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | |
31 | import Foreign.Storable (Storable, peek, poke) | |
32 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) | |
33 | import GHC.Word (Word8, Word32) | |
34 | import qualified Data.Text.Array as A | |
35 | import Data.Word (Word8, Word16) | |
36 | import System.IO.Unsafe (unsafePerformIO) | |
37 | import qualified Control.Exception as Exc | |
38 | import Data.Bits ((.&.), (.|.), shiftL) | |
39 | import Data.Typeable | |
40 | import Control.Arrow (first) | |
41 | import Data.Maybe (catMaybes) | |
42 | #include "pipes_text_cbits.h" | |
43 | ||
44 | ||
45 | ||
4da91c37 | 46 | -- | A stream oriented decoding result. Distinct from the similar type in @Data.Text.Encoding@ |
47 | data Decoding = Some Text ByteString (ByteString -> Decoding) -- | Text, continuation and any undecoded fragment. | |
48 | | Other Text ByteString -- | Text followed by an undecodable ByteString | |
409759e8 | 49 | instance Show Decoding where |
50 | showsPrec d (Some t bs _) = showParen (d > prec) $ | |
51 | showString "Some " . showsPrec prec' t . | |
52 | showChar ' ' . showsPrec prec' bs . | |
53 | showString " _" | |
54 | where prec = 10; prec' = prec + 1 | |
55 | showsPrec d (Other t bs) = showParen (d > prec) $ | |
56 | showString "Other " . showsPrec prec' t . | |
57 | showChar ' ' . showsPrec prec' bs . | |
58 | showString " _" | |
59 | where prec = 10; prec' = prec + 1 | |
60 | ||
61 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | |
62 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | |
63 | ||
4da91c37 | 64 | -- | Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'. |
409759e8 | 65 | streamDecodeUtf8 :: ByteString -> Decoding |
66 | streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 | |
67 | where | |
68 | decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding | |
69 | decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = | |
70 | runST $ do marray <- A.new (len+1) | |
71 | unsafeIOToST (decodeChunkToBuffer marray) | |
72 | where | |
73 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | |
74 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | |
75 | with (0::CSize) $ \destOffPtr -> | |
76 | with codepoint0 $ \codepointPtr -> | |
77 | with state0 $ \statePtr -> | |
78 | with nullPtr $ \curPtrPtr -> | |
79 | do let end = ptr `plusPtr` (off + len) | |
80 | curPtr = ptr `plusPtr` off | |
81 | poke curPtrPtr curPtr | |
82 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | |
83 | state <- peek statePtr | |
84 | lastPtr <- peek curPtrPtr | |
85 | codepoint <- peek codepointPtr | |
86 | n <- peek destOffPtr | |
87 | chunkText <- mkText dest n | |
88 | let left = lastPtr `minusPtr` curPtr | |
89 | remaining = B.drop left bs | |
90 | accum = if T.null chunkText then B.append old remaining else remaining | |
91 | return $! case state of | |
92 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error | |
93 | _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) | |
94 | {-# INLINE decodeChunkToBuffer #-} | |
95 | {-# INLINE decodeChunkUtf8 #-} | |
96 | {-# INLINE streamDecodeUtf8 #-} | |
97 | ||
4da91c37 | 98 | -- | Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble |
409759e8 | 99 | decodeSomeUtf8 :: ByteString -> (Text, ByteString) |
100 | decodeSomeUtf8 bs@(PS fp off len) = runST $ do | |
101 | dest <- A.new (len+1) | |
102 | unsafeIOToST $ | |
103 | withForeignPtr fp $ \ptr -> | |
104 | with (0::CSize) $ \destOffPtr -> | |
105 | with (0::CodePoint) $ \codepointPtr -> | |
106 | with (0::DecoderState) $ \statePtr -> | |
107 | with nullPtr $ \curPtrPtr -> | |
108 | do let end = ptr `plusPtr` (off + len) | |
109 | curPtr = ptr `plusPtr` off | |
110 | poke curPtrPtr curPtr | |
111 | c_decode_utf8_with_state (A.maBA dest) destOffPtr | |
112 | curPtrPtr end codepointPtr statePtr | |
113 | state <- peek statePtr | |
114 | lastPtr <- peek curPtrPtr | |
115 | codepoint <- peek codepointPtr | |
116 | n <- peek destOffPtr | |
117 | chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | |
118 | return $! textP arr 0 (fromIntegral n) | |
119 | let left = lastPtr `minusPtr` curPtr | |
120 | remaining = B.drop left bs | |
121 | return $! (chunkText, remaining) | |
122 | {-# INLINE decodeSomeUtf8 #-} | |
123 | ||
124 | mkText :: A.MArray s -> CSize -> IO Text | |
125 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | |
126 | return $! textP arr 0 (fromIntegral n) | |
127 | {-# INLINE mkText #-} | |
128 | ||
129 | ord :: Char -> Int | |
130 | ord (C# c#) = I# (ord# c#) | |
131 | {-# INLINE ord #-} | |
132 | ||
133 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int | |
134 | unsafeWrite marr i c | |
135 | | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) | |
136 | return 1 | |
137 | | otherwise = do A.unsafeWrite marr i lo | |
138 | A.unsafeWrite marr (i+1) hi | |
139 | return 2 | |
140 | where n = ord c | |
141 | m = n - 0x10000 | |
142 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | |
143 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | |
144 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | |
145 | {-# INLINE shiftR #-} | |
146 | {-# INLINE unsafeWrite #-} | |
147 | ||
148 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | |
149 | :: MutableByteArray# s -> Ptr CSize | |
150 | -> Ptr (Ptr Word8) -> Ptr Word8 | |
151 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) |