1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
|
{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
UnliftedFFITypes #-}
-- This module lifts material from Brian O'Sullivan's text package
-- especially Data.Text.Encoding in order to define a pipes-appropriate
-- streamDecodeUtf8
module Pipes.Text.Internal
( Decoding(..)
, streamDecodeUtf8With
, streamDecodeUtf8
) where
import Control.Exception (evaluate, try)
#if __GLASGOW_HASKELL__ >= 702
import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
import Control.Monad.ST (ST, runST)
#else
import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
#endif
import Data.Bits ((.&.))
import Data.ByteString as B
import Data.ByteString.Internal as B
import Data.Text ()
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
import Data.Text.Internal (Text(..), safe, textP)
import Data.Word (Word8, Word32)
import Foreign.C.Types (CSize)
import Foreign.ForeignPtr (withForeignPtr)
import Foreign.Marshal.Utils (with)
import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
import Foreign.Storable (Storable, peek, poke)
import GHC.Base hiding (ord)
import GHC.Word
import qualified Data.Text.Array as A
import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
import GHC.Word (Word8(..), Word16(..), Word32(..))
import Data.Text.Unsafe (unsafeDupablePerformIO)
#include "pipes_text_cbits.h"
-- | A stream oriented decoding result.
data Decoding = Some Text ByteString (ByteString -> Decoding)
| Other Text ByteString
instance Show Decoding where
showsPrec d (Some t bs _) = showParen (d > prec) $
showString "Some " . showsPrec prec' t .
showChar ' ' . showsPrec prec' bs .
showString " _"
where prec = 10; prec' = prec + 1
showsPrec d (Other t bs) = showParen (d > prec) $
showString "Other " . showsPrec prec' t .
showChar ' ' . showsPrec prec' bs .
showString " _"
where prec = 10; prec' = prec + 1
newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
-- encoded text that is known to be valid.
--
-- If the input contains any invalid UTF-8 data, an exception will be
-- thrown (either by this function or a continuation) that cannot be
-- caught in pure code. For more control over the handling of invalid
-- data, use 'streamDecodeUtf8With'.
streamDecodeUtf8 :: ByteString -> Decoding
streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode)
-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
-- encoded text.
streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding
streamDecodeUtf8With mErr = case mErr of
Nothing -> decodeWith False strictDecode
Just onErr -> decodeWith True onErr
where
-- We create a slightly larger than necessary buffer to accommodate a
-- potential surrogate pair started in the last buffer
decodeWith replace onErr = decodeChunk 0 0
where
decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
decodeChunk codepoint0 state0 bs@(PS fp off len) =
runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
where
decodeChunkToBuffer :: A.MArray s -> IO Decoding
decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
with (0::CSize) $ \destOffPtr ->
with codepoint0 $ \codepointPtr ->
with state0 $ \statePtr ->
with nullPtr $ \curPtrPtr ->
let end = ptr `plusPtr` (off + len)
loop curPtr = do
poke curPtrPtr curPtr
curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
curPtrPtr end codepointPtr statePtr
state <- peek statePtr
case state of
UTF8_REJECT ->
-- We encountered an encoding error
if replace
then do
x <- peek curPtr'
case onErr desc (Just x) of
Nothing -> loop $ curPtr' `plusPtr` 1
Just c -> do
destOff <- peek destOffPtr
w <- unsafeSTToIO $
unsafeWrite dest (fromIntegral destOff) (safe c)
poke destOffPtr (destOff + fromIntegral w)
poke statePtr 0
loop $ curPtr' `plusPtr` 1
else do
n <- peek destOffPtr
chunkText <- unsafeSTToIO $ do
arr <- A.unsafeFreeze dest
return $! textP arr 0 (fromIntegral n)
lastPtr <- peek curPtrPtr
let left = lastPtr `minusPtr` curPtr
return $ Other chunkText (B.drop left bs)
_ -> do
-- We encountered the end of the buffer while decoding
n <- peek destOffPtr
codepoint <- peek codepointPtr
chunkText <- unsafeSTToIO $ do
arr <- A.unsafeFreeze dest
return $! textP arr 0 (fromIntegral n)
lastPtr <- peek curPtrPtr
let left = lastPtr `minusPtr` curPtr
return $ Some chunkText (B.drop left bs)
(decodeChunk codepoint state)
in loop (ptr `plusPtr` off)
desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
ord :: Char -> Int
ord (C# c#) = I# (ord# c#)
{-# INLINE ord #-}
unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
unsafeWrite marr i c
| n < 0x10000 = do
A.unsafeWrite marr i (fromIntegral n)
return 1
| otherwise = do
A.unsafeWrite marr i lo
A.unsafeWrite marr (i+1) hi
return 2
where n = ord c
m = n - 0x10000
lo = fromIntegral $ (m `shiftR` 10) + 0xD800
hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
{-# INLINE unsafeWrite #-}
foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
:: MutableByteArray# s -> Ptr CSize
-> Ptr (Ptr Word8) -> Ptr Word8
-> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
|