1 {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
3 -- This module lifts material from Brian O'Sullivan's text package
4 -- especially Data.Text.Encoding in order to define a pipes-appropriate
6 module Pipes.Text.Internal
12 import Control.Exception (evaluate, try)
13 #if __GLASGOW_HASKELL__ >= 702
14 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
15 import Control.Monad.ST (ST, runST)
17 import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
19 import Data.Bits ((.&.))
20 import Data.ByteString as B
21 import Data.ByteString.Internal as B
23 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
24 import Data.Text.Internal (Text(..), safe, textP)
25 import Data.Word (Word8, Word32)
26 import Foreign.C.Types (CSize)
27 import Foreign.ForeignPtr (withForeignPtr)
28 import Foreign.Marshal.Utils (with)
29 import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
30 import Foreign.Storable (Storable, peek, poke)
31 import GHC.Base hiding (ord)
33 import qualified Data.Text.Array as A
34 import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
35 import GHC.Word (Word8(..), Word16(..), Word32(..))
37 import Data.Text.Unsafe (unsafeDupablePerformIO)
39 #include "pipes_text_cbits.h"
41 -- | A stream oriented decoding result.
42 data Decoding = Some Text ByteString (ByteString -> Decoding)
43 | Other Text ByteString
44 instance Show Decoding where
45 showsPrec d (Some t bs _) = showParen (d > prec) $
46 showString "Some " . showsPrec prec' t .
47 showChar ' ' . showsPrec prec' bs .
49 where prec = 10; prec' = prec + 1
50 showsPrec d (Other t bs) = showParen (d > prec) $
51 showString "Other " . showsPrec prec' t .
52 showChar ' ' . showsPrec prec' bs .
54 where prec = 10; prec' = prec + 1
56 newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
57 newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
59 -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
60 -- encoded text that is known to be valid.
62 -- If the input contains any invalid UTF-8 data, an exception will be
63 -- thrown (either by this function or a continuation) that cannot be
64 -- caught in pure code. For more control over the handling of invalid
65 -- data, use 'streamDecodeUtf8With'.
66 streamDecodeUtf8 :: ByteString -> Decoding
67 streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode)
69 -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
71 streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding
72 streamDecodeUtf8With mErr = case mErr of
73 Nothing -> decodeWith False strictDecode
74 Just onErr -> decodeWith True onErr
76 -- We create a slightly larger than necessary buffer to accommodate a
77 -- potential surrogate pair started in the last buffer
78 decodeWith replace onErr = decodeChunk 0 0
80 decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
81 decodeChunk codepoint0 state0 bs@(PS fp off len) =
82 runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
84 decodeChunkToBuffer :: A.MArray s -> IO Decoding
85 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
86 with (0::CSize) $ \destOffPtr ->
87 with codepoint0 $ \codepointPtr ->
88 with state0 $ \statePtr ->
89 with nullPtr $ \curPtrPtr ->
90 let end = ptr `plusPtr` (off + len)
93 curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
94 curPtrPtr end codepointPtr statePtr
95 state <- peek statePtr
98 -- We encountered an encoding error
102 case onErr desc (Just x) of
103 Nothing -> loop $ curPtr' `plusPtr` 1
105 destOff <- peek destOffPtr
107 unsafeWrite dest (fromIntegral destOff) (safe c)
108 poke destOffPtr (destOff + fromIntegral w)
110 loop $ curPtr' `plusPtr` 1
113 chunkText <- unsafeSTToIO $ do
114 arr <- A.unsafeFreeze dest
115 return $! textP arr 0 (fromIntegral n)
116 lastPtr <- peek curPtrPtr
117 let left = lastPtr `minusPtr` curPtr
118 return $ Other chunkText (B.drop left bs)
120 -- We encountered the end of the buffer while decoding
122 codepoint <- peek codepointPtr
123 chunkText <- unsafeSTToIO $ do
124 arr <- A.unsafeFreeze dest
125 return $! textP arr 0 (fromIntegral n)
126 lastPtr <- peek curPtrPtr
127 let left = lastPtr `minusPtr` curPtr
128 return $ Some chunkText (B.drop left bs)
129 (decodeChunk codepoint state)
130 in loop (ptr `plusPtr` off)
131 desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
134 ord (C# c#) = I# (ord# c#)
138 unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
141 A.unsafeWrite marr i (fromIntegral n)
144 A.unsafeWrite marr i lo
145 A.unsafeWrite marr (i+1) hi
149 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
150 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
151 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
152 {-# INLINE unsafeWrite #-}
154 foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
155 :: MutableByteArray# s -> Ptr CSize
156 -> Ptr (Ptr Word8) -> Ptr Word8
157 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)