]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Internal.hs
05d9887c0459e2affc0b1cd34c43eef3ff7537f4
[github/fretlink/text-pipes.git] / Pipes / Text / Internal.hs
1 {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
2 UnliftedFFITypes #-}
3 -- This module lifts material from Brian O'Sullivan's text package
4 -- especially Data.Text.Encoding in order to define a pipes-appropriate
5 -- streamDecodeUtf8
6 module Pipes.Text.Internal
7 ( Decoding(..)
8 , streamDecodeUtf8With
9 , streamDecodeUtf8
10 ) where
11
12 import Control.Exception (evaluate, try)
13 #if __GLASGOW_HASKELL__ >= 702
14 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
15 import Control.Monad.ST (ST, runST)
16 #else
17 import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
18 #endif
19 import Data.Bits ((.&.))
20 import Data.ByteString as B
21 import Data.ByteString.Internal as B
22 import Data.Text ()
23 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
24 import Data.Text.Internal (Text(..), safe, textP)
25 import Data.Word (Word8, Word32)
26 import Foreign.C.Types (CSize)
27 import Foreign.ForeignPtr (withForeignPtr)
28 import Foreign.Marshal.Utils (with)
29 import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
30 import Foreign.Storable (Storable, peek, poke)
31 import GHC.Base hiding (ord)
32 import GHC.Word
33 import qualified Data.Text.Array as A
34 import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
35 import GHC.Word (Word8(..), Word16(..), Word32(..))
36
37 import Data.Text.Unsafe (unsafeDupablePerformIO)
38
39 #include "pipes_text_cbits.h"
40
41 -- | A stream oriented decoding result.
42 data Decoding = Some Text ByteString (ByteString -> Decoding)
43 | Other Text ByteString
44 instance Show Decoding where
45 showsPrec d (Some t bs _) = showParen (d > prec) $
46 showString "Some " . showsPrec prec' t .
47 showChar ' ' . showsPrec prec' bs .
48 showString " _"
49 where prec = 10; prec' = prec + 1
50 showsPrec d (Other t bs) = showParen (d > prec) $
51 showString "Other " . showsPrec prec' t .
52 showChar ' ' . showsPrec prec' bs .
53 showString " _"
54 where prec = 10; prec' = prec + 1
55
56 newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
57 newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
58
59 -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
60 -- encoded text that is known to be valid.
61 --
62 -- If the input contains any invalid UTF-8 data, an exception will be
63 -- thrown (either by this function or a continuation) that cannot be
64 -- caught in pure code. For more control over the handling of invalid
65 -- data, use 'streamDecodeUtf8With'.
66 streamDecodeUtf8 :: ByteString -> Decoding
67 streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode)
68
69 -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
70 -- encoded text.
71 streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding
72 streamDecodeUtf8With mErr = case mErr of
73 Nothing -> decodeWith False strictDecode
74 Just onErr -> decodeWith True onErr
75 where
76 -- We create a slightly larger than necessary buffer to accommodate a
77 -- potential surrogate pair started in the last buffer
78 decodeWith replace onErr = decodeChunk 0 0
79 where
80 decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
81 decodeChunk codepoint0 state0 bs@(PS fp off len) =
82 runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
83 where
84 decodeChunkToBuffer :: A.MArray s -> IO Decoding
85 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
86 with (0::CSize) $ \destOffPtr ->
87 with codepoint0 $ \codepointPtr ->
88 with state0 $ \statePtr ->
89 with nullPtr $ \curPtrPtr ->
90 let end = ptr `plusPtr` (off + len)
91 loop curPtr = do
92 poke curPtrPtr curPtr
93 curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
94 curPtrPtr end codepointPtr statePtr
95 state <- peek statePtr
96 case state of
97 UTF8_REJECT ->
98 -- We encountered an encoding error
99 if replace
100 then do
101 x <- peek curPtr'
102 case onErr desc (Just x) of
103 Nothing -> loop $ curPtr' `plusPtr` 1
104 Just c -> do
105 destOff <- peek destOffPtr
106 w <- unsafeSTToIO $
107 unsafeWrite dest (fromIntegral destOff) (safe c)
108 poke destOffPtr (destOff + fromIntegral w)
109 poke statePtr 0
110 loop $ curPtr' `plusPtr` 1
111 else do
112 n <- peek destOffPtr
113 chunkText <- unsafeSTToIO $ do
114 arr <- A.unsafeFreeze dest
115 return $! textP arr 0 (fromIntegral n)
116 lastPtr <- peek curPtrPtr
117 let left = lastPtr `minusPtr` curPtr
118 return $ Other chunkText (B.drop left bs)
119 _ -> do
120 -- We encountered the end of the buffer while decoding
121 n <- peek destOffPtr
122 codepoint <- peek codepointPtr
123 chunkText <- unsafeSTToIO $ do
124 arr <- A.unsafeFreeze dest
125 return $! textP arr 0 (fromIntegral n)
126 lastPtr <- peek curPtrPtr
127 let left = lastPtr `minusPtr` curPtr
128 return $ Some chunkText (B.drop left bs)
129 (decodeChunk codepoint state)
130 in loop (ptr `plusPtr` off)
131 desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
132
133 ord :: Char -> Int
134 ord (C# c#) = I# (ord# c#)
135 {-# INLINE ord #-}
136
137
138 unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
139 unsafeWrite marr i c
140 | n < 0x10000 = do
141 A.unsafeWrite marr i (fromIntegral n)
142 return 1
143 | otherwise = do
144 A.unsafeWrite marr i lo
145 A.unsafeWrite marr (i+1) hi
146 return 2
147 where n = ord c
148 m = n - 0x10000
149 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
150 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
151 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
152 {-# INLINE unsafeWrite #-}
153
154 foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
155 :: MutableByteArray# s -> Ptr CSize
156 -> Ptr (Ptr Word8) -> Ptr Word8
157 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)