aboutsummaryrefslogtreecommitdiffhomepage
path: root/Pipes/Text/Internal.hs
diff options
context:
space:
mode:
Diffstat (limited to 'Pipes/Text/Internal.hs')
-rw-r--r--Pipes/Text/Internal.hs157
1 files changed, 157 insertions, 0 deletions
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
new file mode 100644
index 0000000..05d9887
--- /dev/null
+++ b/Pipes/Text/Internal.hs
@@ -0,0 +1,157 @@
1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
2 UnliftedFFITypes #-}
3-- This module lifts material from Brian O'Sullivan's text package
4-- especially Data.Text.Encoding in order to define a pipes-appropriate
5-- streamDecodeUtf8
6module Pipes.Text.Internal
7 ( Decoding(..)
8 , streamDecodeUtf8With
9 , streamDecodeUtf8
10 ) where
11
12import Control.Exception (evaluate, try)
13#if __GLASGOW_HASKELL__ >= 702
14import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
15import Control.Monad.ST (ST, runST)
16#else
17import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
18#endif
19import Data.Bits ((.&.))
20import Data.ByteString as B
21import Data.ByteString.Internal as B
22import Data.Text ()
23import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
24import Data.Text.Internal (Text(..), safe, textP)
25import Data.Word (Word8, Word32)
26import Foreign.C.Types (CSize)
27import Foreign.ForeignPtr (withForeignPtr)
28import Foreign.Marshal.Utils (with)
29import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
30import Foreign.Storable (Storable, peek, poke)
31import GHC.Base hiding (ord)
32import GHC.Word
33import qualified Data.Text.Array as A
34import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
35import GHC.Word (Word8(..), Word16(..), Word32(..))
36
37import Data.Text.Unsafe (unsafeDupablePerformIO)
38
39#include "pipes_text_cbits.h"
40
41-- | A stream oriented decoding result.
42data Decoding = Some Text ByteString (ByteString -> Decoding)
43 | Other Text ByteString
44instance Show Decoding where
45 showsPrec d (Some t bs _) = showParen (d > prec) $
46 showString "Some " . showsPrec prec' t .
47 showChar ' ' . showsPrec prec' bs .
48 showString " _"
49 where prec = 10; prec' = prec + 1
50 showsPrec d (Other t bs) = showParen (d > prec) $
51 showString "Other " . showsPrec prec' t .
52 showChar ' ' . showsPrec prec' bs .
53 showString " _"
54 where prec = 10; prec' = prec + 1
55
56newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
57newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
58
59-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
60-- encoded text that is known to be valid.
61--
62-- If the input contains any invalid UTF-8 data, an exception will be
63-- thrown (either by this function or a continuation) that cannot be
64-- caught in pure code. For more control over the handling of invalid
65-- data, use 'streamDecodeUtf8With'.
66streamDecodeUtf8 :: ByteString -> Decoding
67streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode)
68
69-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
70-- encoded text.
71streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding
72streamDecodeUtf8With mErr = case mErr of
73 Nothing -> decodeWith False strictDecode
74 Just onErr -> decodeWith True onErr
75 where
76 -- We create a slightly larger than necessary buffer to accommodate a
77 -- potential surrogate pair started in the last buffer
78 decodeWith replace onErr = decodeChunk 0 0
79 where
80 decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
81 decodeChunk codepoint0 state0 bs@(PS fp off len) =
82 runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
83 where
84 decodeChunkToBuffer :: A.MArray s -> IO Decoding
85 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
86 with (0::CSize) $ \destOffPtr ->
87 with codepoint0 $ \codepointPtr ->
88 with state0 $ \statePtr ->
89 with nullPtr $ \curPtrPtr ->
90 let end = ptr `plusPtr` (off + len)
91 loop curPtr = do
92 poke curPtrPtr curPtr
93 curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
94 curPtrPtr end codepointPtr statePtr
95 state <- peek statePtr
96 case state of
97 UTF8_REJECT ->
98 -- We encountered an encoding error
99 if replace
100 then do
101 x <- peek curPtr'
102 case onErr desc (Just x) of
103 Nothing -> loop $ curPtr' `plusPtr` 1
104 Just c -> do
105 destOff <- peek destOffPtr
106 w <- unsafeSTToIO $
107 unsafeWrite dest (fromIntegral destOff) (safe c)
108 poke destOffPtr (destOff + fromIntegral w)
109 poke statePtr 0
110 loop $ curPtr' `plusPtr` 1
111 else do
112 n <- peek destOffPtr
113 chunkText <- unsafeSTToIO $ do
114 arr <- A.unsafeFreeze dest
115 return $! textP arr 0 (fromIntegral n)
116 lastPtr <- peek curPtrPtr
117 let left = lastPtr `minusPtr` curPtr
118 return $ Other chunkText (B.drop left bs)
119 _ -> do
120 -- We encountered the end of the buffer while decoding
121 n <- peek destOffPtr
122 codepoint <- peek codepointPtr
123 chunkText <- unsafeSTToIO $ do
124 arr <- A.unsafeFreeze dest
125 return $! textP arr 0 (fromIntegral n)
126 lastPtr <- peek curPtrPtr
127 let left = lastPtr `minusPtr` curPtr
128 return $ Some chunkText (B.drop left bs)
129 (decodeChunk codepoint state)
130 in loop (ptr `plusPtr` off)
131 desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
132
133ord :: Char -> Int
134ord (C# c#) = I# (ord# c#)
135{-# INLINE ord #-}
136
137
138unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
139unsafeWrite marr i c
140 | n < 0x10000 = do
141 A.unsafeWrite marr i (fromIntegral n)
142 return 1
143 | otherwise = do
144 A.unsafeWrite marr i lo
145 A.unsafeWrite marr (i+1) hi
146 return 2
147 where n = ord c
148 m = n - 0x10000
149 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
150 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
151 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
152{-# INLINE unsafeWrite #-}
153
154foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
155 :: MutableByteArray# s -> Ptr CSize
156 -> Ptr (Ptr Word8) -> Ptr Word8
157 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file