1 {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2 {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
3 {-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
5 -- This module lifts assorted materials from Brian O'Sullivan's text package
6 -- especially Data.Text.Encoding in order to define a pipes-appropriate
8 module Pipes.Text.Internal
13 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
14 import Control.Monad.ST (ST, runST)
15 import Data.Bits ((.&.))
16 import Data.ByteString as B
17 import Data.ByteString (ByteString)
18 import Data.ByteString.Internal as B
19 import Data.ByteString.Char8 as B8
20 import Data.Text (Text)
21 import qualified Data.Text as T
22 import qualified Data.Text.Encoding as TE
23 import Data.Text.Encoding.Error ()
24 import Data.Text.Internal (Text, textP)
25 import Foreign.C.Types (CSize)
26 import Foreign.ForeignPtr (withForeignPtr)
27 import Foreign.Marshal.Utils (with)
28 import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
29 import Foreign.Storable (Storable, peek, poke)
30 import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
31 import GHC.Word (Word8, Word32)
32 import qualified Data.Text.Array as A
33 import Data.Word (Word8, Word16)
34 import System.IO.Unsafe (unsafePerformIO)
35 import qualified Control.Exception as Exc
36 import Data.Bits ((.&.), (.|.), shiftL)
38 import Control.Arrow (first)
39 import Data.Maybe (catMaybes)
40 #include "pipes_text_cbits.h"
44 -- | A stream oriented decoding result.
45 data Decoding = Some Text ByteString (ByteString -> Decoding)
46 | Other Text ByteString
47 instance Show Decoding where
48 showsPrec d (Some t bs _) = showParen (d > prec) $
49 showString "Some " . showsPrec prec' t .
50 showChar ' ' . showsPrec prec' bs .
52 where prec = 10; prec' = prec + 1
53 showsPrec d (Other t bs) = showParen (d > prec) $
54 showString "Other " . showsPrec prec' t .
55 showChar ' ' . showsPrec prec' bs .
57 where prec = 10; prec' = prec + 1
59 newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
60 newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
62 streamDecodeUtf8 :: ByteString -> Decoding
63 streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
65 decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
66 decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
67 runST $ do marray <- A.new (len+1)
68 unsafeIOToST (decodeChunkToBuffer marray)
70 decodeChunkToBuffer :: A.MArray s -> IO Decoding
71 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
72 with (0::CSize) $ \destOffPtr ->
73 with codepoint0 $ \codepointPtr ->
74 with state0 $ \statePtr ->
75 with nullPtr $ \curPtrPtr ->
76 do let end = ptr `plusPtr` (off + len)
77 curPtr = ptr `plusPtr` off
79 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
80 state <- peek statePtr
81 lastPtr <- peek curPtrPtr
82 codepoint <- peek codepointPtr
84 chunkText <- mkText dest n
85 let left = lastPtr `minusPtr` curPtr
86 remaining = B.drop left bs
87 accum = if T.null chunkText then B.append old remaining else remaining
88 return $! case state of
89 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
90 _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
91 {-# INLINE decodeChunkToBuffer #-}
92 {-# INLINE decodeChunkUtf8 #-}
93 {-# INLINE streamDecodeUtf8 #-}
95 decodeSomeUtf8 :: ByteString -> (Text, ByteString)
96 decodeSomeUtf8 bs@(PS fp off len) = runST $ do
99 withForeignPtr fp $ \ptr ->
100 with (0::CSize) $ \destOffPtr ->
101 with (0::CodePoint) $ \codepointPtr ->
102 with (0::DecoderState) $ \statePtr ->
103 with nullPtr $ \curPtrPtr ->
104 do let end = ptr `plusPtr` (off + len)
105 curPtr = ptr `plusPtr` off
106 poke curPtrPtr curPtr
107 c_decode_utf8_with_state (A.maBA dest) destOffPtr
108 curPtrPtr end codepointPtr statePtr
109 state <- peek statePtr
110 lastPtr <- peek curPtrPtr
111 codepoint <- peek codepointPtr
113 chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
114 return $! textP arr 0 (fromIntegral n)
115 let left = lastPtr `minusPtr` curPtr
116 remaining = B.drop left bs
117 return $! (chunkText, remaining)
118 {-# INLINE decodeSomeUtf8 #-}
120 mkText :: A.MArray s -> CSize -> IO Text
121 mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
122 return $! textP arr 0 (fromIntegral n)
123 {-# INLINE mkText #-}
126 ord (C# c#) = I# (ord# c#)
129 unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
131 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
133 | otherwise = do A.unsafeWrite marr i lo
134 A.unsafeWrite marr (i+1) hi
138 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
139 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
140 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
141 {-# INLINE shiftR #-}
142 {-# INLINE unsafeWrite #-}
144 foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
145 :: MutableByteArray# s -> Ptr CSize
146 -> Ptr (Ptr Word8) -> Ptr Word8
147 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)