]>
Commit | Line | Data |
---|---|---|
8c482809 | 1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, |
2 | UnliftedFFITypes #-} | |
c9d1c945 | 3 | -- This module lifts assorted materials from Brian O'Sullivan's text package |
8c482809 | 4 | -- especially Data.Text.Encoding in order to define a pipes-appropriate |
5 | -- streamDecodeUtf8 | |
6 | module Pipes.Text.Internal | |
7 | ( Decoding(..) | |
8c482809 | 8 | , streamDecodeUtf8 |
9 | ) where | |
10 | ||
11 | import Control.Exception (evaluate, try) | |
12 | #if __GLASGOW_HASKELL__ >= 702 | |
13 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | |
14 | import Control.Monad.ST (ST, runST) | |
15 | #else | |
16 | import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST) | |
17 | #endif | |
18 | import Data.Bits ((.&.)) | |
19 | import Data.ByteString as B | |
20 | import Data.ByteString.Internal as B | |
21 | import Data.Text () | |
c9d1c945 | 22 | import qualified Data.Text as T |
8c482809 | 23 | import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) |
24 | import Data.Text.Internal (Text(..), safe, textP) | |
25 | import Data.Word (Word8, Word32) | |
26 | import Foreign.C.Types (CSize) | |
27 | import Foreign.ForeignPtr (withForeignPtr) | |
28 | import Foreign.Marshal.Utils (with) | |
29 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | |
30 | import Foreign.Storable (Storable, peek, poke) | |
31 | import GHC.Base hiding (ord) | |
32 | import GHC.Word | |
33 | import qualified Data.Text.Array as A | |
34 | import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#) | |
35 | import GHC.Word (Word8(..), Word16(..), Word32(..)) | |
36 | ||
37 | import Data.Text.Unsafe (unsafeDupablePerformIO) | |
38 | ||
39 | #include "pipes_text_cbits.h" | |
40 | ||
41 | -- | A stream oriented decoding result. | |
42 | data Decoding = Some Text ByteString (ByteString -> Decoding) | |
43 | | Other Text ByteString | |
44 | instance Show Decoding where | |
45 | showsPrec d (Some t bs _) = showParen (d > prec) $ | |
46 | showString "Some " . showsPrec prec' t . | |
47 | showChar ' ' . showsPrec prec' bs . | |
48 | showString " _" | |
49 | where prec = 10; prec' = prec + 1 | |
50 | showsPrec d (Other t bs) = showParen (d > prec) $ | |
51 | showString "Other " . showsPrec prec' t . | |
52 | showChar ' ' . showsPrec prec' bs . | |
53 | showString " _" | |
54 | where prec = 10; prec' = prec + 1 | |
55 | ||
56 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | |
57 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | |
58 | ||
8c482809 | 59 | streamDecodeUtf8 :: ByteString -> Decoding |
c9d1c945 | 60 | streamDecodeUtf8 = decodeChunk B.empty 0 0 |
8c482809 | 61 | |
c9d1c945 | 62 | decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding |
63 | decodeChunk old codepoint0 state0 bs@(PS fp off len) = | |
64 | runST $ do marray <- A.new (len+1) | |
65 | unsafeIOToST (decodeChunkToBuffer marray) | |
8c482809 | 66 | where |
c9d1c945 | 67 | decodeChunkToBuffer :: A.MArray s -> IO Decoding |
68 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | |
69 | with (0::CSize) $ \destOffPtr -> | |
70 | with codepoint0 $ \codepointPtr -> | |
71 | with state0 $ \statePtr -> | |
72 | with nullPtr $ \curPtrPtr -> | |
73 | do let end = ptr `plusPtr` (off + len) | |
74 | curPtr = ptr `plusPtr` off | |
75 | poke curPtrPtr curPtr | |
76 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | |
77 | state <- peek statePtr | |
78 | lastPtr <- peek curPtrPtr | |
79 | codepoint <- peek codepointPtr | |
80 | n <- peek destOffPtr | |
81 | chunkText <- mkText dest n | |
82 | let left = lastPtr `minusPtr` curPtr | |
83 | remaining = B.drop left bs | |
84 | accum = if T.null chunkText then B.append old remaining else remaining | |
85 | return $ case state of | |
86 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error | |
87 | _ -> Some chunkText accum (decodeChunk accum codepoint state) | |
88 | ||
89 | ||
90 | mkText :: A.MArray s -> CSize -> IO Text | |
91 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | |
92 | return $! textP arr 0 (fromIntegral n) | |
8c482809 | 93 | |
94 | ord :: Char -> Int | |
95 | ord (C# c#) = I# (ord# c#) | |
96 | {-# INLINE ord #-} | |
97 | ||
8c482809 | 98 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int |
99 | unsafeWrite marr i c | |
c9d1c945 | 100 | | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) |
101 | return 1 | |
102 | | otherwise = do A.unsafeWrite marr i lo | |
103 | A.unsafeWrite marr (i+1) hi | |
104 | return 2 | |
8c482809 | 105 | where n = ord c |
106 | m = n - 0x10000 | |
107 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | |
108 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | |
109 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | |
110 | {-# INLINE unsafeWrite #-} | |
111 | ||
112 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | |
113 | :: MutableByteArray# s -> Ptr CSize | |
114 | -> Ptr (Ptr Word8) -> Ptr Word8 | |
115 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) |