]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Internal.hs
scrap character replacement; simplify
[github/fretlink/text-pipes.git] / Pipes / Text / Internal.hs
1 {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
2 UnliftedFFITypes #-}
3 -- This module lifts assorted materials from Brian O'Sullivan's text package
4 -- especially Data.Text.Encoding in order to define a pipes-appropriate
5 -- streamDecodeUtf8
6 module Pipes.Text.Internal
7 ( Decoding(..)
8 , streamDecodeUtf8
9 ) where
10
11 import Control.Exception (evaluate, try)
12 #if __GLASGOW_HASKELL__ >= 702
13 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
14 import Control.Monad.ST (ST, runST)
15 #else
16 import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
17 #endif
18 import Data.Bits ((.&.))
19 import Data.ByteString as B
20 import Data.ByteString.Internal as B
21 import Data.Text ()
22 import qualified Data.Text as T
23 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
24 import Data.Text.Internal (Text(..), safe, textP)
25 import Data.Word (Word8, Word32)
26 import Foreign.C.Types (CSize)
27 import Foreign.ForeignPtr (withForeignPtr)
28 import Foreign.Marshal.Utils (with)
29 import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
30 import Foreign.Storable (Storable, peek, poke)
31 import GHC.Base hiding (ord)
32 import GHC.Word
33 import qualified Data.Text.Array as A
34 import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
35 import GHC.Word (Word8(..), Word16(..), Word32(..))
36
37 import Data.Text.Unsafe (unsafeDupablePerformIO)
38
39 #include "pipes_text_cbits.h"
40
41 -- | A stream oriented decoding result.
42 data Decoding = Some Text ByteString (ByteString -> Decoding)
43 | Other Text ByteString
44 instance Show Decoding where
45 showsPrec d (Some t bs _) = showParen (d > prec) $
46 showString "Some " . showsPrec prec' t .
47 showChar ' ' . showsPrec prec' bs .
48 showString " _"
49 where prec = 10; prec' = prec + 1
50 showsPrec d (Other t bs) = showParen (d > prec) $
51 showString "Other " . showsPrec prec' t .
52 showChar ' ' . showsPrec prec' bs .
53 showString " _"
54 where prec = 10; prec' = prec + 1
55
56 newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
57 newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
58
59 streamDecodeUtf8 :: ByteString -> Decoding
60 streamDecodeUtf8 = decodeChunk B.empty 0 0
61
62 decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
63 decodeChunk old codepoint0 state0 bs@(PS fp off len) =
64 runST $ do marray <- A.new (len+1)
65 unsafeIOToST (decodeChunkToBuffer marray)
66 where
67 decodeChunkToBuffer :: A.MArray s -> IO Decoding
68 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
69 with (0::CSize) $ \destOffPtr ->
70 with codepoint0 $ \codepointPtr ->
71 with state0 $ \statePtr ->
72 with nullPtr $ \curPtrPtr ->
73 do let end = ptr `plusPtr` (off + len)
74 curPtr = ptr `plusPtr` off
75 poke curPtrPtr curPtr
76 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
77 state <- peek statePtr
78 lastPtr <- peek curPtrPtr
79 codepoint <- peek codepointPtr
80 n <- peek destOffPtr
81 chunkText <- mkText dest n
82 let left = lastPtr `minusPtr` curPtr
83 remaining = B.drop left bs
84 accum = if T.null chunkText then B.append old remaining else remaining
85 return $ case state of
86 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
87 _ -> Some chunkText accum (decodeChunk accum codepoint state)
88
89
90 mkText :: A.MArray s -> CSize -> IO Text
91 mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
92 return $! textP arr 0 (fromIntegral n)
93
94 ord :: Char -> Int
95 ord (C# c#) = I# (ord# c#)
96 {-# INLINE ord #-}
97
98 unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
99 unsafeWrite marr i c
100 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
101 return 1
102 | otherwise = do A.unsafeWrite marr i lo
103 A.unsafeWrite marr (i+1) hi
104 return 2
105 where n = ord c
106 m = n - 0x10000
107 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
108 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
109 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
110 {-# INLINE unsafeWrite #-}
111
112 foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
113 :: MutableByteArray# s -> Ptr CSize
114 -> Ptr (Ptr Word8) -> Ptr Word8
115 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)