]> git.immae.eu Git - github/fretlink/text-pipes.git/blame - Pipes/Text/Internal.hs
repaired tests, clean encodeUtf8 to return Done r rather than yield B.empty >> Done...
[github/fretlink/text-pipes.git] / Pipes / Text / Internal.hs
CommitLineData
8c482809 1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
2 UnliftedFFITypes #-}
c9d1c945 3-- This module lifts assorted materials from Brian O'Sullivan's text package
8c482809 4-- especially Data.Text.Encoding in order to define a pipes-appropriate
5-- streamDecodeUtf8
6module Pipes.Text.Internal
7 ( Decoding(..)
8c482809 8 , streamDecodeUtf8
9 ) where
10
11import Control.Exception (evaluate, try)
12#if __GLASGOW_HASKELL__ >= 702
13import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
14import Control.Monad.ST (ST, runST)
15#else
16import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
17#endif
18import Data.Bits ((.&.))
19import Data.ByteString as B
20import Data.ByteString.Internal as B
21import Data.Text ()
c9d1c945 22import qualified Data.Text as T
8c482809 23import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
24import Data.Text.Internal (Text(..), safe, textP)
25import Data.Word (Word8, Word32)
26import Foreign.C.Types (CSize)
27import Foreign.ForeignPtr (withForeignPtr)
28import Foreign.Marshal.Utils (with)
29import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
30import Foreign.Storable (Storable, peek, poke)
31import GHC.Base hiding (ord)
32import GHC.Word
33import qualified Data.Text.Array as A
34import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
35import GHC.Word (Word8(..), Word16(..), Word32(..))
36
37import Data.Text.Unsafe (unsafeDupablePerformIO)
38
39#include "pipes_text_cbits.h"
40
41-- | A stream oriented decoding result.
42data Decoding = Some Text ByteString (ByteString -> Decoding)
43 | Other Text ByteString
44instance Show Decoding where
45 showsPrec d (Some t bs _) = showParen (d > prec) $
46 showString "Some " . showsPrec prec' t .
47 showChar ' ' . showsPrec prec' bs .
48 showString " _"
49 where prec = 10; prec' = prec + 1
50 showsPrec d (Other t bs) = showParen (d > prec) $
51 showString "Other " . showsPrec prec' t .
52 showChar ' ' . showsPrec prec' bs .
53 showString " _"
54 where prec = 10; prec' = prec + 1
55
56newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
57newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
58
8c482809 59streamDecodeUtf8 :: ByteString -> Decoding
c9d1c945 60streamDecodeUtf8 = decodeChunk B.empty 0 0
8c482809 61
c9d1c945 62decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
63decodeChunk old codepoint0 state0 bs@(PS fp off len) =
64 runST $ do marray <- A.new (len+1)
65 unsafeIOToST (decodeChunkToBuffer marray)
8c482809 66 where
c9d1c945 67 decodeChunkToBuffer :: A.MArray s -> IO Decoding
68 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
69 with (0::CSize) $ \destOffPtr ->
70 with codepoint0 $ \codepointPtr ->
71 with state0 $ \statePtr ->
72 with nullPtr $ \curPtrPtr ->
73 do let end = ptr `plusPtr` (off + len)
74 curPtr = ptr `plusPtr` off
75 poke curPtrPtr curPtr
76 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
77 state <- peek statePtr
78 lastPtr <- peek curPtrPtr
79 codepoint <- peek codepointPtr
80 n <- peek destOffPtr
81 chunkText <- mkText dest n
82 let left = lastPtr `minusPtr` curPtr
83 remaining = B.drop left bs
84 accum = if T.null chunkText then B.append old remaining else remaining
85 return $ case state of
86 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
87 _ -> Some chunkText accum (decodeChunk accum codepoint state)
88
89
90mkText :: A.MArray s -> CSize -> IO Text
91mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
92 return $! textP arr 0 (fromIntegral n)
8c482809 93
94ord :: Char -> Int
95ord (C# c#) = I# (ord# c#)
96{-# INLINE ord #-}
97
8c482809 98unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
99unsafeWrite marr i c
c9d1c945 100 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
101 return 1
102 | otherwise = do A.unsafeWrite marr i lo
103 A.unsafeWrite marr (i+1) hi
104 return 2
8c482809 105 where n = ord c
106 m = n - 0x10000
107 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
108 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
109 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
110{-# INLINE unsafeWrite #-}
111
112foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
113 :: MutableByteArray# s -> Ptr CSize
114 -> Ptr (Ptr Word8) -> Ptr Word8
115 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)