]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Internal.hs
moved enumerator/conduit Codec business to its own module
[github/fretlink/text-pipes.git] / Pipes / Text / Internal.hs
1 {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2 {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
3 {-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
4
5 -- This module lifts assorted materials from Brian O'Sullivan's text package
6 -- especially Data.Text.Encoding in order to define a pipes-appropriate
7 -- streamDecodeUtf8
8 module Pipes.Text.Internal
9 ( Decoding(..)
10 , streamDecodeUtf8
11 , decodeSomeUtf8
12 ) where
13 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
14 import Control.Monad.ST (ST, runST)
15 import Data.Bits ((.&.))
16 import Data.ByteString as B
17 import Data.ByteString (ByteString)
18 import Data.ByteString.Internal as B
19 import Data.ByteString.Char8 as B8
20 import Data.Text (Text)
21 import qualified Data.Text as T
22 import qualified Data.Text.Encoding as TE
23 import Data.Text.Encoding.Error ()
24 import Data.Text.Internal (Text, textP)
25 import Foreign.C.Types (CSize)
26 import Foreign.ForeignPtr (withForeignPtr)
27 import Foreign.Marshal.Utils (with)
28 import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
29 import Foreign.Storable (Storable, peek, poke)
30 import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
31 import GHC.Word (Word8, Word32)
32 import qualified Data.Text.Array as A
33 import Data.Word (Word8, Word16)
34 import System.IO.Unsafe (unsafePerformIO)
35 import qualified Control.Exception as Exc
36 import Data.Bits ((.&.), (.|.), shiftL)
37 import Data.Typeable
38 import Control.Arrow (first)
39 import Data.Maybe (catMaybes)
40 #include "pipes_text_cbits.h"
41
42
43
44 -- | A stream oriented decoding result.
45 data Decoding = Some Text ByteString (ByteString -> Decoding)
46 | Other Text ByteString
47 instance Show Decoding where
48 showsPrec d (Some t bs _) = showParen (d > prec) $
49 showString "Some " . showsPrec prec' t .
50 showChar ' ' . showsPrec prec' bs .
51 showString " _"
52 where prec = 10; prec' = prec + 1
53 showsPrec d (Other t bs) = showParen (d > prec) $
54 showString "Other " . showsPrec prec' t .
55 showChar ' ' . showsPrec prec' bs .
56 showString " _"
57 where prec = 10; prec' = prec + 1
58
59 newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
60 newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
61
62 streamDecodeUtf8 :: ByteString -> Decoding
63 streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
64 where
65 decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
66 decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
67 runST $ do marray <- A.new (len+1)
68 unsafeIOToST (decodeChunkToBuffer marray)
69 where
70 decodeChunkToBuffer :: A.MArray s -> IO Decoding
71 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
72 with (0::CSize) $ \destOffPtr ->
73 with codepoint0 $ \codepointPtr ->
74 with state0 $ \statePtr ->
75 with nullPtr $ \curPtrPtr ->
76 do let end = ptr `plusPtr` (off + len)
77 curPtr = ptr `plusPtr` off
78 poke curPtrPtr curPtr
79 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
80 state <- peek statePtr
81 lastPtr <- peek curPtrPtr
82 codepoint <- peek codepointPtr
83 n <- peek destOffPtr
84 chunkText <- mkText dest n
85 let left = lastPtr `minusPtr` curPtr
86 remaining = B.drop left bs
87 accum = if T.null chunkText then B.append old remaining else remaining
88 return $! case state of
89 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
90 _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
91 {-# INLINE decodeChunkToBuffer #-}
92 {-# INLINE decodeChunkUtf8 #-}
93 {-# INLINE streamDecodeUtf8 #-}
94
95 decodeSomeUtf8 :: ByteString -> (Text, ByteString)
96 decodeSomeUtf8 bs@(PS fp off len) = runST $ do
97 dest <- A.new (len+1)
98 unsafeIOToST $
99 withForeignPtr fp $ \ptr ->
100 with (0::CSize) $ \destOffPtr ->
101 with (0::CodePoint) $ \codepointPtr ->
102 with (0::DecoderState) $ \statePtr ->
103 with nullPtr $ \curPtrPtr ->
104 do let end = ptr `plusPtr` (off + len)
105 curPtr = ptr `plusPtr` off
106 poke curPtrPtr curPtr
107 c_decode_utf8_with_state (A.maBA dest) destOffPtr
108 curPtrPtr end codepointPtr statePtr
109 state <- peek statePtr
110 lastPtr <- peek curPtrPtr
111 codepoint <- peek codepointPtr
112 n <- peek destOffPtr
113 chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
114 return $! textP arr 0 (fromIntegral n)
115 let left = lastPtr `minusPtr` curPtr
116 remaining = B.drop left bs
117 return $! (chunkText, remaining)
118 {-# INLINE decodeSomeUtf8 #-}
119
120 mkText :: A.MArray s -> CSize -> IO Text
121 mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
122 return $! textP arr 0 (fromIntegral n)
123 {-# INLINE mkText #-}
124
125 ord :: Char -> Int
126 ord (C# c#) = I# (ord# c#)
127 {-# INLINE ord #-}
128
129 unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
130 unsafeWrite marr i c
131 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
132 return 1
133 | otherwise = do A.unsafeWrite marr i lo
134 A.unsafeWrite marr (i+1) hi
135 return 2
136 where n = ord c
137 m = n - 0x10000
138 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
139 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
140 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
141 {-# INLINE shiftR #-}
142 {-# INLINE unsafeWrite #-}
143
144 foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
145 :: MutableByteArray# s -> Ptr CSize
146 -> Ptr (Ptr Word8) -> Ptr Word8
147 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)