]> git.immae.eu Git - github/fretlink/text-pipes.git/blob - Pipes/Text/Internal/Decoding.hs
cabal file more descriptive
[github/fretlink/text-pipes.git] / Pipes / Text / Internal / Decoding.hs
1 {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2 {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
3 {-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
4
5 {- |
6 This module lifts assorted materials from Brian O'Sullivan's text package
7 especially @Data.Text.Encoding@ in order to define a pipes-appropriate
8 'streamDecodeUtf8'
9 -}
10 module Pipes.Text.Internal.Decoding
11 ( Decoding(..)
12 , streamDecodeUtf8
13 , decodeSomeUtf8
14 ) where
15 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
16 import Control.Monad.ST (ST, runST)
17 import Data.Bits ((.&.))
18 import Data.ByteString as B
19 import Data.ByteString (ByteString)
20 import Data.ByteString.Internal as B
21 import Data.ByteString.Char8 as B8
22 import Data.Text (Text)
23 import qualified Data.Text as T
24 import qualified Data.Text.Encoding as TE
25 import Data.Text.Encoding.Error ()
26 import Data.Text.Internal (Text, textP)
27 import Foreign.C.Types (CSize)
28 import Foreign.ForeignPtr (withForeignPtr)
29 import Foreign.Marshal.Utils (with)
30 import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
31 import Foreign.Storable (Storable, peek, poke)
32 import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
33 import GHC.Word (Word8, Word32)
34 import qualified Data.Text.Array as A
35 import Data.Word (Word8, Word16)
36 import System.IO.Unsafe (unsafePerformIO)
37 import qualified Control.Exception as Exc
38 import Data.Bits ((.&.), (.|.), shiftL)
39 import Data.Typeable
40 import Control.Arrow (first)
41 import Data.Maybe (catMaybes)
42 #include "pipes_text_cbits.h"
43
44
45
46 -- | A stream oriented decoding result. Distinct from the similar type in @Data.Text.Encoding@
47 data Decoding = Some Text ByteString (ByteString -> Decoding) -- | Text, continuation and any undecoded fragment.
48 | Other Text ByteString -- | Text followed by an undecodable ByteString
49 instance Show Decoding where
50 showsPrec d (Some t bs _) = showParen (d > prec) $
51 showString "Some " . showsPrec prec' t .
52 showChar ' ' . showsPrec prec' bs .
53 showString " _"
54 where prec = 10; prec' = prec + 1
55 showsPrec d (Other t bs) = showParen (d > prec) $
56 showString "Other " . showsPrec prec' t .
57 showChar ' ' . showsPrec prec' bs .
58 showString " _"
59 where prec = 10; prec' = prec + 1
60
61 newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
62 newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
63
64 -- | Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'.
65 streamDecodeUtf8 :: ByteString -> Decoding
66 streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
67 where
68 decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
69 decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
70 runST $ do marray <- A.new (len+1)
71 unsafeIOToST (decodeChunkToBuffer marray)
72 where
73 decodeChunkToBuffer :: A.MArray s -> IO Decoding
74 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
75 with (0::CSize) $ \destOffPtr ->
76 with codepoint0 $ \codepointPtr ->
77 with state0 $ \statePtr ->
78 with nullPtr $ \curPtrPtr ->
79 do let end = ptr `plusPtr` (off + len)
80 curPtr = ptr `plusPtr` off
81 poke curPtrPtr curPtr
82 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
83 state <- peek statePtr
84 lastPtr <- peek curPtrPtr
85 codepoint <- peek codepointPtr
86 n <- peek destOffPtr
87 chunkText <- mkText dest n
88 let left = lastPtr `minusPtr` curPtr
89 remaining = B.drop left bs
90 accum = if T.null chunkText then B.append old remaining else remaining
91 return $! case state of
92 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
93 _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
94 {-# INLINE decodeChunkToBuffer #-}
95 {-# INLINE decodeChunkUtf8 #-}
96 {-# INLINE streamDecodeUtf8 #-}
97
98 -- | Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble
99 decodeSomeUtf8 :: ByteString -> (Text, ByteString)
100 decodeSomeUtf8 bs@(PS fp off len) = runST $ do
101 dest <- A.new (len+1)
102 unsafeIOToST $
103 withForeignPtr fp $ \ptr ->
104 with (0::CSize) $ \destOffPtr ->
105 with (0::CodePoint) $ \codepointPtr ->
106 with (0::DecoderState) $ \statePtr ->
107 with nullPtr $ \curPtrPtr ->
108 do let end = ptr `plusPtr` (off + len)
109 curPtr = ptr `plusPtr` off
110 poke curPtrPtr curPtr
111 c_decode_utf8_with_state (A.maBA dest) destOffPtr
112 curPtrPtr end codepointPtr statePtr
113 state <- peek statePtr
114 lastPtr <- peek curPtrPtr
115 codepoint <- peek codepointPtr
116 n <- peek destOffPtr
117 chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
118 return $! textP arr 0 (fromIntegral n)
119 let left = lastPtr `minusPtr` curPtr
120 remaining = B.drop left bs
121 return $! (chunkText, remaining)
122 {-# INLINE decodeSomeUtf8 #-}
123
124 mkText :: A.MArray s -> CSize -> IO Text
125 mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
126 return $! textP arr 0 (fromIntegral n)
127 {-# INLINE mkText #-}
128
129 ord :: Char -> Int
130 ord (C# c#) = I# (ord# c#)
131 {-# INLINE ord #-}
132
133 unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
134 unsafeWrite marr i c
135 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
136 return 1
137 | otherwise = do A.unsafeWrite marr i lo
138 A.unsafeWrite marr (i+1) hi
139 return 2
140 where n = ord c
141 m = n - 0x10000
142 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
143 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
144 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
145 {-# INLINE shiftR #-}
146 {-# INLINE unsafeWrite #-}
147
148 foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
149 :: MutableByteArray# s -> Ptr CSize
150 -> Ptr (Ptr Word8) -> Ptr Word8
151 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)