aboutsummaryrefslogtreecommitdiffhomepage
path: root/Pipes/Text/Internal/Decoding.hs
diff options
context:
space:
mode:
Diffstat (limited to 'Pipes/Text/Internal/Decoding.hs')
-rw-r--r--Pipes/Text/Internal/Decoding.hs147
1 files changed, 147 insertions, 0 deletions
diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs
new file mode 100644
index 0000000..531104a
--- /dev/null
+++ b/Pipes/Text/Internal/Decoding.hs
@@ -0,0 +1,147 @@
1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
3{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
4
5-- This module lifts assorted materials from Brian O'Sullivan's text package
6-- especially Data.Text.Encoding in order to define a pipes-appropriate
7-- streamDecodeUtf8
8module Pipes.Text.Internal.Decoding
9 ( Decoding(..)
10 , streamDecodeUtf8
11 , decodeSomeUtf8
12 ) where
13import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
14import Control.Monad.ST (ST, runST)
15import Data.Bits ((.&.))
16import Data.ByteString as B
17import Data.ByteString (ByteString)
18import Data.ByteString.Internal as B
19import Data.ByteString.Char8 as B8
20import Data.Text (Text)
21import qualified Data.Text as T
22import qualified Data.Text.Encoding as TE
23import Data.Text.Encoding.Error ()
24import Data.Text.Internal (Text, textP)
25import Foreign.C.Types (CSize)
26import Foreign.ForeignPtr (withForeignPtr)
27import Foreign.Marshal.Utils (with)
28import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
29import Foreign.Storable (Storable, peek, poke)
30import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
31import GHC.Word (Word8, Word32)
32import qualified Data.Text.Array as A
33import Data.Word (Word8, Word16)
34import System.IO.Unsafe (unsafePerformIO)
35import qualified Control.Exception as Exc
36import Data.Bits ((.&.), (.|.), shiftL)
37import Data.Typeable
38import Control.Arrow (first)
39import Data.Maybe (catMaybes)
40#include "pipes_text_cbits.h"
41
42
43
44-- | A stream oriented decoding result.
45data Decoding = Some Text ByteString (ByteString -> Decoding)
46 | Other Text ByteString
47instance Show Decoding where
48 showsPrec d (Some t bs _) = showParen (d > prec) $
49 showString "Some " . showsPrec prec' t .
50 showChar ' ' . showsPrec prec' bs .
51 showString " _"
52 where prec = 10; prec' = prec + 1
53 showsPrec d (Other t bs) = showParen (d > prec) $
54 showString "Other " . showsPrec prec' t .
55 showChar ' ' . showsPrec prec' bs .
56 showString " _"
57 where prec = 10; prec' = prec + 1
58
59newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
60newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
61
62streamDecodeUtf8 :: ByteString -> Decoding
63streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
64 where
65 decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
66 decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
67 runST $ do marray <- A.new (len+1)
68 unsafeIOToST (decodeChunkToBuffer marray)
69 where
70 decodeChunkToBuffer :: A.MArray s -> IO Decoding
71 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
72 with (0::CSize) $ \destOffPtr ->
73 with codepoint0 $ \codepointPtr ->
74 with state0 $ \statePtr ->
75 with nullPtr $ \curPtrPtr ->
76 do let end = ptr `plusPtr` (off + len)
77 curPtr = ptr `plusPtr` off
78 poke curPtrPtr curPtr
79 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
80 state <- peek statePtr
81 lastPtr <- peek curPtrPtr
82 codepoint <- peek codepointPtr
83 n <- peek destOffPtr
84 chunkText <- mkText dest n
85 let left = lastPtr `minusPtr` curPtr
86 remaining = B.drop left bs
87 accum = if T.null chunkText then B.append old remaining else remaining
88 return $! case state of
89 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
90 _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
91 {-# INLINE decodeChunkToBuffer #-}
92 {-# INLINE decodeChunkUtf8 #-}
93{-# INLINE streamDecodeUtf8 #-}
94
95decodeSomeUtf8 :: ByteString -> (Text, ByteString)
96decodeSomeUtf8 bs@(PS fp off len) = runST $ do
97 dest <- A.new (len+1)
98 unsafeIOToST $
99 withForeignPtr fp $ \ptr ->
100 with (0::CSize) $ \destOffPtr ->
101 with (0::CodePoint) $ \codepointPtr ->
102 with (0::DecoderState) $ \statePtr ->
103 with nullPtr $ \curPtrPtr ->
104 do let end = ptr `plusPtr` (off + len)
105 curPtr = ptr `plusPtr` off
106 poke curPtrPtr curPtr
107 c_decode_utf8_with_state (A.maBA dest) destOffPtr
108 curPtrPtr end codepointPtr statePtr
109 state <- peek statePtr
110 lastPtr <- peek curPtrPtr
111 codepoint <- peek codepointPtr
112 n <- peek destOffPtr
113 chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
114 return $! textP arr 0 (fromIntegral n)
115 let left = lastPtr `minusPtr` curPtr
116 remaining = B.drop left bs
117 return $! (chunkText, remaining)
118{-# INLINE decodeSomeUtf8 #-}
119
120mkText :: A.MArray s -> CSize -> IO Text
121mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
122 return $! textP arr 0 (fromIntegral n)
123{-# INLINE mkText #-}
124
125ord :: Char -> Int
126ord (C# c#) = I# (ord# c#)
127{-# INLINE ord #-}
128
129unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
130unsafeWrite marr i c
131 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
132 return 1
133 | otherwise = do A.unsafeWrite marr i lo
134 A.unsafeWrite marr (i+1) hi
135 return 2
136 where n = ord c
137 m = n - 0x10000
138 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
139 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
140 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
141 {-# INLINE shiftR #-}
142{-# INLINE unsafeWrite #-}
143
144foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
145 :: MutableByteArray# s -> Ptr CSize
146 -> Ptr (Ptr Word8) -> Ptr Word8
147 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file