diff options
Diffstat (limited to 'Pipes/Text/Internal/Decoding.hs')
-rw-r--r-- | Pipes/Text/Internal/Decoding.hs | 154 |
1 files changed, 0 insertions, 154 deletions
diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs deleted file mode 100644 index b5d928a..0000000 --- a/Pipes/Text/Internal/Decoding.hs +++ /dev/null | |||
@@ -1,154 +0,0 @@ | |||
1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} | ||
2 | {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} | ||
3 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | ||
4 | |||
5 | -- This module lifts assorted materials from Brian O'Sullivan's text package | ||
6 | -- especially @Data.Text.Encoding@ in order to define a pipes-appropriate | ||
7 | -- 'streamDecodeUtf8' | ||
8 | |||
9 | module Pipes.Text.Internal.Decoding | ||
10 | ( Decoding(..) | ||
11 | , streamDecodeUtf8 | ||
12 | , decodeSomeUtf8 | ||
13 | ) where | ||
14 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | ||
15 | import Control.Monad.ST (ST, runST) | ||
16 | import Data.Bits ((.&.)) | ||
17 | import Data.ByteString as B | ||
18 | import Data.ByteString (ByteString) | ||
19 | import Data.ByteString.Internal as B | ||
20 | import Data.ByteString.Char8 as B8 | ||
21 | import Data.Text (Text) | ||
22 | import qualified Data.Text as T | ||
23 | import qualified Data.Text.Encoding as TE | ||
24 | import Data.Text.Encoding.Error () | ||
25 | import Data.Text.Internal (Text, textP) | ||
26 | import Foreign.C.Types (CSize) | ||
27 | import Foreign.ForeignPtr (withForeignPtr) | ||
28 | import Foreign.Marshal.Utils (with) | ||
29 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | ||
30 | import Foreign.Storable (Storable, peek, poke) | ||
31 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) | ||
32 | import GHC.Word (Word8, Word32) | ||
33 | import qualified Data.Text.Array as A | ||
34 | import Data.Word (Word8, Word16) | ||
35 | import System.IO.Unsafe (unsafePerformIO) | ||
36 | import qualified Control.Exception as Exc | ||
37 | import Data.Bits ((.&.), (.|.), shiftL) | ||
38 | import Data.Typeable | ||
39 | import Control.Arrow (first) | ||
40 | import Data.Maybe (catMaybes) | ||
41 | #include "pipes_text_cbits.h" | ||
42 | |||
43 | |||
44 | |||
45 | -- A stream oriented decoding result. Distinct from the similar type in Data.Text.Encoding | ||
46 | |||
47 | data Decoding = Some Text ByteString (ByteString -> Decoding) | ||
48 | -- Text, continuation and any undecoded fragment. | ||
49 | | Other Text ByteString | ||
50 | -- Text followed by an undecodable ByteString | ||
51 | |||
52 | instance Show Decoding where | ||
53 | showsPrec d (Some t bs _) = showParen (d > prec) $ | ||
54 | showString "Some " . showsPrec prec' t . | ||
55 | showChar ' ' . showsPrec prec' bs . | ||
56 | showString " _" | ||
57 | where prec = 10; prec' = prec + 1 | ||
58 | showsPrec d (Other t bs) = showParen (d > prec) $ | ||
59 | showString "Other " . showsPrec prec' t . | ||
60 | showChar ' ' . showsPrec prec' bs . | ||
61 | showString " _" | ||
62 | where prec = 10; prec' = prec + 1 | ||
63 | |||
64 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | ||
65 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | ||
66 | |||
67 | -- Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'. | ||
68 | streamDecodeUtf8 :: ByteString -> Decoding | ||
69 | streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 | ||
70 | where | ||
71 | decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding | ||
72 | decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = | ||
73 | runST $ do marray <- A.new (len+1) | ||
74 | unsafeIOToST (decodeChunkToBuffer marray) | ||
75 | where | ||
76 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | ||
77 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
78 | with (0::CSize) $ \destOffPtr -> | ||
79 | with codepoint0 $ \codepointPtr -> | ||
80 | with state0 $ \statePtr -> | ||
81 | with nullPtr $ \curPtrPtr -> | ||
82 | do let end = ptr `plusPtr` (off + len) | ||
83 | curPtr = ptr `plusPtr` off | ||
84 | poke curPtrPtr curPtr | ||
85 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | ||
86 | state <- peek statePtr | ||
87 | lastPtr <- peek curPtrPtr | ||
88 | codepoint <- peek codepointPtr | ||
89 | n <- peek destOffPtr | ||
90 | chunkText <- mkText dest n | ||
91 | let left = lastPtr `minusPtr` curPtr | ||
92 | remaining = B.drop left bs | ||
93 | accum = if T.null chunkText then B.append old remaining else remaining | ||
94 | return $! case state of | ||
95 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error | ||
96 | _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) | ||
97 | {-# INLINE decodeChunkToBuffer #-} | ||
98 | {-# INLINE decodeChunkUtf8 #-} | ||
99 | {-# INLINE streamDecodeUtf8 #-} | ||
100 | |||
101 | -- Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble | ||
102 | decodeSomeUtf8 :: ByteString -> (Text, ByteString) | ||
103 | decodeSomeUtf8 bs@(PS fp off len) = runST $ do | ||
104 | dest <- A.new (len+1) | ||
105 | unsafeIOToST $ | ||
106 | withForeignPtr fp $ \ptr -> | ||
107 | with (0::CSize) $ \destOffPtr -> | ||
108 | with (0::CodePoint) $ \codepointPtr -> | ||
109 | with (0::DecoderState) $ \statePtr -> | ||
110 | with nullPtr $ \curPtrPtr -> | ||
111 | do let end = ptr `plusPtr` (off + len) | ||
112 | curPtr = ptr `plusPtr` off | ||
113 | poke curPtrPtr curPtr | ||
114 | c_decode_utf8_with_state (A.maBA dest) destOffPtr | ||
115 | curPtrPtr end codepointPtr statePtr | ||
116 | state <- peek statePtr | ||
117 | lastPtr <- peek curPtrPtr | ||
118 | codepoint <- peek codepointPtr | ||
119 | n <- peek destOffPtr | ||
120 | chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
121 | return $! textP arr 0 (fromIntegral n) | ||
122 | let left = lastPtr `minusPtr` curPtr | ||
123 | remaining = B.drop left bs | ||
124 | return $! (chunkText, remaining) | ||
125 | {-# INLINE decodeSomeUtf8 #-} | ||
126 | |||
127 | mkText :: A.MArray s -> CSize -> IO Text | ||
128 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
129 | return $! textP arr 0 (fromIntegral n) | ||
130 | {-# INLINE mkText #-} | ||
131 | |||
132 | ord :: Char -> Int | ||
133 | ord (C# c#) = I# (ord# c#) | ||
134 | {-# INLINE ord #-} | ||
135 | |||
136 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int | ||
137 | unsafeWrite marr i c | ||
138 | | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) | ||
139 | return 1 | ||
140 | | otherwise = do A.unsafeWrite marr i lo | ||
141 | A.unsafeWrite marr (i+1) hi | ||
142 | return 2 | ||
143 | where n = ord c | ||
144 | m = n - 0x10000 | ||
145 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | ||
146 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | ||
147 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | ||
148 | {-# INLINE shiftR #-} | ||
149 | {-# INLINE unsafeWrite #-} | ||
150 | |||
151 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | ||
152 | :: MutableByteArray# s -> Ptr CSize | ||
153 | -> Ptr (Ptr Word8) -> Ptr Word8 | ||
154 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file | ||