1 {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2 {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
3 {-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
5 -- This module lifts assorted materials from Brian O'Sullivan's text package
6 -- especially Data.Text.Encoding in order to define a pipes-appropriate
8 module Pipes.Text.Internal
16 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
17 import Control.Monad.ST (ST, runST)
18 import Data.Bits ((.&.))
19 import Data.ByteString as B
20 import Data.ByteString (ByteString)
21 import Data.ByteString.Internal as B
22 import Data.ByteString.Char8 as B8
23 import Data.Text (Text)
24 import qualified Data.Text as T
25 import qualified Data.Text.Encoding as TE
26 import Data.Text.Encoding.Error ()
27 import Data.Text.Internal (Text, textP)
28 import Foreign.C.Types (CSize)
29 import Foreign.ForeignPtr (withForeignPtr)
30 import Foreign.Marshal.Utils (with)
31 import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
32 import Foreign.Storable (Storable, peek, poke)
33 import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
34 import GHC.Word (Word8, Word32)
35 import qualified Data.Text.Array as A
36 import Data.Word (Word8, Word16)
37 import System.IO.Unsafe (unsafePerformIO)
38 import qualified Control.Exception as Exc
39 import Data.Bits ((.&.), (.|.), shiftL)
41 import Control.Arrow (first)
42 import Data.Maybe (catMaybes)
43 #include "pipes_text_cbits.h"
46 -- | A specific character encoding.
51 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
52 , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
55 instance Show Codec where
56 showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c)
59 data TextException = DecodeException Codec Word8
60 | EncodeException Codec Char
62 | TextException Exc.SomeException
63 deriving (Show, Typeable)
64 instance Exc.Exception TextException
66 toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
67 -> (ByteString -> Decoding)
68 toDecoding op = loop B.empty where
69 loop extra bs0 = case op (B.append extra bs0) of
70 (txt, Right bs) -> Some txt bs (loop bs)
71 (txt, Left (_,bs)) -> Other txt bs
74 splitSlowly :: (ByteString -> Text)
76 -> (Text, Either (TextException, ByteString) ByteString)
77 splitSlowly dec bytes = valid where
78 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
79 splits 0 = [(B.empty, bytes)]
80 splits n = B.splitAt n bytes : splits (n - 1)
81 decFirst (a, b) = case tryEvaluate (dec a) of
83 Right text -> let trouble = case tryEvaluate (dec b) of
84 Left exc -> Left (TextException exc, b)
85 Right _ -> Right B.empty
86 in Just (text, trouble)
87 -- this case shouldn't occur,
88 -- since splitSlowly is only called
89 -- when parsing failed somewhere
92 utf8 = Codec name enc (toDecoding dec) where
94 enc text = (TE.encodeUtf8 text, Nothing)
95 dec bytes = case decodeSomeUtf8 bytes of
98 -- -- Whether the given byte is a continuation byte.
99 -- isContinuation byte = byte .&. 0xC0 == 0x80
101 -- -- The number of continuation bytes needed by the given
102 -- -- non-continuation byte. Returns -1 for an illegal UTF-8
103 -- -- non-continuation byte and the whole split quickly must fail so
104 -- -- as the input is passed to TE.decodeUtf8, which will issue a
105 -- -- suitable error.
107 -- | x0 .&. 0x80 == 0x00 = 0
108 -- | x0 .&. 0xE0 == 0xC0 = 1
109 -- | x0 .&. 0xF0 == 0xE0 = 2
110 -- | x0 .&. 0xF8 == 0xF0 = 3
113 -- splitQuickly bytes
114 -- | B.null l || req == -1 = Nothing
115 -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
116 -- | otherwise = Just (TE.decodeUtf8 l', r')
118 -- (l, r) = B.spanEnd isContinuation bytes
119 -- req = required (B.last l)
121 -- r' = B.cons (B.last l) r
126 utf16_le = Codec name enc (toDecoding dec) where
127 name = T.pack "UTF-16-LE"
128 enc text = (TE.encodeUtf16LE text, Nothing)
129 dec bytes = case splitQuickly bytes of
130 Just (text, extra) -> (text, Right extra)
131 Nothing -> splitSlowly TE.decodeUtf16LE bytes
133 splitQuickly bytes = maybeDecode (loop 0) where
134 maxN = B.length bytes
136 loop n | n == maxN = decodeAll
137 | (n + 1) == maxN = decodeTo n
141 (B.index bytes (n + 1))
142 decodeMore = loop $! n + req
147 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
148 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
153 utf16_be = Codec name enc (toDecoding dec) where
154 name = T.pack "UTF-16-BE"
155 enc text = (TE.encodeUtf16BE text, Nothing)
156 dec bytes = case splitQuickly bytes of
157 Just (text, extra) -> (text, Right extra)
158 Nothing -> splitSlowly TE.decodeUtf16BE bytes
160 splitQuickly bytes = maybeDecode (loop 0) where
161 maxN = B.length bytes
163 loop n | n == maxN = decodeAll
164 | (n + 1) == maxN = decodeTo n
167 (B.index bytes (n + 1))
169 decodeMore = loop $! n + req
174 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
175 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
177 utf16Required :: Word8 -> Word8 -> Int
178 utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
180 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
185 utf32_le = Codec name enc (toDecoding dec) where
186 name = T.pack "UTF-32-LE"
187 enc text = (TE.encodeUtf32LE text, Nothing)
188 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
189 Just (text, extra) -> (text, Right extra)
190 Nothing -> splitSlowly TE.decodeUtf32LE bs
195 utf32_be = Codec name enc (toDecoding dec) where
196 name = T.pack "UTF-32-BE"
197 enc text = (TE.encodeUtf32BE text, Nothing)
198 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
199 Just (text, extra) -> (text, Right extra)
200 Nothing -> splitSlowly TE.decodeUtf32BE bs
202 utf32SplitBytes :: (ByteString -> Text)
204 -> Maybe (Text, ByteString)
205 utf32SplitBytes dec bytes = split where
206 split = maybeDecode (dec toDecode, extra)
210 lenToDecode = len - lenExtra
211 (toDecode, extra) = if lenExtra == 0
212 then (bytes, B.empty)
213 else B.splitAt lenToDecode bytes
218 ascii = Codec name enc (toDecoding dec) where
219 name = T.pack "ASCII"
220 enc text = (bytes, extra) where
221 (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
222 bytes = B8.pack (T.unpack safe)
223 extra = if T.null unsafe
225 else Just (EncodeException ascii (T.head unsafe), unsafe)
227 dec bytes = (text, extra) where
228 (safe, unsafe) = B.span (<= 0x7F) bytes
229 text = T.pack (B8.unpack safe)
230 extra = if B.null unsafe
232 else Left (DecodeException ascii (B.head unsafe), unsafe)
237 iso8859_1 = Codec name enc (toDecoding dec) where
238 name = T.pack "ISO-8859-1"
239 enc text = (bytes, extra) where
240 (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
241 bytes = B8.pack (T.unpack safe)
242 extra = if T.null unsafe
244 else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
246 dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
248 tryEvaluate :: a -> Either Exc.SomeException a
249 tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
251 maybeDecode :: (a, b) -> Maybe (a, b)
252 maybeDecode (a, b) = case tryEvaluate a of
254 Right _ -> Just (a, b)
256 -- | A stream oriented decoding result.
257 data Decoding = Some Text ByteString (ByteString -> Decoding)
258 | Other Text ByteString
259 instance Show Decoding where
260 showsPrec d (Some t bs _) = showParen (d > prec) $
261 showString "Some " . showsPrec prec' t .
262 showChar ' ' . showsPrec prec' bs .
264 where prec = 10; prec' = prec + 1
265 showsPrec d (Other t bs) = showParen (d > prec) $
266 showString "Other " . showsPrec prec' t .
267 showChar ' ' . showsPrec prec' bs .
269 where prec = 10; prec' = prec + 1
271 newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
272 newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
274 streamDecodeUtf8 :: ByteString -> Decoding
275 streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
277 decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
278 decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
279 runST $ do marray <- A.new (len+1)
280 unsafeIOToST (decodeChunkToBuffer marray)
282 decodeChunkToBuffer :: A.MArray s -> IO Decoding
283 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
284 with (0::CSize) $ \destOffPtr ->
285 with codepoint0 $ \codepointPtr ->
286 with state0 $ \statePtr ->
287 with nullPtr $ \curPtrPtr ->
288 do let end = ptr `plusPtr` (off + len)
289 curPtr = ptr `plusPtr` off
290 poke curPtrPtr curPtr
291 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
292 state <- peek statePtr
293 lastPtr <- peek curPtrPtr
294 codepoint <- peek codepointPtr
296 chunkText <- mkText dest n
297 let left = lastPtr `minusPtr` curPtr
298 remaining = B.drop left bs
299 accum = if T.null chunkText then B.append old remaining else remaining
300 return $! case state of
301 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
302 _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
303 {-# INLINE decodeChunkToBuffer #-}
304 {-# INLINE decodeChunkUtf8 #-}
305 {-# INLINE streamDecodeUtf8 #-}
307 decodeSomeUtf8 :: ByteString -> (Text, ByteString)
308 decodeSomeUtf8 bs@(PS fp off len) = runST $ do
309 dest <- A.new (len+1)
311 withForeignPtr fp $ \ptr ->
312 with (0::CSize) $ \destOffPtr ->
313 with (0::CodePoint) $ \codepointPtr ->
314 with (0::DecoderState) $ \statePtr ->
315 with nullPtr $ \curPtrPtr ->
316 do let end = ptr `plusPtr` (off + len)
317 curPtr = ptr `plusPtr` off
318 poke curPtrPtr curPtr
319 c_decode_utf8_with_state (A.maBA dest) destOffPtr
320 curPtrPtr end codepointPtr statePtr
321 state <- peek statePtr
322 lastPtr <- peek curPtrPtr
323 codepoint <- peek codepointPtr
325 chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
326 return $! textP arr 0 (fromIntegral n)
327 let left = lastPtr `minusPtr` curPtr
328 remaining = B.drop left bs
329 return $! (chunkText, remaining)
330 {-# INLINE decodeSomeUtf8 #-}
332 mkText :: A.MArray s -> CSize -> IO Text
333 mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
334 return $! textP arr 0 (fromIntegral n)
335 {-# INLINE mkText #-}
338 ord (C# c#) = I# (ord# c#)
341 unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
343 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
345 | otherwise = do A.unsafeWrite marr i lo
346 A.unsafeWrite marr (i+1) hi
350 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
351 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
352 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
353 {-# INLINE shiftR #-}
354 {-# INLINE unsafeWrite #-}
356 foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
357 :: MutableByteArray# s -> Ptr CSize
358 -> Ptr (Ptr Word8) -> Ptr Word8
359 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)