From 409759e855afc27cfab263c1ab1b1fd9ab66d38a Mon Sep 17 00:00:00 2001 From: michaelt Date: Tue, 4 Feb 2014 21:47:27 -0500 Subject: moved internals --- Pipes/Text/Internal/Codec.hs | 215 ++++++++++++++++++++++++++++++++++++++++ Pipes/Text/Internal/Decoding.hs | 147 +++++++++++++++++++++++++++ 2 files changed, 362 insertions(+) create mode 100644 Pipes/Text/Internal/Codec.hs create mode 100644 Pipes/Text/Internal/Decoding.hs (limited to 'Pipes/Text') diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs new file mode 100644 index 0000000..4b9367f --- /dev/null +++ b/Pipes/Text/Internal/Codec.hs @@ -0,0 +1,215 @@ + +{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-} +-- | +-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin +-- License: MIT +-- +-- Parts of this code were taken from enumerator and conduits, and adapted for pipes. + +module Pipes.Text.Internal.Codec + ( Decoding(..) + , streamDecodeUtf8 + , decodeSomeUtf8 + , Codec(..) + , TextException(..) + , utf8 + , utf16_le + , utf16_be + , utf32_le + , utf32_be + ) where + +import Data.Bits ((.&.)) +import Data.Char (ord) +import Data.ByteString as B +import Data.ByteString (ByteString) +import Data.ByteString.Internal as B +import Data.ByteString.Char8 as B8 +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE +import Data.Text.Encoding.Error () +import GHC.Word (Word8, Word32) +import qualified Data.Text.Array as A +import Data.Word (Word8, Word16) +import System.IO.Unsafe (unsafePerformIO) +import qualified Control.Exception as Exc +import Data.Bits ((.&.), (.|.), shiftL) +import Data.Typeable +import Control.Arrow (first) +import Data.Maybe (catMaybes) +import Pipes.Text.Internal.Decoding +import Pipes +-- | A specific character encoding. +-- +-- Since 0.3.0 +data Codec = Codec + { codecName :: Text + , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) + , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString) + } + +instance Show Codec where + showsPrec d c = showParen (d > 10) $ + showString "Codec " . shows (codecName c) + +data TextException = DecodeException Codec Word8 + | EncodeException Codec Char + | LengthExceeded Int + | TextException Exc.SomeException + deriving (Show, Typeable) +instance Exc.Exception TextException + + +toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) + -> (ByteString -> Decoding) +toDecoding op = loop B.empty where + loop !extra bs0 = case op (B.append extra bs0) of + (txt, Right bs) -> Some txt bs (loop bs) + (txt, Left (_,bs)) -> Other txt bs +-- To do: toDecoding should be inlined in each of the 'Codec' definitions +-- or else Codec changed to the conduit/enumerator definition. We have +-- altered it to use 'streamDecodeUtf8' + +splitSlowly :: (ByteString -> Text) + -> ByteString + -> (Text, Either (TextException, ByteString) ByteString) +splitSlowly dec bytes = valid where + valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) + splits 0 = [(B.empty, bytes)] + splits n = B.splitAt n bytes : splits (n - 1) + decFirst (a, b) = case tryEvaluate (dec a) of + Left _ -> Nothing + Right text -> let trouble = case tryEvaluate (dec b) of + Left exc -> Left (TextException exc, b) + Right _ -> Right B.empty + in Just (text, trouble) -- this case shouldn't occur, + -- since splitSlowly is only called + -- when parsing failed somewhere + +utf8 :: Codec +utf8 = Codec name enc (toDecoding dec) where + name = T.pack "UTF-8" + enc text = (TE.encodeUtf8 text, Nothing) + dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b) + +-- -- Whether the given byte is a continuation byte. +-- isContinuation byte = byte .&. 0xC0 == 0x80 +-- +-- -- The number of continuation bytes needed by the given +-- -- non-continuation byte. Returns -1 for an illegal UTF-8 +-- -- non-continuation byte and the whole split quickly must fail so +-- -- as the input is passed to TE.decodeUtf8, which will issue a +-- -- suitable error. +-- required x0 +-- | x0 .&. 0x80 == 0x00 = 0 +-- | x0 .&. 0xE0 == 0xC0 = 1 +-- | x0 .&. 0xF0 == 0xE0 = 2 +-- | x0 .&. 0xF8 == 0xF0 = 3 +-- | otherwise = -1 +-- +-- splitQuickly bytes +-- | B.null l || req == -1 = Nothing +-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) +-- | otherwise = Just (TE.decodeUtf8 l', r') +-- where +-- (l, r) = B.spanEnd isContinuation bytes +-- req = required (B.last l) +-- l' = B.init l +-- r' = B.cons (B.last l) r + + +utf16_le :: Codec +utf16_le = Codec name enc (toDecoding dec) where + name = T.pack "UTF-16-LE" + enc text = (TE.encodeUtf16LE text, Nothing) + dec bytes = case splitQuickly bytes of + Just (text, extra) -> (text, Right extra) + Nothing -> splitSlowly TE.decodeUtf16LE bytes + + splitQuickly bytes = maybeDecode (loop 0) where + maxN = B.length bytes + + loop n | n == maxN = decodeAll + | (n + 1) == maxN = decodeTo n + loop n = let + req = utf16Required + (B.index bytes n) + (B.index bytes (n + 1)) + decodeMore = loop $! n + req + in if n + req > maxN + then decodeTo n + else decodeMore + + decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) + decodeAll = (TE.decodeUtf16LE bytes, B.empty) + +utf16_be :: Codec +utf16_be = Codec name enc (toDecoding dec) where + name = T.pack "UTF-16-BE" + enc text = (TE.encodeUtf16BE text, Nothing) + dec bytes = case splitQuickly bytes of + Just (text, extra) -> (text, Right extra) + Nothing -> splitSlowly TE.decodeUtf16BE bytes + + splitQuickly bytes = maybeDecode (loop 0) where + maxN = B.length bytes + + loop n | n == maxN = decodeAll + | (n + 1) == maxN = decodeTo n + loop n = let + req = utf16Required + (B.index bytes (n + 1)) + (B.index bytes n) + decodeMore = loop $! n + req + in if n + req > maxN + then decodeTo n + else decodeMore + + decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) + decodeAll = (TE.decodeUtf16BE bytes, B.empty) + +utf16Required :: Word8 -> Word8 -> Int +utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where + x :: Word16 + x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 + + +utf32_le :: Codec +utf32_le = Codec name enc (toDecoding dec) where + name = T.pack "UTF-32-LE" + enc text = (TE.encodeUtf32LE text, Nothing) + dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of + Just (text, extra) -> (text, Right extra) + Nothing -> splitSlowly TE.decodeUtf32LE bs + + +utf32_be :: Codec +utf32_be = Codec name enc (toDecoding dec) where + name = T.pack "UTF-32-BE" + enc text = (TE.encodeUtf32BE text, Nothing) + dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of + Just (text, extra) -> (text, Right extra) + Nothing -> splitSlowly TE.decodeUtf32BE bs + +utf32SplitBytes :: (ByteString -> Text) + -> ByteString + -> Maybe (Text, ByteString) +utf32SplitBytes dec bytes = split where + split = maybeDecode (dec toDecode, extra) + len = B.length bytes + lenExtra = mod len 4 + + lenToDecode = len - lenExtra + (toDecode, extra) = if lenExtra == 0 + then (bytes, B.empty) + else B.splitAt lenToDecode bytes + + +tryEvaluate :: a -> Either Exc.SomeException a +tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate + +maybeDecode :: (a, b) -> Maybe (a, b) +maybeDecode (a, b) = case tryEvaluate a of + Left _ -> Nothing + Right _ -> Just (a, b) diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs new file mode 100644 index 0000000..531104a --- /dev/null +++ b/Pipes/Text/Internal/Decoding.hs @@ -0,0 +1,147 @@ +{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} +{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} +{-# LANGUAGE DeriveDataTypeable, RankNTypes #-} + +-- This module lifts assorted materials from Brian O'Sullivan's text package +-- especially Data.Text.Encoding in order to define a pipes-appropriate +-- streamDecodeUtf8 +module Pipes.Text.Internal.Decoding + ( Decoding(..) + , streamDecodeUtf8 + , decodeSomeUtf8 + ) where +import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) +import Control.Monad.ST (ST, runST) +import Data.Bits ((.&.)) +import Data.ByteString as B +import Data.ByteString (ByteString) +import Data.ByteString.Internal as B +import Data.ByteString.Char8 as B8 +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE +import Data.Text.Encoding.Error () +import Data.Text.Internal (Text, textP) +import Foreign.C.Types (CSize) +import Foreign.ForeignPtr (withForeignPtr) +import Foreign.Marshal.Utils (with) +import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) +import Foreign.Storable (Storable, peek, poke) +import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) +import GHC.Word (Word8, Word32) +import qualified Data.Text.Array as A +import Data.Word (Word8, Word16) +import System.IO.Unsafe (unsafePerformIO) +import qualified Control.Exception as Exc +import Data.Bits ((.&.), (.|.), shiftL) +import Data.Typeable +import Control.Arrow (first) +import Data.Maybe (catMaybes) +#include "pipes_text_cbits.h" + + + +-- | A stream oriented decoding result. +data Decoding = Some Text ByteString (ByteString -> Decoding) + | Other Text ByteString +instance Show Decoding where + showsPrec d (Some t bs _) = showParen (d > prec) $ + showString "Some " . showsPrec prec' t . + showChar ' ' . showsPrec prec' bs . + showString " _" + where prec = 10; prec' = prec + 1 + showsPrec d (Other t bs) = showParen (d > prec) $ + showString "Other " . showsPrec prec' t . + showChar ' ' . showsPrec prec' bs . + showString " _" + where prec = 10; prec' = prec + 1 + +newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) +newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) + +streamDecodeUtf8 :: ByteString -> Decoding +streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 + where + decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding + decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = + runST $ do marray <- A.new (len+1) + unsafeIOToST (decodeChunkToBuffer marray) + where + decodeChunkToBuffer :: A.MArray s -> IO Decoding + decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> + with (0::CSize) $ \destOffPtr -> + with codepoint0 $ \codepointPtr -> + with state0 $ \statePtr -> + with nullPtr $ \curPtrPtr -> + do let end = ptr `plusPtr` (off + len) + curPtr = ptr `plusPtr` off + poke curPtrPtr curPtr + c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr + state <- peek statePtr + lastPtr <- peek curPtrPtr + codepoint <- peek codepointPtr + n <- peek destOffPtr + chunkText <- mkText dest n + let left = lastPtr `minusPtr` curPtr + remaining = B.drop left bs + accum = if T.null chunkText then B.append old remaining else remaining + return $! case state of + UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error + _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) + {-# INLINE decodeChunkToBuffer #-} + {-# INLINE decodeChunkUtf8 #-} +{-# INLINE streamDecodeUtf8 #-} + +decodeSomeUtf8 :: ByteString -> (Text, ByteString) +decodeSomeUtf8 bs@(PS fp off len) = runST $ do + dest <- A.new (len+1) + unsafeIOToST $ + withForeignPtr fp $ \ptr -> + with (0::CSize) $ \destOffPtr -> + with (0::CodePoint) $ \codepointPtr -> + with (0::DecoderState) $ \statePtr -> + with nullPtr $ \curPtrPtr -> + do let end = ptr `plusPtr` (off + len) + curPtr = ptr `plusPtr` off + poke curPtrPtr curPtr + c_decode_utf8_with_state (A.maBA dest) destOffPtr + curPtrPtr end codepointPtr statePtr + state <- peek statePtr + lastPtr <- peek curPtrPtr + codepoint <- peek codepointPtr + n <- peek destOffPtr + chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest + return $! textP arr 0 (fromIntegral n) + let left = lastPtr `minusPtr` curPtr + remaining = B.drop left bs + return $! (chunkText, remaining) +{-# INLINE decodeSomeUtf8 #-} + +mkText :: A.MArray s -> CSize -> IO Text +mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest + return $! textP arr 0 (fromIntegral n) +{-# INLINE mkText #-} + +ord :: Char -> Int +ord (C# c#) = I# (ord# c#) +{-# INLINE ord #-} + +unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int +unsafeWrite marr i c + | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) + return 1 + | otherwise = do A.unsafeWrite marr i lo + A.unsafeWrite marr (i+1) hi + return 2 + where n = ord c + m = n - 0x10000 + lo = fromIntegral $ (m `shiftR` 10) + 0xD800 + hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 + shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) + {-# INLINE shiftR #-} +{-# INLINE unsafeWrite #-} + +foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state + :: MutableByteArray# s -> Ptr CSize + -> Ptr (Ptr Word8) -> Ptr Word8 + -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file -- cgit v1.2.3