From bbdfd3056da4992e18d3983fd5992bee23af93af Mon Sep 17 00:00:00 2001 From: michaelt Date: Sat, 15 Feb 2014 10:01:48 -0500 Subject: use new text-stream-decoding --- Pipes/Text/Encoding.hs | 205 ++++++++++++++++++++++++++++++++++++++ Pipes/Text/IO.hs | 96 ++++++++++++++++++ Pipes/Text/Internal.hs | 7 -- Pipes/Text/Internal/Codec.hs | 216 ---------------------------------------- Pipes/Text/Internal/Decoding.hs | 154 ---------------------------- 5 files changed, 301 insertions(+), 377 deletions(-) create mode 100644 Pipes/Text/Encoding.hs create mode 100644 Pipes/Text/IO.hs delete mode 100644 Pipes/Text/Internal.hs delete mode 100644 Pipes/Text/Internal/Codec.hs delete mode 100644 Pipes/Text/Internal/Decoding.hs (limited to 'Pipes/Text') diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs new file mode 100644 index 0000000..2bb5807 --- /dev/null +++ b/Pipes/Text/Encoding.hs @@ -0,0 +1,205 @@ + +{-# LANGUAGE RankNTypes, BangPatterns #-} +-- | +-- Copyright: 2014 Michael Thompson +-- +-- This module uses the stream decoding functions from the text-stream-decoding package +-- to define pipes decoding functions and lenses. + +module Pipes.Text.Encoding + ( DecodeResult (..) + , Codec + , decodeUtf8 + , decodeUtf8Pure + , decodeUtf16LE + , decodeUtf16BE + , decodeUtf32LE + , decodeUtf32BE + , utf8 + , utf8Pure + , utf16LE + , utf16BE + , utf32LE + , utf32BE + , encodeAscii + , decodeAscii + , encodeIso8859_1 + , decodeIso8859_1 + ) + where + +import Data.Char (ord) +import Data.ByteString as B +import Data.ByteString (ByteString) +import Data.ByteString.Internal as B +import Data.ByteString.Char8 as B8 +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE +import Data.Text.StreamDecoding +import GHC.Word (Word8, Word32) +import Data.Word (Word8, Word16) +import Control.Monad +import Pipes +import Pipes.Core + + + +{- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text. + They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc. + The stream of text they 'see' in a bytestream ends by returning the original byte stream + beginning at the point of failure, or the empty bytestream with its return value. + -} +type Codec = forall f m r . (Functor f , Monad m ) => + (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r))) + -> Producer ByteString m r -> f (Producer ByteString m r ) + +decodeStream :: Monad m + => (B.ByteString -> DecodeResult) + -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeStream = loop where + loop dec0 p = + do x <- lift (next p) + case x of Left r -> return (return r) + Right (chunk, p') -> case dec0 chunk of + DecodeResultSuccess text dec -> do yield text + loop dec p' + DecodeResultFailure text bs -> do yield text + return (do yield bs + p') +{-# INLINABLE decodeStream#-} + +decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf8 = decodeStream streamUtf8 +{-# INLINE decodeUtf8 #-} + +decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf8Pure = decodeStream streamUtf8Pure +{-# INLINE decodeUtf8Pure #-} + +decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf16LE = decodeStream streamUtf16LE +{-# INLINE decodeUtf16LE #-} + +decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf16BE = decodeStream streamUtf16BE +{-# INLINE decodeUtf16BE #-} + +decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf32LE = decodeStream streamUtf32LE +{-# INLINE decodeUtf32LE #-} + +decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf32BE = decodeStream streamUtf32BE +{-# INLINE decodeUtf32BE #-} + +mkCodec :: (forall r m . Monad m => + Producer ByteString m r -> Producer Text m (Producer ByteString m r )) + -> (Text -> ByteString) + -> Codec +mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) + + +{- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated + text stream ends by returning the original bytestream beginning at the point of failure, + or the empty bytestring for a well-encoded text. + -} + +utf8 :: Codec +utf8 = mkCodec decodeUtf8 TE.encodeUtf8 + +utf8Pure :: Codec +utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 + +utf16LE :: Codec +utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE + +utf16BE :: Codec +utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE + +utf32LE :: Codec +utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE + +utf32BE :: Codec +utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE + + +{- | ascii and latin encodings only use a small number of the characters 'Text' + recognizes; thus we cannot use the pipes 'Lens' style to work with them. + Rather we simply define functions each way. + + 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream, + returning the rest of the 'Text' at the first non-ascii 'Char' +-} + +encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) +encodeAscii = go where + go p = do e <- lift (next p) + case e of + Left r -> return (return r) + Right (chunk, p') -> + if T.null chunk + then go p' + else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk + in do yield (B8.pack (T.unpack safe)) + if T.null unsafe + then go p' + else return $ do yield unsafe + p' + +{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream, + returning the rest of the 'Text' upon hitting any non-latin 'Char' + -} +encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) +encodeIso8859_1 = go where + go p = do e <- lift (next p) + case e of + Left r -> return (return r) + Right (txt, p') -> + if T.null txt + then go p' + else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt + in do yield (B8.pack (T.unpack safe)) + if T.null unsafe + then go p' + else return $ do yield unsafe + p' + +{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the + unused 'ByteString' upon hitting an un-ascii byte. + -} +decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeAscii = go where + go p = do e <- lift (next p) + case e of + Left r -> return (return r) + Right (chunk, p') -> + if B.null chunk + then go p' + else let (safe, unsafe) = B.span (<= 0x7F) chunk + in do yield (T.pack (B8.unpack safe)) + if B.null unsafe + then go p' + else return (do yield unsafe + p') + +{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the + unused 'ByteString' upon hitting the rare un-latinizable byte. + -} +decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeIso8859_1 = go where + go p = do e <- lift (next p) + case e of + Left r -> return (return r) + Right (chunk, p') -> + if B.null chunk + then go p' + else do let (safe, unsafe) = B.span (<= 0xFF) chunk + yield (T.pack (B8.unpack safe)) + if B.null unsafe + then go p' + else return (do yield unsafe + p') + + + diff --git a/Pipes/Text/IO.hs b/Pipes/Text/IO.hs new file mode 100644 index 0000000..3c9ac98 --- /dev/null +++ b/Pipes/Text/IO.hs @@ -0,0 +1,96 @@ +{-#LANGUAGE RankNTypes#-} + +module Pipes.Text.IO + ( stdin + , stdout + , fromHandle + , toHandle + , readFile + , writeFile + ) where + +import qualified System.IO as IO +import Control.Exception (throwIO, try) +import Foreign.C.Error (Errno(Errno), ePIPE) +import qualified GHC.IO.Exception as G +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.IO as T +import Pipes +import qualified Pipes.Safe.Prelude as Safe +import qualified Pipes.Safe as Safe +import Pipes.Safe (MonadSafe(..), Base(..)) +import Prelude hiding (readFile, writeFile) + +-- | Stream text from 'stdin' +stdin :: MonadIO m => Producer Text m () +stdin = fromHandle IO.stdin +{-# INLINE stdin #-} + +{-| Convert a 'IO.Handle' into a text stream using a text size + determined by the good sense of the text library; note that this + is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@ + but uses the system encoding and has other `Data.Text.IO` features +-} + +fromHandle :: MonadIO m => IO.Handle -> Producer Text m () +fromHandle h = go where + go = do txt <- liftIO (T.hGetChunk h) + if T.null txt then return () + else do yield txt + go +{-# INLINABLE fromHandle#-} + + +{-| Stream text from a file in the simple fashion of @Data.Text.IO@ + +>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout +MAIN = PUTSTRLN "HELLO WORLD" +-} + +readFile :: MonadSafe m => FilePath -> Producer Text m () +readFile file = Safe.withFile file IO.ReadMode fromHandle +{-# INLINE readFile #-} + + +{-| Stream text to 'stdout' + + Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe. + + Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@ + instead of @(source >-> stdout)@ . +-} +stdout :: MonadIO m => Consumer' Text m () +stdout = go + where + go = do + txt <- await + x <- liftIO $ try (T.putStr txt) + case x of + Left (G.IOError { G.ioe_type = G.ResourceVanished + , G.ioe_errno = Just ioe }) + | Errno ioe == ePIPE + -> return () + Left e -> liftIO (throwIO e) + Right () -> go +{-# INLINABLE stdout #-} + + +{-| Convert a text stream into a 'Handle' + + Note: again, for best performance, where possible use + @(for source (liftIO . hPutStr handle))@ instead of @(source >-> toHandle handle)@. +-} +toHandle :: MonadIO m => IO.Handle -> Consumer' Text m r +toHandle h = for cat (liftIO . T.hPutStr h) +{-# INLINABLE toHandle #-} + +{-# RULES "p >-> toHandle h" forall p h . + p >-> toHandle h = for p (\txt -> liftIO (T.hPutStr h txt)) + #-} + + +-- | Stream text into a file. Uses @pipes-safe@. +writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m () +writeFile file = Safe.withFile file IO.WriteMode toHandle +{-# INLINE writeFile #-} diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs deleted file mode 100644 index 582ef14..0000000 --- a/Pipes/Text/Internal.hs +++ /dev/null @@ -1,7 +0,0 @@ -module Pipes.Text.Internal - (module Pipes.Text.Internal.Codec - , module Pipes.Text.Internal.Decoding - ) where - -import Pipes.Text.Internal.Codec -import Pipes.Text.Internal.Decoding \ No newline at end of file diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs deleted file mode 100644 index 075a152..0000000 --- a/Pipes/Text/Internal/Codec.hs +++ /dev/null @@ -1,216 +0,0 @@ - -{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-} --- | --- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin --- License: MIT --- This Parts of this code were taken from enumerator and conduits, and adapted for pipes - --- This module follows the model of the enumerator and conduits libraries, and defines --- 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and --- iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so --- special functions appear in @Pipes.Text@ - - -module Pipes.Text.Internal.Codec - ( Codec(..) - , TextException(..) - , utf8 - , utf16_le - , utf16_be - , utf32_le - , utf32_be - ) where - -import Data.Bits ((.&.)) -import Data.Char (ord) -import Data.ByteString as B -import Data.ByteString (ByteString) -import Data.ByteString.Internal as B -import Data.ByteString.Char8 as B8 -import Data.Text (Text) -import qualified Data.Text as T -import qualified Data.Text.Encoding as TE -import Data.Text.Encoding.Error () -import GHC.Word (Word8, Word32) -import qualified Data.Text.Array as A -import Data.Word (Word8, Word16) -import System.IO.Unsafe (unsafePerformIO) -import qualified Control.Exception as Exc -import Data.Bits ((.&.), (.|.), shiftL) -import Data.Typeable -import Control.Arrow (first) -import Data.Maybe (catMaybes) -import Pipes.Text.Internal.Decoding -import Pipes --- | A specific character encoding. - -data Codec = Codec - { codecName :: Text - , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) - , codecDecode :: ByteString -> Decoding - } - -instance Show Codec where - showsPrec d c = showParen (d > 10) $ - showString "Codec " . shows (codecName c) - -data TextException = DecodeException Codec Word8 - | EncodeException Codec Char - | LengthExceeded Int - | TextException Exc.SomeException - deriving (Show, Typeable) -instance Exc.Exception TextException - - -toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) - -> (ByteString -> Decoding) -toDecoding op = loop B.empty where - loop !extra bs0 = case op (B.append extra bs0) of - (txt, Right bs) -> Some txt bs (loop bs) - (txt, Left (_,bs)) -> Other txt bs --- To do: toDecoding should be inlined in each of the 'Codec' definitions --- or else Codec changed to the conduit/enumerator definition. We have --- altered it to use 'streamDecodeUtf8' - -splitSlowly :: (ByteString -> Text) - -> ByteString - -> (Text, Either (TextException, ByteString) ByteString) -splitSlowly dec bytes = valid where - valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) - splits 0 = [(B.empty, bytes)] - splits n = B.splitAt n bytes : splits (n - 1) - decFirst (a, b) = case tryEvaluate (dec a) of - Left _ -> Nothing - Right text -> let trouble = case tryEvaluate (dec b) of - Left exc -> Left (TextException exc, b) - Right _ -> Right B.empty - in Just (text, trouble) -- this case shouldn't occur, - -- since splitSlowly is only called - -- when parsing failed somewhere - -utf8 :: Codec -utf8 = Codec name enc (toDecoding dec) where - name = T.pack "UTF-8" - enc text = (TE.encodeUtf8 text, Nothing) - dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b) - --- -- Whether the given byte is a continuation byte. --- isContinuation byte = byte .&. 0xC0 == 0x80 --- --- -- The number of continuation bytes needed by the given --- -- non-continuation byte. Returns -1 for an illegal UTF-8 --- -- non-continuation byte and the whole split quickly must fail so --- -- as the input is passed to TE.decodeUtf8, which will issue a --- -- suitable error. --- required x0 --- | x0 .&. 0x80 == 0x00 = 0 --- | x0 .&. 0xE0 == 0xC0 = 1 --- | x0 .&. 0xF0 == 0xE0 = 2 --- | x0 .&. 0xF8 == 0xF0 = 3 --- | otherwise = -1 --- --- splitQuickly bytes --- | B.null l || req == -1 = Nothing --- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) --- | otherwise = Just (TE.decodeUtf8 l', r') --- where --- (l, r) = B.spanEnd isContinuation bytes --- req = required (B.last l) --- l' = B.init l --- r' = B.cons (B.last l) r - - -utf16_le :: Codec -utf16_le = Codec name enc (toDecoding dec) where - name = T.pack "UTF-16-LE" - enc text = (TE.encodeUtf16LE text, Nothing) - dec bytes = case splitQuickly bytes of - Just (text, extra) -> (text, Right extra) - Nothing -> splitSlowly TE.decodeUtf16LE bytes - - splitQuickly bytes = maybeDecode (loop 0) where - maxN = B.length bytes - - loop n | n == maxN = decodeAll - | (n + 1) == maxN = decodeTo n - loop n = let - req = utf16Required - (B.index bytes n) - (B.index bytes (n + 1)) - decodeMore = loop $! n + req - in if n + req > maxN - then decodeTo n - else decodeMore - - decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) - decodeAll = (TE.decodeUtf16LE bytes, B.empty) - -utf16_be :: Codec -utf16_be = Codec name enc (toDecoding dec) where - name = T.pack "UTF-16-BE" - enc text = (TE.encodeUtf16BE text, Nothing) - dec bytes = case splitQuickly bytes of - Just (text, extra) -> (text, Right extra) - Nothing -> splitSlowly TE.decodeUtf16BE bytes - - splitQuickly bytes = maybeDecode (loop 0) where - maxN = B.length bytes - - loop n | n == maxN = decodeAll - | (n + 1) == maxN = decodeTo n - loop n = let - req = utf16Required - (B.index bytes (n + 1)) - (B.index bytes n) - decodeMore = loop $! n + req - in if n + req > maxN - then decodeTo n - else decodeMore - - decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) - decodeAll = (TE.decodeUtf16BE bytes, B.empty) - -utf16Required :: Word8 -> Word8 -> Int -utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where - x :: Word16 - x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 - - -utf32_le :: Codec -utf32_le = Codec name enc (toDecoding dec) where - name = T.pack "UTF-32-LE" - enc text = (TE.encodeUtf32LE text, Nothing) - dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of - Just (text, extra) -> (text, Right extra) - Nothing -> splitSlowly TE.decodeUtf32LE bs - - -utf32_be :: Codec -utf32_be = Codec name enc (toDecoding dec) where - name = T.pack "UTF-32-BE" - enc text = (TE.encodeUtf32BE text, Nothing) - dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of - Just (text, extra) -> (text, Right extra) - Nothing -> splitSlowly TE.decodeUtf32BE bs - -utf32SplitBytes :: (ByteString -> Text) - -> ByteString - -> Maybe (Text, ByteString) -utf32SplitBytes dec bytes = split where - split = maybeDecode (dec toDecode, extra) - len = B.length bytes - lenExtra = mod len 4 - - lenToDecode = len - lenExtra - (toDecode, extra) = if lenExtra == 0 - then (bytes, B.empty) - else B.splitAt lenToDecode bytes - - -tryEvaluate :: a -> Either Exc.SomeException a -tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate - -maybeDecode :: (a, b) -> Maybe (a, b) -maybeDecode (a, b) = case tryEvaluate a of - Left _ -> Nothing - Right _ -> Just (a, b) diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs deleted file mode 100644 index b5d928a..0000000 --- a/Pipes/Text/Internal/Decoding.hs +++ /dev/null @@ -1,154 +0,0 @@ -{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} -{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} -{-# LANGUAGE DeriveDataTypeable, RankNTypes #-} - --- This module lifts assorted materials from Brian O'Sullivan's text package --- especially @Data.Text.Encoding@ in order to define a pipes-appropriate --- 'streamDecodeUtf8' - -module Pipes.Text.Internal.Decoding - ( Decoding(..) - , streamDecodeUtf8 - , decodeSomeUtf8 - ) where -import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) -import Control.Monad.ST (ST, runST) -import Data.Bits ((.&.)) -import Data.ByteString as B -import Data.ByteString (ByteString) -import Data.ByteString.Internal as B -import Data.ByteString.Char8 as B8 -import Data.Text (Text) -import qualified Data.Text as T -import qualified Data.Text.Encoding as TE -import Data.Text.Encoding.Error () -import Data.Text.Internal (Text, textP) -import Foreign.C.Types (CSize) -import Foreign.ForeignPtr (withForeignPtr) -import Foreign.Marshal.Utils (with) -import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) -import Foreign.Storable (Storable, peek, poke) -import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) -import GHC.Word (Word8, Word32) -import qualified Data.Text.Array as A -import Data.Word (Word8, Word16) -import System.IO.Unsafe (unsafePerformIO) -import qualified Control.Exception as Exc -import Data.Bits ((.&.), (.|.), shiftL) -import Data.Typeable -import Control.Arrow (first) -import Data.Maybe (catMaybes) -#include "pipes_text_cbits.h" - - - --- A stream oriented decoding result. Distinct from the similar type in Data.Text.Encoding - -data Decoding = Some Text ByteString (ByteString -> Decoding) - -- Text, continuation and any undecoded fragment. - | Other Text ByteString - -- Text followed by an undecodable ByteString - -instance Show Decoding where - showsPrec d (Some t bs _) = showParen (d > prec) $ - showString "Some " . showsPrec prec' t . - showChar ' ' . showsPrec prec' bs . - showString " _" - where prec = 10; prec' = prec + 1 - showsPrec d (Other t bs) = showParen (d > prec) $ - showString "Other " . showsPrec prec' t . - showChar ' ' . showsPrec prec' bs . - showString " _" - where prec = 10; prec' = prec + 1 - -newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) -newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) - --- Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'. -streamDecodeUtf8 :: ByteString -> Decoding -streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 - where - decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding - decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = - runST $ do marray <- A.new (len+1) - unsafeIOToST (decodeChunkToBuffer marray) - where - decodeChunkToBuffer :: A.MArray s -> IO Decoding - decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> - with (0::CSize) $ \destOffPtr -> - with codepoint0 $ \codepointPtr -> - with state0 $ \statePtr -> - with nullPtr $ \curPtrPtr -> - do let end = ptr `plusPtr` (off + len) - curPtr = ptr `plusPtr` off - poke curPtrPtr curPtr - c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr - state <- peek statePtr - lastPtr <- peek curPtrPtr - codepoint <- peek codepointPtr - n <- peek destOffPtr - chunkText <- mkText dest n - let left = lastPtr `minusPtr` curPtr - remaining = B.drop left bs - accum = if T.null chunkText then B.append old remaining else remaining - return $! case state of - UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error - _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) - {-# INLINE decodeChunkToBuffer #-} - {-# INLINE decodeChunkUtf8 #-} -{-# INLINE streamDecodeUtf8 #-} - --- Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble -decodeSomeUtf8 :: ByteString -> (Text, ByteString) -decodeSomeUtf8 bs@(PS fp off len) = runST $ do - dest <- A.new (len+1) - unsafeIOToST $ - withForeignPtr fp $ \ptr -> - with (0::CSize) $ \destOffPtr -> - with (0::CodePoint) $ \codepointPtr -> - with (0::DecoderState) $ \statePtr -> - with nullPtr $ \curPtrPtr -> - do let end = ptr `plusPtr` (off + len) - curPtr = ptr `plusPtr` off - poke curPtrPtr curPtr - c_decode_utf8_with_state (A.maBA dest) destOffPtr - curPtrPtr end codepointPtr statePtr - state <- peek statePtr - lastPtr <- peek curPtrPtr - codepoint <- peek codepointPtr - n <- peek destOffPtr - chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest - return $! textP arr 0 (fromIntegral n) - let left = lastPtr `minusPtr` curPtr - remaining = B.drop left bs - return $! (chunkText, remaining) -{-# INLINE decodeSomeUtf8 #-} - -mkText :: A.MArray s -> CSize -> IO Text -mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest - return $! textP arr 0 (fromIntegral n) -{-# INLINE mkText #-} - -ord :: Char -> Int -ord (C# c#) = I# (ord# c#) -{-# INLINE ord #-} - -unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int -unsafeWrite marr i c - | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) - return 1 - | otherwise = do A.unsafeWrite marr i lo - A.unsafeWrite marr (i+1) hi - return 2 - where n = ord c - m = n - 0x10000 - lo = fromIntegral $ (m `shiftR` 10) + 0xD800 - hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 - shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) - {-# INLINE shiftR #-} -{-# INLINE unsafeWrite #-} - -foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state - :: MutableByteArray# s -> Ptr CSize - -> Ptr (Ptr Word8) -> Ptr Word8 - -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file -- cgit v1.2.3