From 3694350ac7b9c42fd64e0092a74cf0471a080058 Mon Sep 17 00:00:00 2001 From: michaelt Date: Tue, 14 Jan 2014 22:05:12 -0500 Subject: [PATCH] Use clunky Data.Text.IO when bytestring is not explicit --- Pipes/Text.hs | 37 +++++++----- Pipes/Text/Internal.hs | 126 ++++++++++++++++++++++++++++------------- bench/IO.hs | 20 +++++++ cbits/cbits.c | 113 ++++++++++++++++++++---------------- pipes-text.cabal | 7 ++- test/Test.hs | 17 +++++- 6 files changed, 215 insertions(+), 105 deletions(-) create mode 100644 bench/IO.hs diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 74d2023..cf493e9 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE RankNTypes, TypeFamilies #-} +{-# LANGUAGE RankNTypes, TypeFamilies, BangPatterns #-} {-| This module provides @pipes@ utilities for \"text streams\", which are streams of 'Text' chunks. The individual chunks are uniformly @strict@, but @@ -206,30 +206,36 @@ import Prelude hiding ( -- | Convert a lazy 'TL.Text' into a 'Producer' of strict 'Text's fromLazy :: (Monad m) => TL.Text -> Producer' Text m () fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) -{-# INLINABLE fromLazy #-} +{-# INLINE fromLazy #-} -- | Stream text from 'stdin' -stdin :: MonadIO m => Producer Text m (Producer ByteString m ()) +stdin :: MonadIO m => Producer Text m () stdin = fromHandle IO.stdin -{-# INLINABLE stdin #-} +{-# INLINE stdin #-} {-| Convert a 'IO.Handle' into a text stream using a text size - determined by the good sense of the text library. + determined by the good sense of the text library; note that this + is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@ + but uses the system encoding and has other `Data.Text.IO` features -} -fromHandle :: MonadIO m => IO.Handle -> Producer Text m (Producer ByteString m ()) -fromHandle h = decodeUtf8 (PB.fromHandle h) -{-# INLINE fromHandle#-} +fromHandle :: MonadIO m => IO.Handle -> Producer Text m () +fromHandle h = go where + go = do txt <- liftIO (T.hGetChunk h) + unless (T.null txt) $ do yield txt + go +{-# INLINABLE fromHandle#-} -{-| Stream text from a file using Pipes.Safe + +{-| Stream text from a file in the simple fashion of @Data.Text.IO@ >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout MAIN = PUTSTRLN "HELLO WORLD" -} -readFile :: (MonadSafe m) => FilePath -> Producer Text m (Producer ByteString m ()) +readFile :: MonadSafe m => FilePath -> Producer Text m () readFile file = Safe.withFile file IO.ReadMode fromHandle -{-# INLINABLE readFile #-} +{-# INLINE readFile #-} {-| Stream lines of text from stdin (for testing in ghci etc.) @@ -249,7 +255,7 @@ stdinLn = go where txt <- liftIO (T.hGetLine IO.stdin) yield txt go - +{-# INLINABLE stdinLn #-} {-| Stream text to 'stdout' @@ -305,6 +311,7 @@ toHandle h = for cat (liftIO . T.hPutStr h) -- | Stream text into a file. Uses @pipes-safe@. writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m () writeFile file = Safe.withFile file IO.WriteMode toHandle +{-# INLINE writeFile #-} -- | Apply a transformation to each 'Char' in the stream map :: (Monad m) => (Char -> Char) -> Pipe Text Text m r @@ -592,10 +599,10 @@ count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) decodeUtf8 = go B.empty PE.streamDecodeUtf8 where - go carry dec0 p = do + go !carry dec0 p = do x <- lift (next p) case x of Left r -> if B.null carry - then return (return r) -- all input was consumed + then return (return r) -- all bytestrinput was consumed else return (do yield carry -- a potentially valid fragment remains return r) @@ -605,6 +612,8 @@ decodeUtf8 = go B.empty PE.streamDecodeUtf8 where PE.Other text bs -> do yield text return (do yield bs -- an invalid blob remains p') +{-# INLINABLE decodeUtf8 #-} + -- | Splits a 'Producer' after the given number of characters splitAt diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index 73d6fa4..7e5b044 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs @@ -6,36 +6,25 @@ module Pipes.Text.Internal ( Decoding(..) , streamDecodeUtf8 + , decodeSomeUtf8 ) where - -import Control.Exception (evaluate, try) -#if __GLASGOW_HASKELL__ >= 702 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Monad.ST (ST, runST) -#else -import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST) -#endif import Data.Bits ((.&.)) -import Data.ByteString as B -import Data.ByteString.Internal as B -import Data.Text () -import qualified Data.Text as T -import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) -import Data.Text.Internal (Text(..), safe, textP) -import Data.Word (Word8, Word32) +import Data.ByteString as B +import Data.ByteString.Internal as B +import qualified Data.Text as T (null) +import Data.Text.Encoding.Error () +import Data.Text.Internal (Text, textP) import Foreign.C.Types (CSize) import Foreign.ForeignPtr (withForeignPtr) import Foreign.Marshal.Utils (with) import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) import Foreign.Storable (Storable, peek, poke) -import GHC.Base hiding (ord) -import GHC.Word +import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) +import GHC.Word (Word8, Word32) import qualified Data.Text.Array as A -import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#) -import GHC.Word (Word8(..), Word16(..), Word32(..)) - -import Data.Text.Unsafe (unsafeDupablePerformIO) - + #include "pipes_text_cbits.h" -- | A stream oriented decoding result. @@ -52,44 +41,102 @@ instance Show Decoding where showChar ' ' . showsPrec prec' bs . showString " _" where prec = 10; prec' = prec + 1 - + newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) streamDecodeUtf8 :: ByteString -> Decoding -streamDecodeUtf8 = decodeChunk B.empty 0 0 +streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 + where + decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding + decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = + runST $ do marray <- A.new (len+1) + unsafeIOToST (decodeChunkToBuffer marray) + where + decodeChunkToBuffer :: A.MArray s -> IO Decoding + decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> + with (0::CSize) $ \destOffPtr -> + with codepoint0 $ \codepointPtr -> + with state0 $ \statePtr -> + with nullPtr $ \curPtrPtr -> + do let end = ptr `plusPtr` (off + len) + curPtr = ptr `plusPtr` off + poke curPtrPtr curPtr + c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr + state <- peek statePtr + lastPtr <- peek curPtrPtr + codepoint <- peek codepointPtr + n <- peek destOffPtr + chunkText <- mkText dest n + let left = lastPtr `minusPtr` curPtr + remaining = B.drop left bs + accum = if T.null chunkText then B.append old remaining else remaining + return $! case state of + UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error + _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) + {-# INLINE decodeChunkToBuffer #-} + {-# INLINE decodeChunkUtf8 #-} +{-# INLINE streamDecodeUtf8 #-} -decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding -decodeChunk old codepoint0 state0 bs@(PS fp off len) = - runST $ do marray <- A.new (len+1) - unsafeIOToST (decodeChunkToBuffer marray) - where - decodeChunkToBuffer :: A.MArray s -> IO Decoding - decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> - with (0::CSize) $ \destOffPtr -> - with codepoint0 $ \codepointPtr -> - with state0 $ \statePtr -> - with nullPtr $ \curPtrPtr -> +decodeSomeUtf8 :: ByteString -> (Text, ByteString) +decodeSomeUtf8 bs@(PS fp off len) = runST $ do + dest <- A.new (len+1) + unsafeIOToST $ + withForeignPtr fp $ \ptr -> + with (0::CSize) $ \destOffPtr -> + with (0::CodePoint) $ \codepointPtr -> + with (0::DecoderState) $ \statePtr -> + with nullPtr $ \curPtrPtr -> do let end = ptr `plusPtr` (off + len) curPtr = ptr `plusPtr` off poke curPtrPtr curPtr - c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr + c_decode_utf8_with_state (A.maBA dest) destOffPtr + curPtrPtr end codepointPtr statePtr state <- peek statePtr lastPtr <- peek curPtrPtr codepoint <- peek codepointPtr n <- peek destOffPtr - chunkText <- mkText dest n + chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest + return $! textP arr 0 (fromIntegral n) let left = lastPtr `minusPtr` curPtr remaining = B.drop left bs - accum = if T.null chunkText then B.append old remaining else remaining - return $ case state of - UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error - _ -> Some chunkText accum (decodeChunk accum codepoint state) + return $! (chunkText, remaining) +{-# INLINE decodeSomeUtf8 #-} + +-- decodeSomeUtf8 :: ByteString -> (Text, ByteString) +-- decodeSomeUtf8 bs@(PS fp off len) = +-- runST $ do marray <- A.new (len+1) +-- unsafeIOToST (decodeChunkToBuffer marray) +-- +-- where +-- decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString) +-- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> +-- with (0::CSize) $ \destOffPtr -> +-- with (0::CodePoint) $ \codepointPtr -> +-- with (0::DecoderState) $ \statePtr -> +-- with nullPtr $ \curPtrPtr -> +-- do let end = ptr `plusPtr` (off + len) +-- curPtr = ptr `plusPtr` off +-- poke curPtrPtr curPtr +-- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr +-- state <- peek statePtr +-- lastPtr <- peek curPtrPtr +-- codepoint <- peek codepointPtr +-- n <- peek destOffPtr +-- chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest +-- return $! textP arr 0 (fromIntegral n) +-- let left = lastPtr `minusPtr` curPtr +-- remaining = B.drop left bs +-- return $! (chunkText, remaining) +-- {-# INLINE decodeChunkToBuffer #-} +-- {-# INLINE decodeSomeUtf8 #-} + mkText :: A.MArray s -> CSize -> IO Text mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest return $! textP arr 0 (fromIntegral n) +{-# INLINE mkText #-} ord :: Char -> Int ord (C# c#) = I# (ord# c#) @@ -107,6 +154,7 @@ unsafeWrite marr i c lo = fromIntegral $ (m `shiftR` 10) + 0xD800 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) + {-# INLINE shiftR #-} {-# INLINE unsafeWrite #-} foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state diff --git a/bench/IO.hs b/bench/IO.hs new file mode 100644 index 0000000..b3a52f6 --- /dev/null +++ b/bench/IO.hs @@ -0,0 +1,20 @@ +import qualified Data.Text.IO as T +import qualified Data.Text as T +import qualified Data.Text.Lazy.IO as TL +import qualified Data.Text.Lazy as TL + +import Pipes +import qualified Pipes.Text as TP +import qualified Pipes.ByteString as BP +import Pipes.Safe + +main = textaction +big = "../../examples/txt/words2.txt" + +textaction = T.readFile big >>= T.putStrLn +pipeaction = runEffect $ for ((TP.readFile big) >> return ()) (lift . T.putStrLn) + + + + + diff --git a/cbits/cbits.c b/cbits/cbits.c index e0fdfd5..c11645b 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c @@ -79,30 +79,38 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) { * state0 != UTF8_ACCEPT, UTF8_REJECT * */ -const uint8_t * -_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, - const uint8_t **const src, - const uint8_t *const srcend, - uint32_t *codepoint0, uint32_t *state0) + + #if defined(__GNUC__) || defined(__clang__) + static inline uint8_t const * + _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, + const uint8_t const **src, const uint8_t const *srcend, + uint32_t *codepoint0, uint32_t *state0) + __attribute((always_inline)); + #endif + +static inline uint8_t const * +_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, + const uint8_t const **src, const uint8_t const *srcend, + uint32_t *codepoint0, uint32_t *state0) { - uint16_t *d = dest + *destoff; - const uint8_t *s = *src, *last = *src; - uint32_t state = *state0; - uint32_t codepoint = *codepoint0; + uint16_t *d = dest + *destoff; + const uint8_t *s = *src, *last = *src; + uint32_t state = *state0; + uint32_t codepoint = *codepoint0; - while (s < srcend) { + while (s < srcend) { #if defined(__i386__) || defined(__x86_64__) - /* - * This code will only work on a little-endian system that - * supports unaligned loads. - * - * It gives a substantial speed win on data that is purely or - * partly ASCII (e.g. HTML), at only a slight cost on purely - * non-ASCII text. - */ - - if (state == UTF8_ACCEPT) { - while (s < srcend - 4) { + /* + * This code will only work on a little-endian system that + * supports unaligned loads. + * + * It gives a substantial speed win on data that is purely or + * partly ASCII (e.g. HTML), at only a slight cost on purely + * non-ASCII text. + */ + + if (state == UTF8_ACCEPT) { + while (s < srcend - 4) { codepoint = *((uint32_t *) s); if ((codepoint & 0x80808080) != 0) break; @@ -117,35 +125,44 @@ _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, *d++ = (uint16_t) ((codepoint >> 8) & 0xff); *d++ = (uint16_t) ((codepoint >> 16) & 0xff); *d++ = (uint16_t) ((codepoint >> 24) & 0xff); - } - last = s; - } + } + last = s; + } #endif - if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { - if (state != UTF8_REJECT) + if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { + if (state != UTF8_REJECT) continue; - break; - } - - if (codepoint <= 0xffff) - *d++ = (uint16_t) codepoint; - else { - *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); - *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); - } - last = s; - } - - /* Invalid encoding, back up to the errant character */ - if (state == UTF8_REJECT) - s -= 1; - - *destoff = d - dest; - *codepoint0 = codepoint; - *state0 = state; - *src = last; - - return s; + break; + } + + if (codepoint <= 0xffff) + *d++ = (uint16_t) codepoint; + else { + *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); + *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); + } + last = s; + } + + *destoff = d - dest; + *codepoint0 = codepoint; + *state0 = state; + *src = last; + + return s; +} + +uint8_t const * +_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, + const uint8_t const **src, + const uint8_t const *srcend, + uint32_t *codepoint0, uint32_t *state0) +{ + uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend, + codepoint0, state0); + if (*state0 == UTF8_REJECT) + ret -=1; + return ret; } diff --git a/pipes-text.cabal b/pipes-text.cabal index 86fbab8..b4388be 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal @@ -25,6 +25,9 @@ library pipes-bytestring >= 1.0 && < 1.2, transformers >= 0.3 && < 0.4, text >=0.11 && < 0.12, - bytestring >=0.10 && < 0.11 + bytestring >=0.10 && < 0.11, + vector, + void -- hs-source-dirs: - default-language: Haskell2010 \ No newline at end of file + default-language: Haskell2010 + ghc-options: -O2 diff --git a/test/Test.hs b/test/Test.hs index f2bf17b..373bafb 100644 --- a/test/Test.hs +++ b/test/Test.hs @@ -27,11 +27,11 @@ import qualified Pipes as P main :: IO () main = defaultMain [tests] -- >>> :main -a 10000 - tests = testGroup "stream_decode" [ -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid, testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed , - testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe] + testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe, + testProperty "t_utf8_dec_some" t_utf8_dec_some] t_utf8_incr_valid = do Positive n <- arbitrary @@ -82,6 +82,19 @@ t_utf8_incr_pipe = do chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append +-- +t_utf8_dec_some = do + Positive m <- arbitrary + txt <- genUnicode + let bytesLength = mod 10 m :: Int + forAll (vector bytesLength) $ + (roundtrip . appendBytes txt) + `eq` + appendBytes txt + where + roundtrip bs = case PE.decodeSomeUtf8 bs of + (txt,bys) -> E.encodeUtf8 txt <> bys + appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append -- 2.41.0