6 files changed, 215 insertions, 105 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs
index 74d2023..cf493e9 100644
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE RankNTypes, TypeFamilies #-}
+{-# LANGUAGE RankNTypes, TypeFamilies, BangPatterns #-}
 {-| This module provides @pipes@ utilities for \"text streams\", which are
    streams of 'Text' chunks.  The individual chunks are uniformly @strict@, but 
@@ -206,30 +206,36 @@ import Prelude hiding (
 -- | Convert a lazy 'TL.Text' into a 'Producer' of strict 'Text's
 fromLazy :: (Monad m) => TL.Text -> Producer' Text m ()
 fromLazy  = foldrChunks (\e a -> yield e >> a) (return ()) 
-{-# INLINABLE fromLazy #-}
+{-# INLINE fromLazy #-}
 -- | Stream text from 'stdin'
-stdin :: MonadIO m => Producer Text m (Producer ByteString m ())
+stdin :: MonadIO m => Producer Text m ()
 stdin = fromHandle IO.stdin
-{-# INLINABLE stdin #-}
+{-# INLINE stdin #-}
 {-| Convert a 'IO.Handle' into a text stream using a text size 
-    determined by the good sense of the text library. 
+    determined by the good sense of the text library; note that this
+    is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@
+    but uses the system encoding and has other `Data.Text.IO` features
 -}
-fromHandle :: MonadIO m => IO.Handle -> Producer Text m (Producer ByteString m ())
+fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()
-fromHandle h = decodeUtf8 (PB.fromHandle h)
+fromHandle h =  go where
-{-# INLINE fromHandle#-}
+      go = do txt <- liftIO (T.hGetChunk h)
+              unless (T.null txt) $ do yield txt
+                                       go
+{-# INLINABLE fromHandle#-}
-{-| Stream text from a file using Pipes.Safe
+{-| Stream text from a file in the simple fashion of @Data.Text.IO@ 
 >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
 MAIN = PUTSTRLN "HELLO WORLD"
 -}
-readFile :: (MonadSafe m) => FilePath -> Producer Text m (Producer ByteString m ())
+readFile :: MonadSafe m => FilePath -> Producer Text m ()
 readFile file = Safe.withFile file IO.ReadMode fromHandle
-{-# INLINABLE readFile #-}
+{-# INLINE readFile #-}
 {-| Stream lines of text from stdin (for testing in ghci etc.) 
@@ -249,7 +255,7 @@ stdinLn = go where
            txt <- liftIO (T.hGetLine IO.stdin)
            yield txt
            go
+{-# INLINABLE stdinLn #-}
 {-| Stream text to 'stdout'
@@ -305,6 +311,7 @@ toHandle h = for cat (liftIO . T.hPutStr h)
 -- | Stream text into a file. Uses @pipes-safe@.
 writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
 writeFile file = Safe.withFile file IO.WriteMode toHandle
+{-# INLINE writeFile #-}
 -- | Apply a transformation to each 'Char' in the stream
 map :: (Monad m) => (Char -> Char) -> Pipe Text Text m r
@@ -592,10 +599,10 @@ count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
 decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
 decodeUtf8 = go B.empty PE.streamDecodeUtf8 where
-  go carry dec0 p = do 
+  go !carry dec0 p = do 
     x <- lift (next p) 
     case x of Left r -> if B.null carry 
-                           then return (return r)      -- all input was consumed
+                           then return (return r)      -- all bytestrinput was consumed
                           else return (do yield carry -- a potentially valid fragment remains
                                           return r)
                                           
@@ -605,6 +612,8 @@ decodeUtf8 = go B.empty PE.streamDecodeUtf8 where
                   PE.Other text bs -> do yield text 
                                          return (do yield bs -- an invalid blob remains
                                                     p')
+{-# INLINABLE decodeUtf8 #-}
 -- | Splits a 'Producer' after the given number of characters
 splitAt
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
index 73d6fa4..7e5b044 100644
--- a/Pipes/Text/Internal.hs
+++ b/Pipes/Text/Internal.hs
@@ -6,36 +6,25 @@
 module Pipes.Text.Internal 
    ( Decoding(..)
    , streamDecodeUtf8
+    , decodeSomeUtf8
    ) where
-import Control.Exception (evaluate, try)
-#if __GLASGOW_HASKELL__ >= 702
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 import Control.Monad.ST (ST, runST)
-#else
-import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
-#endif
 import Data.Bits ((.&.))
-import Data.ByteString as B
+import Data.ByteString as B 
-import Data.ByteString.Internal as B
+import Data.ByteString.Internal as B 
-import Data.Text ()
+import qualified Data.Text as T (null)
-import qualified Data.Text as T
+import Data.Text.Encoding.Error ()
-import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
+import Data.Text.Internal (Text, textP)
-import Data.Text.Internal (Text(..), safe, textP)
-import Data.Word (Word8, Word32)
 import Foreign.C.Types (CSize)
 import Foreign.ForeignPtr (withForeignPtr)
 import Foreign.Marshal.Utils (with)
 import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
 import Foreign.Storable (Storable, peek, poke)
-import GHC.Base hiding (ord)
+import GHC.Base  (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
-import GHC.Word
+import GHC.Word (Word8, Word32)
 import qualified Data.Text.Array as A
-import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
+    
-import GHC.Word (Word8(..), Word16(..), Word32(..))
-import Data.Text.Unsafe (unsafeDupablePerformIO)
 #include "pipes_text_cbits.h"
 -- | A stream oriented decoding result.
@@ -52,44 +41,102 @@ instance Show Decoding where
                                showChar ' ' . showsPrec prec' bs .
                                showString " _"
      where prec = 10; prec' = prec + 1
-      
 newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
 newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
 streamDecodeUtf8 :: ByteString -> Decoding
-streamDecodeUtf8 = decodeChunk B.empty 0 0
+streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 
+  where
+  decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
+  decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = 
+                    runST $ do marray <- A.new (len+1) 
+                               unsafeIOToST (decodeChunkToBuffer marray)
+     where
+     decodeChunkToBuffer :: A.MArray s -> IO Decoding
+     decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+       with (0::CSize) $ \destOffPtr ->
+       with codepoint0 $ \codepointPtr ->
+       with state0     $ \statePtr ->
+       with nullPtr    $ \curPtrPtr ->
+         do let end = ptr `plusPtr` (off + len)
+                curPtr = ptr `plusPtr` off
+            poke curPtrPtr curPtr
+            c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+            state <- peek statePtr
+            lastPtr <- peek curPtrPtr
+            codepoint <- peek codepointPtr
+            n <- peek destOffPtr
+            chunkText <- mkText dest n
+            let left      = lastPtr `minusPtr` curPtr
+                remaining = B.drop left bs
+                accum = if T.null chunkText then B.append old remaining  else remaining 
+            return $! case state of 
+              UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
+              _ ->           Some  chunkText accum (decodeChunkUtf8 accum codepoint state)
+     {-# INLINE decodeChunkToBuffer #-}
+  {-# INLINE decodeChunkUtf8 #-}
+{-# INLINE streamDecodeUtf8 #-}
-decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
+decodeSomeUtf8 :: ByteString -> (Text, ByteString)
-decodeChunk old codepoint0 state0 bs@(PS fp off len) = 
+decodeSomeUtf8 bs@(PS fp off len) = runST $ do 
-                  runST $ do marray <- A.new (len+1) 
+  dest <- A.new (len+1) 
-                             unsafeIOToST (decodeChunkToBuffer marray)
+  unsafeIOToST $ 
-   where
+     withForeignPtr fp $ \ptr ->
-   decodeChunkToBuffer :: A.MArray s -> IO Decoding
+     with (0::CSize)        $ \destOffPtr ->
-   decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+     with (0::CodePoint)    $ \codepointPtr ->
-     with (0::CSize) $ \destOffPtr ->
+     with (0::DecoderState) $ \statePtr ->
-     with codepoint0 $ \codepointPtr ->
+     with nullPtr           $ \curPtrPtr ->
-     with state0     $ \statePtr ->
-     with nullPtr    $ \curPtrPtr ->
       do let end = ptr `plusPtr` (off + len)
              curPtr = ptr `plusPtr` off
          poke curPtrPtr curPtr
-          c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+          c_decode_utf8_with_state (A.maBA dest) destOffPtr 
+                                   curPtrPtr end codepointPtr statePtr
          state <- peek statePtr
          lastPtr <- peek curPtrPtr
          codepoint <- peek codepointPtr
          n <- peek destOffPtr
-          chunkText <- mkText dest n
+          chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+                                         return $! textP arr 0 (fromIntegral n)
          let left      = lastPtr `minusPtr` curPtr
              remaining = B.drop left bs
-              accum = if T.null chunkText then B.append old remaining  else remaining 
+          return $! (chunkText, remaining)
-          return $ case state of 
+{-# INLINE decodeSomeUtf8 #-}
-            UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
-            _ ->           Some  chunkText accum (decodeChunk accum codepoint state)
+-- decodeSomeUtf8 :: ByteString -> (Text, ByteString)
+-- decodeSomeUtf8 bs@(PS fp off len) = 
+--                   runST $ do marray <- A.new (len+1) 
+--                              unsafeIOToST (decodeChunkToBuffer marray)
+--   
+--      where
+--      decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString)
+--      decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+--        with (0::CSize)        $ \destOffPtr ->
+--        with (0::CodePoint)    $ \codepointPtr ->
+--        with (0::DecoderState) $ \statePtr ->
+--        with nullPtr           $ \curPtrPtr ->
+--          do let end = ptr `plusPtr` (off + len)
+--                 curPtr = ptr `plusPtr` off
+--             poke curPtrPtr curPtr
+--             c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+--             state <- peek statePtr
+--             lastPtr <- peek curPtrPtr
+--             codepoint <- peek codepointPtr
+--             n <- peek destOffPtr
+--             chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+--                                            return $! textP arr 0 (fromIntegral n)
+--             let left      = lastPtr `minusPtr` curPtr
+--                 remaining = B.drop left bs
+--             return $! (chunkText, remaining)
+--      {-# INLINE decodeChunkToBuffer #-}
+-- {-# INLINE decodeSomeUtf8 #-}
 mkText :: A.MArray s -> CSize -> IO Text
 mkText dest n =  unsafeSTToIO $ do arr <- A.unsafeFreeze dest
                                   return $! textP arr 0 (fromIntegral n)
+{-# INLINE mkText #-}
 ord :: Char -> Int
 ord (C# c#) = I# (ord# c#)
@@ -107,6 +154,7 @@ unsafeWrite marr i c
          lo = fromIntegral $ (m `shiftR` 10) + 0xD800
          hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
          shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
+          {-# INLINE shiftR #-}
 {-# INLINE unsafeWrite #-}
 foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
diff --git a/bench/IO.hs b/bench/IO.hs
new file mode 100644
index 0000000..b3a52f6
--- /dev/null
+++ b/bench/IO.hs
@@ -0,0 +1,20 @@
+import qualified Data.Text.IO as T
+import qualified Data.Text as T
+import qualified Data.Text.Lazy.IO as TL
+import qualified Data.Text.Lazy as TL
+import Pipes
+import qualified Pipes.Text as TP
+import qualified Pipes.ByteString as BP
+import Pipes.Safe
+main = textaction
+big = "../../examples/txt/words2.txt"
+textaction = T.readFile big >>= T.putStrLn
+pipeaction =  runEffect $ for ((TP.readFile big) >> return ()) (lift . T.putStrLn)
diff --git a/cbits/cbits.c b/cbits/cbits.c
index e0fdfd5..c11645b 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -79,30 +79,38 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
 *      state0 != UTF8_ACCEPT, UTF8_REJECT
 *
 */
-const uint8_t *
-_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
+ #if defined(__GNUC__) || defined(__clang__)
-                           const uint8_t **const src,
+ static inline uint8_t const *
-                           const uint8_t *const srcend,
+ _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
-                           uint32_t *codepoint0, uint32_t *state0)
+                         const uint8_t const **src, const uint8_t const *srcend,
+                         uint32_t *codepoint0, uint32_t *state0)
+   __attribute((always_inline));
+ #endif
+static inline uint8_t const *
+_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+                         const uint8_t const **src, const uint8_t const *srcend,
+                         uint32_t *codepoint0, uint32_t *state0)
 {
-  uint16_t *d = dest + *destoff;
+ uint16_t *d = dest + *destoff;
-  const uint8_t *s = *src, *last = *src;
+ const uint8_t *s = *src, *last = *src;
-  uint32_t state = *state0;
+ uint32_t state = *state0;
-  uint32_t codepoint = *codepoint0;
+ uint32_t codepoint = *codepoint0;
-  while (s < srcend) {
+ while (s < srcend) {
 #if defined(__i386__) || defined(__x86_64__)
-    /*
+   /*
-     * This code will only work on a little-endian system that
+    * This code will only work on a little-endian system that
-     * supports unaligned loads.
+    * supports unaligned loads.
-     *
+    *
-     * It gives a substantial speed win on data that is purely or
+    * It gives a substantial speed win on data that is purely or
-     * partly ASCII (e.g. HTML), at only a slight cost on purely
+    * partly ASCII (e.g. HTML), at only a slight cost on purely
-     * non-ASCII text.
+    * non-ASCII text.
-     */
+    */
-    if (state == UTF8_ACCEPT) {
+   if (state == UTF8_ACCEPT) {
-      while (s < srcend - 4) {
+     while (s < srcend - 4) {
        codepoint = *((uint32_t *) s);
        if ((codepoint & 0x80808080) != 0)
          break;
@@ -117,35 +125,44 @@ _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
        *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
        *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
        *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
-      }
+     }
-      last = s;
+     last = s;
-    }
+   }
 #endif
-    if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
+   if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
-      if (state != UTF8_REJECT)
+     if (state != UTF8_REJECT)
        continue;
-      break;
+     break;
-    }
+   }
-    if (codepoint <= 0xffff)
+   if (codepoint <= 0xffff)
-      *d++ = (uint16_t) codepoint;
+     *d++ = (uint16_t) codepoint;
-    else {
+   else {
-      *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
+     *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
-      *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
+     *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
-    }
+   }
-    last = s;
+   last = s;
-  }
+ }
-  /* Invalid encoding, back up to the errant character */
+ *destoff = d - dest;
-  if (state == UTF8_REJECT)
+ *codepoint0 = codepoint;
-    s -= 1;
+ *state0 = state;
+ *src = last;
-  *destoff = d - dest;
-  *codepoint0 = codepoint;
+ return s;
-  *state0 = state;
+}
-  *src = last;
+uint8_t const *
-  return s;
+_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
+                          const uint8_t const **src,
+                           const uint8_t const *srcend,
+                          uint32_t *codepoint0, uint32_t *state0)
+{
+ uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
+                                                codepoint0, state0);
+ if (*state0 == UTF8_REJECT)
+   ret -=1;
+ return ret;
 }
diff --git a/pipes-text.cabal b/pipes-text.cabal
index 86fbab8..b4388be 100644
--- a/pipes-text.cabal
+++ b/pipes-text.cabal
@@ -25,6 +25,9 @@ library
                       pipes-bytestring >= 1.0 && < 1.2,
                       transformers >= 0.3     && < 0.4,
                       text >=0.11             && < 0.12,
-                       bytestring >=0.10       && < 0.11
+                       bytestring >=0.10       && < 0.11,
+                       vector,
+                       void
  -- hs-source-dirs:      
-  default-language:    Haskell2010
-\ No newline at end of file
+  default-language:    Haskell2010
+  ghc-options: -O2 
diff --git a/test/Test.hs b/test/Test.hs
index f2bf17b..373bafb 100644
--- a/test/Test.hs
+++ b/test/Test.hs
@@ -27,11 +27,11 @@ import qualified Pipes as P
 main :: IO ()
 main = defaultMain [tests]
 -- >>> :main  -a 10000
 tests = testGroup "stream_decode" [
  -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
  testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed ,
-  testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe]
+  testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe,
+  testProperty "t_utf8_dec_some" t_utf8_dec_some]
 t_utf8_incr_valid  = do
        Positive n <- arbitrary
@@ -82,6 +82,19 @@ t_utf8_incr_pipe  = do
    chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
    appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
+--
+t_utf8_dec_some = do    
+       Positive  m <- arbitrary
+       txt         <- genUnicode
+       let bytesLength = mod 10 m :: Int
+       forAll (vector bytesLength) $ 
+              (roundtrip . appendBytes txt) 
+              `eq` 
+              appendBytes txt
+    where 
+    roundtrip bs = case PE.decodeSomeUtf8 bs of
+                        (txt,bys) -> E.encodeUtf8 txt <> bys
+    appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append

diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 74d2023..cf493e9 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs
@@ -1,4 +1,4 @@
1	{-# LANGUAGE RankNTypes, TypeFamilies #-}	1	{-# LANGUAGE RankNTypes, TypeFamilies, BangPatterns #-}
2		2
3	{-\| This module provides @pipes@ utilities for \"text streams\", which are	3	{-\| This module provides @pipes@ utilities for \"text streams\", which are
4	streams of 'Text' chunks. The individual chunks are uniformly @strict@, but	4	streams of 'Text' chunks. The individual chunks are uniformly @strict@, but
@@ -206,30 +206,36 @@ import Prelude hiding (
206	-- \| Convert a lazy 'TL.Text' into a 'Producer' of strict 'Text's	206	-- \| Convert a lazy 'TL.Text' into a 'Producer' of strict 'Text's
207	fromLazy :: (Monad m) => TL.Text -> Producer' Text m ()	207	fromLazy :: (Monad m) => TL.Text -> Producer' Text m ()
208	fromLazy = foldrChunks (\e a -> yield e >> a) (return ())	208	fromLazy = foldrChunks (\e a -> yield e >> a) (return ())
209	{-# INLINABLE fromLazy #-}	209	{-# INLINE fromLazy #-}
210		210
211	-- \| Stream text from 'stdin'	211	-- \| Stream text from 'stdin'
212	stdin :: MonadIO m => Producer Text m (Producer ByteString m ())	212	stdin :: MonadIO m => Producer Text m ()
213	stdin = fromHandle IO.stdin	213	stdin = fromHandle IO.stdin
214	{-# INLINABLE stdin #-}	214	{-# INLINE stdin #-}
215		215
216	{-\| Convert a 'IO.Handle' into a text stream using a text size	216	{-\| Convert a 'IO.Handle' into a text stream using a text size
217	determined by the good sense of the text library.	217	determined by the good sense of the text library; note that this
		218	is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@
		219	but uses the system encoding and has other `Data.Text.IO` features
218	-}	220	-}
219		221
220	fromHandle :: MonadIO m => IO.Handle -> Producer Text m (Producer ByteString m ())	222	fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()
221	fromHandle h = decodeUtf8 (PB.fromHandle h)	223	fromHandle h = go where
222	{-# INLINE fromHandle#-}	224	go = do txt <- liftIO (T.hGetChunk h)
		225	unless (T.null txt) $ do yield txt
		226	go
		227	{-# INLINABLE fromHandle#-}
223		228
224	{-\| Stream text from a file using Pipes.Safe	229
		230	{-\| Stream text from a file in the simple fashion of @Data.Text.IO@
225		231
226	>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout	232	>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
227	MAIN = PUTSTRLN "HELLO WORLD"	233	MAIN = PUTSTRLN "HELLO WORLD"
228	-}	234	-}
229		235
230	readFile :: (MonadSafe m) => FilePath -> Producer Text m (Producer ByteString m ())	236	readFile :: MonadSafe m => FilePath -> Producer Text m ()
231	readFile file = Safe.withFile file IO.ReadMode fromHandle	237	readFile file = Safe.withFile file IO.ReadMode fromHandle
232	{-# INLINABLE readFile #-}	238	{-# INLINE readFile #-}
233		239
234	{-\| Stream lines of text from stdin (for testing in ghci etc.)	240	{-\| Stream lines of text from stdin (for testing in ghci etc.)
235		241
@@ -249,7 +255,7 @@ stdinLn = go where
249	txt <- liftIO (T.hGetLine IO.stdin)	255	txt <- liftIO (T.hGetLine IO.stdin)
250	yield txt	256	yield txt
251	go	257	go
252		258	{-# INLINABLE stdinLn #-}
253		259
254	{-\| Stream text to 'stdout'	260	{-\| Stream text to 'stdout'
255		261
@@ -305,6 +311,7 @@ toHandle h = for cat (liftIO . T.hPutStr h)
305	-- \| Stream text into a file. Uses @pipes-safe@.	311	-- \| Stream text into a file. Uses @pipes-safe@.
306	writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()	312	writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
307	writeFile file = Safe.withFile file IO.WriteMode toHandle	313	writeFile file = Safe.withFile file IO.WriteMode toHandle
		314	{-# INLINE writeFile #-}
308		315
309	-- \| Apply a transformation to each 'Char' in the stream	316	-- \| Apply a transformation to each 'Char' in the stream
310	map :: (Monad m) => (Char -> Char) -> Pipe Text Text m r	317	map :: (Monad m) => (Char -> Char) -> Pipe Text Text m r
@@ -592,10 +599,10 @@ count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
592		599
593	decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)	600	decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
594	decodeUtf8 = go B.empty PE.streamDecodeUtf8 where	601	decodeUtf8 = go B.empty PE.streamDecodeUtf8 where
595	go carry dec0 p = do	602	go !carry dec0 p = do
596	x <- lift (next p)	603	x <- lift (next p)
597	case x of Left r -> if B.null carry	604	case x of Left r -> if B.null carry
598	then return (return r) -- all input was consumed	605	then return (return r) -- all bytestrinput was consumed
599	else return (do yield carry -- a potentially valid fragment remains	606	else return (do yield carry -- a potentially valid fragment remains
600	return r)	607	return r)
601		608
@@ -605,6 +612,8 @@ decodeUtf8 = go B.empty PE.streamDecodeUtf8 where
605	PE.Other text bs -> do yield text	612	PE.Other text bs -> do yield text
606	return (do yield bs -- an invalid blob remains	613	return (do yield bs -- an invalid blob remains
607	p')	614	p')
		615	{-# INLINABLE decodeUtf8 #-}
		616
608		617
609	-- \| Splits a 'Producer' after the given number of characters	618	-- \| Splits a 'Producer' after the given number of characters
610	splitAt	619	splitAt


diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index 73d6fa4..7e5b044 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs
@@ -6,36 +6,25 @@
6	module Pipes.Text.Internal	6	module Pipes.Text.Internal
7	( Decoding(..)	7	( Decoding(..)
8	, streamDecodeUtf8	8	, streamDecodeUtf8
		9	, decodeSomeUtf8
9	) where	10	) where
10
11	import Control.Exception (evaluate, try)
12	#if __GLASGOW_HASKELL__ >= 702
13	import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)	11	import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
14	import Control.Monad.ST (ST, runST)	12	import Control.Monad.ST (ST, runST)
15	#else
16	import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
17	#endif
18	import Data.Bits ((.&.))	13	import Data.Bits ((.&.))
19	import Data.ByteString as B	14	import Data.ByteString as B
20	import Data.ByteString.Internal as B	15	import Data.ByteString.Internal as B
21	import Data.Text ()	16	import qualified Data.Text as T (null)
22	import qualified Data.Text as T	17	import Data.Text.Encoding.Error ()
23	import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)	18	import Data.Text.Internal (Text, textP)
24	import Data.Text.Internal (Text(..), safe, textP)
25	import Data.Word (Word8, Word32)
26	import Foreign.C.Types (CSize)	19	import Foreign.C.Types (CSize)
27	import Foreign.ForeignPtr (withForeignPtr)	20	import Foreign.ForeignPtr (withForeignPtr)
28	import Foreign.Marshal.Utils (with)	21	import Foreign.Marshal.Utils (with)
29	import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)	22	import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
30	import Foreign.Storable (Storable, peek, poke)	23	import Foreign.Storable (Storable, peek, poke)
31	import GHC.Base hiding (ord)	24	import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
32	import GHC.Word	25	import GHC.Word (Word8, Word32)
33	import qualified Data.Text.Array as A	26	import qualified Data.Text.Array as A
34	import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)	27
35	import GHC.Word (Word8(..), Word16(..), Word32(..))
36
37	import Data.Text.Unsafe (unsafeDupablePerformIO)
38
39	#include "pipes_text_cbits.h"	28	#include "pipes_text_cbits.h"
40		29
41	-- \| A stream oriented decoding result.	30	-- \| A stream oriented decoding result.
@@ -52,44 +41,102 @@ instance Show Decoding where
52	showChar ' ' . showsPrec prec' bs .	41	showChar ' ' . showsPrec prec' bs .
53	showString " _"	42	showString " _"
54	where prec = 10; prec' = prec + 1	43	where prec = 10; prec' = prec + 1
55		44
56	newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)	45	newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
57	newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)	46	newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
58		47
59	streamDecodeUtf8 :: ByteString -> Decoding	48	streamDecodeUtf8 :: ByteString -> Decoding
60	streamDecodeUtf8 = decodeChunk B.empty 0 0	49	streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
		50	where
		51	decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
		52	decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
		53	runST $ do marray <- A.new (len+1)
		54	unsafeIOToST (decodeChunkToBuffer marray)
		55	where
		56	decodeChunkToBuffer :: A.MArray s -> IO Decoding
		57	decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
		58	with (0::CSize) $ \destOffPtr ->
		59	with codepoint0 $ \codepointPtr ->
		60	with state0 $ \statePtr ->
		61	with nullPtr $ \curPtrPtr ->
		62	do let end = ptr `plusPtr` (off + len)
		63	curPtr = ptr `plusPtr` off
		64	poke curPtrPtr curPtr
		65	c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
		66	state <- peek statePtr
		67	lastPtr <- peek curPtrPtr
		68	codepoint <- peek codepointPtr
		69	n <- peek destOffPtr
		70	chunkText <- mkText dest n
		71	let left = lastPtr `minusPtr` curPtr
		72	remaining = B.drop left bs
		73	accum = if T.null chunkText then B.append old remaining else remaining
		74	return $! case state of
		75	UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
		76	_ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
		77	{-# INLINE decodeChunkToBuffer #-}
		78	{-# INLINE decodeChunkUtf8 #-}
		79	{-# INLINE streamDecodeUtf8 #-}
61		80
62	decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding	81	decodeSomeUtf8 :: ByteString -> (Text, ByteString)
63	decodeChunk old codepoint0 state0 bs@(PS fp off len) =	82	decodeSomeUtf8 bs@(PS fp off len) = runST $ do
64	runST $ do marray <- A.new (len+1)	83	dest <- A.new (len+1)
65	unsafeIOToST (decodeChunkToBuffer marray)	84	unsafeIOToST $
66	where	85	withForeignPtr fp $ \ptr ->
67	decodeChunkToBuffer :: A.MArray s -> IO Decoding	86	with (0::CSize) $ \destOffPtr ->
68	decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->	87	with (0::CodePoint) $ \codepointPtr ->
69	with (0::CSize) $ \destOffPtr ->	88	with (0::DecoderState) $ \statePtr ->
70	with codepoint0 $ \codepointPtr ->	89	with nullPtr $ \curPtrPtr ->
71	with state0 $ \statePtr ->
72	with nullPtr $ \curPtrPtr ->
73	do let end = ptr `plusPtr` (off + len)	90	do let end = ptr `plusPtr` (off + len)
74	curPtr = ptr `plusPtr` off	91	curPtr = ptr `plusPtr` off
75	poke curPtrPtr curPtr	92	poke curPtrPtr curPtr
76	c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr	93	c_decode_utf8_with_state (A.maBA dest) destOffPtr
		94	curPtrPtr end codepointPtr statePtr
77	state <- peek statePtr	95	state <- peek statePtr
78	lastPtr <- peek curPtrPtr	96	lastPtr <- peek curPtrPtr
79	codepoint <- peek codepointPtr	97	codepoint <- peek codepointPtr
80	n <- peek destOffPtr	98	n <- peek destOffPtr
81	chunkText <- mkText dest n	99	chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
		100	return $! textP arr 0 (fromIntegral n)
82	let left = lastPtr `minusPtr` curPtr	101	let left = lastPtr `minusPtr` curPtr
83	remaining = B.drop left bs	102	remaining = B.drop left bs
84	accum = if T.null chunkText then B.append old remaining else remaining	103	return $! (chunkText, remaining)
85	return $ case state of	104	{-# INLINE decodeSomeUtf8 #-}
86	UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error	105
87	_ -> Some chunkText accum (decodeChunk accum codepoint state)	106	-- decodeSomeUtf8 :: ByteString -> (Text, ByteString)
		107	-- decodeSomeUtf8 bs@(PS fp off len) =
		108	-- runST $ do marray <- A.new (len+1)
		109	-- unsafeIOToST (decodeChunkToBuffer marray)
		110	--
		111	-- where
		112	-- decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString)
		113	-- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
		114	-- with (0::CSize) $ \destOffPtr ->
		115	-- with (0::CodePoint) $ \codepointPtr ->
		116	-- with (0::DecoderState) $ \statePtr ->
		117	-- with nullPtr $ \curPtrPtr ->
		118	-- do let end = ptr `plusPtr` (off + len)
		119	-- curPtr = ptr `plusPtr` off
		120	-- poke curPtrPtr curPtr
		121	-- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
		122	-- state <- peek statePtr
		123	-- lastPtr <- peek curPtrPtr
		124	-- codepoint <- peek codepointPtr
		125	-- n <- peek destOffPtr
		126	-- chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
		127	-- return $! textP arr 0 (fromIntegral n)
		128	-- let left = lastPtr `minusPtr` curPtr
		129	-- remaining = B.drop left bs
		130	-- return $! (chunkText, remaining)
		131	-- {-# INLINE decodeChunkToBuffer #-}
		132	-- {-# INLINE decodeSomeUtf8 #-}
		133
88		134
89		135
90	mkText :: A.MArray s -> CSize -> IO Text	136	mkText :: A.MArray s -> CSize -> IO Text
91	mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest	137	mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
92	return $! textP arr 0 (fromIntegral n)	138	return $! textP arr 0 (fromIntegral n)
		139	{-# INLINE mkText #-}
93		140
94	ord :: Char -> Int	141	ord :: Char -> Int
95	ord (C# c#) = I# (ord# c#)	142	ord (C# c#) = I# (ord# c#)
@@ -107,6 +154,7 @@ unsafeWrite marr i c
107	lo = fromIntegral $ (m `shiftR` 10) + 0xD800	154	lo = fromIntegral $ (m `shiftR` 10) + 0xD800
108	hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00	155	hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
109	shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)	156	shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
		157	{-# INLINE shiftR #-}
110	{-# INLINE unsafeWrite #-}	158	{-# INLINE unsafeWrite #-}
111		159
112	foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state	160	foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state


diff --git a/bench/IO.hs b/bench/IO.hs new file mode 100644 index 0000000..b3a52f6 --- /dev/null +++ b/bench/IO.hs
@@ -0,0 +1,20 @@
		1	import qualified Data.Text.IO as T
		2	import qualified Data.Text as T
		3	import qualified Data.Text.Lazy.IO as TL
		4	import qualified Data.Text.Lazy as TL
		5
		6	import Pipes
		7	import qualified Pipes.Text as TP
		8	import qualified Pipes.ByteString as BP
		9	import Pipes.Safe
		10
		11	main = textaction
		12	big = "../../examples/txt/words2.txt"
		13
		14	textaction = T.readFile big >>= T.putStrLn
		15	pipeaction = runEffect $ for ((TP.readFile big) >> return ()) (lift . T.putStrLn)
		16
		17
		18
		19
		20


diff --git a/cbits/cbits.c b/cbits/cbits.c index e0fdfd5..c11645b 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c
@@ -79,30 +79,38 @@ decode(uint32_t state, uint32_t codep, uint32_t byte) {
79	* state0 != UTF8_ACCEPT, UTF8_REJECT	79	* state0 != UTF8_ACCEPT, UTF8_REJECT
80	*	80	*
81	*/	81	*/
82	const uint8_t *	82
83	_hs_pipes_text_decode_utf8_state(uint16_t const dest, size_t destoff,	83	#if defined(__GNUC__) \|\| defined(__clang__)
84	const uint8_t **const src,	84	static inline uint8_t const *
85	const uint8_t *const srcend,	85	_hs_pipes_text_decode_utf8_int(uint16_t const dest, size_t destoff,
86	uint32_t codepoint0, uint32_t state0)	86	const uint8_t const *src, const uint8_t const srcend,
		87	uint32_t codepoint0, uint32_t state0)
		88	__attribute((always_inline));
		89	#endif
		90
		91	static inline uint8_t const *
		92	_hs_pipes_text_decode_utf8_int(uint16_t const dest, size_t destoff,
		93	const uint8_t const *src, const uint8_t const srcend,
		94	uint32_t codepoint0, uint32_t state0)
87	{	95	{
88	uint16_t d = dest + destoff;	96	uint16_t d = dest + destoff;
89	const uint8_t s = src, last = src;	97	const uint8_t s = src, last = src;
90	uint32_t state = *state0;	98	uint32_t state = *state0;
91	uint32_t codepoint = *codepoint0;	99	uint32_t codepoint = *codepoint0;
92		100
93	while (s < srcend) {	101	while (s < srcend) {
94	#if defined(__i386__) \|\| defined(__x86_64__)	102	#if defined(__i386__) \|\| defined(__x86_64__)
95	/*	103	/*
96	* This code will only work on a little-endian system that	104	* This code will only work on a little-endian system that
97	* supports unaligned loads.	105	* supports unaligned loads.
98	*	106	*
99	* It gives a substantial speed win on data that is purely or	107	* It gives a substantial speed win on data that is purely or
100	* partly ASCII (e.g. HTML), at only a slight cost on purely	108	* partly ASCII (e.g. HTML), at only a slight cost on purely
101	* non-ASCII text.	109	* non-ASCII text.
102	*/	110	*/
103		111
104	if (state == UTF8_ACCEPT) {	112	if (state == UTF8_ACCEPT) {
105	while (s < srcend - 4) {	113	while (s < srcend - 4) {
106	codepoint = ((uint32_t ) s);	114	codepoint = ((uint32_t ) s);
107	if ((codepoint & 0x80808080) != 0)	115	if ((codepoint & 0x80808080) != 0)
108	break;	116	break;
@@ -117,35 +125,44 @@ _hs_pipes_text_decode_utf8_state(uint16_t const dest, size_t destoff,
117	*d++ = (uint16_t) ((codepoint >> 8) & 0xff);	125	*d++ = (uint16_t) ((codepoint >> 8) & 0xff);
118	*d++ = (uint16_t) ((codepoint >> 16) & 0xff);	126	*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
119	*d++ = (uint16_t) ((codepoint >> 24) & 0xff);	127	*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
120	}	128	}
121	last = s;	129	last = s;
122	}	130	}
123	#endif	131	#endif
124		132
125	if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {	133	if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
126	if (state != UTF8_REJECT)	134	if (state != UTF8_REJECT)
127	continue;	135	continue;
128	break;	136	break;
129	}	137	}
130		138
131	if (codepoint <= 0xffff)	139	if (codepoint <= 0xffff)
132	*d++ = (uint16_t) codepoint;	140	*d++ = (uint16_t) codepoint;
133	else {	141	else {
134	*d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));	142	*d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
135	*d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));	143	*d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
136	}	144	}
137	last = s;	145	last = s;
138	}	146	}
139		147
140	/* Invalid encoding, back up to the errant character */	148	*destoff = d - dest;
141	if (state == UTF8_REJECT)	149	*codepoint0 = codepoint;
142	s -= 1;	150	*state0 = state;
143		151	*src = last;
144	*destoff = d - dest;	152
145	*codepoint0 = codepoint;	153	return s;
146	*state0 = state;	154	}
147	*src = last;	155
148		156	uint8_t const *
149	return s;	157	_hs_pipes_text_decode_utf8_state(uint16_t const dest, size_t destoff,
		158	const uint8_t const **src,
		159	const uint8_t const *srcend,
		160	uint32_t codepoint0, uint32_t state0)
		161	{
		162	uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
		163	codepoint0, state0);
		164	if (*state0 == UTF8_REJECT)
		165	ret -=1;
		166	return ret;
150	}	167	}
151		168


diff --git a/pipes-text.cabal b/pipes-text.cabal index 86fbab8..b4388be 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal
@@ -25,6 +25,9 @@ library
25	pipes-bytestring >= 1.0 && < 1.2,	25	pipes-bytestring >= 1.0 && < 1.2,
26	transformers >= 0.3 && < 0.4,	26	transformers >= 0.3 && < 0.4,
27	text >=0.11 && < 0.12,	27	text >=0.11 && < 0.12,
28	bytestring >=0.10 && < 0.11	28	bytestring >=0.10 && < 0.11,
		29	vector,
		30	void
29	-- hs-source-dirs:	31	-- hs-source-dirs:
30	default-language: Haskell2010 \ No newline at end of file	32	default-language: Haskell2010
		33	ghc-options: -O2


diff --git a/test/Test.hs b/test/Test.hs index f2bf17b..373bafb 100644 --- a/test/Test.hs +++ b/test/Test.hs
@@ -27,11 +27,11 @@ import qualified Pipes as P
27	main :: IO ()	27	main :: IO ()
28	main = defaultMain [tests]	28	main = defaultMain [tests]
29	-- >>> :main -a 10000	29	-- >>> :main -a 10000
30
31	tests = testGroup "stream_decode" [	30	tests = testGroup "stream_decode" [
32	-- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,	31	-- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
33	testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed ,	32	testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed ,
34	testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe]	33	testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe,
		34	testProperty "t_utf8_dec_some" t_utf8_dec_some]
35		35
36	t_utf8_incr_valid = do	36	t_utf8_incr_valid = do
37	Positive n <- arbitrary	37	Positive n <- arbitrary
@@ -82,6 +82,19 @@ t_utf8_incr_pipe = do
82	chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b	82	chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
83	appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append	83	appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
84		84
		85	--
		86	t_utf8_dec_some = do
		87	Positive m <- arbitrary
		88	txt <- genUnicode
		89	let bytesLength = mod 10 m :: Int
		90	forAll (vector bytesLength) $
		91	(roundtrip . appendBytes txt)
		92	`eq`
		93	appendBytes txt
		94	where
		95	roundtrip bs = case PE.decodeSomeUtf8 bs of
		96	(txt,bys) -> E.encodeUtf8 txt <> bys
		97	appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
85		98
86		99
87		100