Use clunky Data.Text.IO when bytestring is not explicit

author michaelt <what_is_it_to_do_anything@yahoo.com>

Wed, 15 Jan 2014 03:05:12 +0000 (22:05 -0500)

committer michaelt <what_is_it_to_do_anything@yahoo.com>

Wed, 15 Jan 2014 03:05:12 +0000 (22:05 -0500)
author michaelt <what_is_it_to_do_anything@yahoo.com>
Wed, 15 Jan 2014 03:05:12 +0000 (22:05 -0500)
committer michaelt <what_is_it_to_do_anything@yahoo.com>
Wed, 15 Jan 2014 03:05:12 +0000 (22:05 -0500)
diff --git a/Pipes/Text.hs b/Pipes/Text.hs

index 74d2023a7265552404cf4ee656e2559084362f66..cf493e9ab27b90d3c52d46f8296024d7ed8225c7 100644 (file)
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE RankNTypes, TypeFamilies #-}
+{-# LANGUAGE RankNTypes, TypeFamilies, BangPatterns #-}
  
  {-| This module provides @pipes@ utilities for \"text streams\", which are
      streams of 'Text' chunks.  The individual chunks are uniformly @strict@, but 
@@ -206,30 +206,36 @@ import Prelude hiding (
  -- | Convert a lazy 'TL.Text' into a 'Producer' of strict 'Text's
  fromLazy :: (Monad m) => TL.Text -> Producer' Text m ()
  fromLazy  = foldrChunks (\e a -> yield e >> a) (return ()) 
-{-# INLINABLE fromLazy #-}
+{-# INLINE fromLazy #-}
  
  -- | Stream text from 'stdin'
-stdin :: MonadIO m => Producer Text m (Producer ByteString m ())
+stdin :: MonadIO m => Producer Text m ()
  stdin = fromHandle IO.stdin
-{-# INLINABLE stdin #-}
+{-# INLINE stdin #-}
  
  {-| Convert a 'IO.Handle' into a text stream using a text size 
-    determined by the good sense of the text library. 
+    determined by the good sense of the text library; note that this
+    is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@
+    but uses the system encoding and has other `Data.Text.IO` features
  -}
  
-fromHandle :: MonadIO m => IO.Handle -> Producer Text m (Producer ByteString m ())
-fromHandle h = decodeUtf8 (PB.fromHandle h)
-{-# INLINE fromHandle#-}
+fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()
+fromHandle h =  go where
+      go = do txt <- liftIO (T.hGetChunk h)
+              unless (T.null txt) $ do yield txt
+                                       go
+{-# INLINABLE fromHandle#-}
  
-{-| Stream text from a file using Pipes.Safe
+
+{-| Stream text from a file in the simple fashion of @Data.Text.IO@ 
  
  >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
  MAIN = PUTSTRLN "HELLO WORLD"
  -}
  
-readFile :: (MonadSafe m) => FilePath -> Producer Text m (Producer ByteString m ())
+readFile :: MonadSafe m => FilePath -> Producer Text m ()
  readFile file = Safe.withFile file IO.ReadMode fromHandle
-{-# INLINABLE readFile #-}
+{-# INLINE readFile #-}
  
  {-| Stream lines of text from stdin (for testing in ghci etc.) 
  
@@ -249,7 +255,7 @@ stdinLn = go where
              txt <- liftIO (T.hGetLine IO.stdin)
              yield txt
              go
-
+{-# INLINABLE stdinLn #-}
  
  {-| Stream text to 'stdout'
  
@@ -305,6 +311,7 @@ toHandle h = for cat (liftIO . T.hPutStr h)
  -- | Stream text into a file. Uses @pipes-safe@.
  writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
  writeFile file = Safe.withFile file IO.WriteMode toHandle
+{-# INLINE writeFile #-}
  
  -- | Apply a transformation to each 'Char' in the stream
  map :: (Monad m) => (Char -> Char) -> Pipe Text Text m r
@@ -592,10 +599,10 @@ count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
  
  decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
  decodeUtf8 = go B.empty PE.streamDecodeUtf8 where
-  go carry dec0 p = do 
+  go !carry dec0 p = do 
       x <- lift (next p) 
       case x of Left r -> if B.null carry 
-                           then return (return r)      -- all input was consumed
+                           then return (return r)      -- all bytestrinput was consumed
                             else return (do yield carry -- a potentially valid fragment remains
                                             return r)
                                             
@@ -605,6 +612,8 @@ decodeUtf8 = go B.empty PE.streamDecodeUtf8 where
                     PE.Other text bs -> do yield text 
                                            return (do yield bs -- an invalid blob remains
                                                       p')
+{-# INLINABLE decodeUtf8 #-}
+
  
  -- | Splits a 'Producer' after the given number of characters
  splitAt
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs

index 73d6fa487e4b4fb4491c7f680fd4dbc0f91b037c..7e5b04491dba0c7fa2638a400f5b34eb076e1057 100644 (file)
--- a/Pipes/Text/Internal.hs
+++ b/Pipes/Text/Internal.hs
@@ -6,36 +6,25 @@
  module Pipes.Text.Internal 
      ( Decoding(..)
      , streamDecodeUtf8
+    , decodeSomeUtf8
      ) where
-
-import Control.Exception (evaluate, try)
-#if __GLASGOW_HASKELL__ >= 702
  import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
  import Control.Monad.ST (ST, runST)
-#else
-import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
-#endif
  import Data.Bits ((.&.))
-import Data.ByteString as B
-import Data.ByteString.Internal as B
-import Data.Text ()
-import qualified Data.Text as T
-import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
-import Data.Text.Internal (Text(..), safe, textP)
-import Data.Word (Word8, Word32)
+import Data.ByteString as B 
+import Data.ByteString.Internal as B 
+import qualified Data.Text as T (null)
+import Data.Text.Encoding.Error ()
+import Data.Text.Internal (Text, textP)
  import Foreign.C.Types (CSize)
  import Foreign.ForeignPtr (withForeignPtr)
  import Foreign.Marshal.Utils (with)
  import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
  import Foreign.Storable (Storable, peek, poke)
-import GHC.Base hiding (ord)
-import GHC.Word
+import GHC.Base  (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
+import GHC.Word (Word8, Word32)
  import qualified Data.Text.Array as A
-import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
-import GHC.Word (Word8(..), Word16(..), Word32(..))
-
-import Data.Text.Unsafe (unsafeDupablePerformIO)
-
+    
  #include "pipes_text_cbits.h"
  
  -- | A stream oriented decoding result.
@@ -52,44 +41,102 @@ instance Show Decoding where
                                  showChar ' ' . showsPrec prec' bs .
                                  showString " _"
        where prec = 10; prec' = prec + 1
-      
+
  newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
  newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
  
  streamDecodeUtf8 :: ByteString -> Decoding
-streamDecodeUtf8 = decodeChunk B.empty 0 0
+streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 
+  where
+  decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
+  decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = 
+                    runST $ do marray <- A.new (len+1) 
+                               unsafeIOToST (decodeChunkToBuffer marray)
+     where
+     decodeChunkToBuffer :: A.MArray s -> IO Decoding
+     decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+       with (0::CSize) $ \destOffPtr ->
+       with codepoint0 $ \codepointPtr ->
+       with state0     $ \statePtr ->
+       with nullPtr    $ \curPtrPtr ->
+         do let end = ptr `plusPtr` (off + len)
+                curPtr = ptr `plusPtr` off
+            poke curPtrPtr curPtr
+            c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+            state <- peek statePtr
+            lastPtr <- peek curPtrPtr
+            codepoint <- peek codepointPtr
+            n <- peek destOffPtr
+            chunkText <- mkText dest n
+            let left      = lastPtr `minusPtr` curPtr
+                remaining = B.drop left bs
+                accum = if T.null chunkText then B.append old remaining  else remaining 
+            return $! case state of 
+              UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
+              _ ->           Some  chunkText accum (decodeChunkUtf8 accum codepoint state)
+     {-# INLINE decodeChunkToBuffer #-}
+  {-# INLINE decodeChunkUtf8 #-}
+{-# INLINE streamDecodeUtf8 #-}
  
-decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
-decodeChunk old codepoint0 state0 bs@(PS fp off len) = 
-                  runST $ do marray <- A.new (len+1) 
-                             unsafeIOToST (decodeChunkToBuffer marray)
-   where
-   decodeChunkToBuffer :: A.MArray s -> IO Decoding
-   decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
-     with (0::CSize) $ \destOffPtr ->
-     with codepoint0 $ \codepointPtr ->
-     with state0     $ \statePtr ->
-     with nullPtr    $ \curPtrPtr ->
+decodeSomeUtf8 :: ByteString -> (Text, ByteString)
+decodeSomeUtf8 bs@(PS fp off len) = runST $ do 
+  dest <- A.new (len+1) 
+  unsafeIOToST $ 
+     withForeignPtr fp $ \ptr ->
+     with (0::CSize)        $ \destOffPtr ->
+     with (0::CodePoint)    $ \codepointPtr ->
+     with (0::DecoderState) $ \statePtr ->
+     with nullPtr           $ \curPtrPtr ->
         do let end = ptr `plusPtr` (off + len)
                curPtr = ptr `plusPtr` off
            poke curPtrPtr curPtr
-          c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+          c_decode_utf8_with_state (A.maBA dest) destOffPtr 
+                                   curPtrPtr end codepointPtr statePtr
            state <- peek statePtr
            lastPtr <- peek curPtrPtr
            codepoint <- peek codepointPtr
            n <- peek destOffPtr
-          chunkText <- mkText dest n
+          chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+                                         return $! textP arr 0 (fromIntegral n)
            let left      = lastPtr `minusPtr` curPtr
                remaining = B.drop left bs
-              accum = if T.null chunkText then B.append old remaining  else remaining 
-          return $ case state of 
-            UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
-            _ ->           Some  chunkText accum (decodeChunk accum codepoint state)
+          return $! (chunkText, remaining)
+{-# INLINE decodeSomeUtf8 #-}
+
+-- decodeSomeUtf8 :: ByteString -> (Text, ByteString)
+-- decodeSomeUtf8 bs@(PS fp off len) = 
+--                   runST $ do marray <- A.new (len+1) 
+--                              unsafeIOToST (decodeChunkToBuffer marray)
+--   
+--      where
+--      decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString)
+--      decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+--        with (0::CSize)        $ \destOffPtr ->
+--        with (0::CodePoint)    $ \codepointPtr ->
+--        with (0::DecoderState) $ \statePtr ->
+--        with nullPtr           $ \curPtrPtr ->
+--          do let end = ptr `plusPtr` (off + len)
+--                 curPtr = ptr `plusPtr` off
+--             poke curPtrPtr curPtr
+--             c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+--             state <- peek statePtr
+--             lastPtr <- peek curPtrPtr
+--             codepoint <- peek codepointPtr
+--             n <- peek destOffPtr
+--             chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+--                                            return $! textP arr 0 (fromIntegral n)
+--             let left      = lastPtr `minusPtr` curPtr
+--                 remaining = B.drop left bs
+--             return $! (chunkText, remaining)
+--      {-# INLINE decodeChunkToBuffer #-}
+-- {-# INLINE decodeSomeUtf8 #-}
+
  
  
  mkText :: A.MArray s -> CSize -> IO Text
  mkText dest n =  unsafeSTToIO $ do arr <- A.unsafeFreeze dest
                                     return $! textP arr 0 (fromIntegral n)
+{-# INLINE mkText #-}
  
  ord :: Char -> Int
  ord (C# c#) = I# (ord# c#)
@@ -107,6 +154,7 @@ unsafeWrite marr i c
            lo = fromIntegral $ (m `shiftR` 10) + 0xD800
            hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
            shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
+          {-# INLINE shiftR #-}
  {-# INLINE unsafeWrite #-}
  
  foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
diff --git a/bench/IO.hs b/bench/IO.hs

new file mode 100644 (file)

index 0000000..b3a52f6
--- /dev/null
+++ b/bench/IO.hs
@@ -0,0 +1,20 @@
+import qualified Data.Text.IO as T
+import qualified Data.Text as T
+import qualified Data.Text.Lazy.IO as TL
+import qualified Data.Text.Lazy as TL
+
+import Pipes
+import qualified Pipes.Text as TP
+import qualified Pipes.ByteString as BP
+import Pipes.Safe
+
+main = textaction
+big = "../../examples/txt/words2.txt"
+
+textaction = T.readFile big >>= T.putStrLn
+pipeaction =  runEffect $ for ((TP.readFile big) >> return ()) (lift . T.putStrLn)
+
+
+
+
+
diff --git a/cbits/cbits.c b/cbits/cbits.c

index e0fdfd5a11a622ef68fd7c7106bbcf0a336ea13d..c11645b3bb067ca6158eeb98f2e101f34a3bf589 100644 (file)
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -79,30 +79,38 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
   *      state0 != UTF8_ACCEPT, UTF8_REJECT
   *
   */
-const uint8_t *
-_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
-                           const uint8_t **const src,
-                           const uint8_t *const srcend,
-                           uint32_t *codepoint0, uint32_t *state0)
+
+ #if defined(__GNUC__) || defined(__clang__)
+ static inline uint8_t const *
+ _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+                        const uint8_t const **src, const uint8_t const *srcend,
+                        uint32_t *codepoint0, uint32_t *state0)
+   __attribute((always_inline));
+ #endif
+
+static inline uint8_t const *
+_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+                        const uint8_t const **src, const uint8_t const *srcend,
+                        uint32_t *codepoint0, uint32_t *state0)
  {
-  uint16_t *d = dest + *destoff;
-  const uint8_t *s = *src, *last = *src;
-  uint32_t state = *state0;
-  uint32_t codepoint = *codepoint0;
+ uint16_t *d = dest + *destoff;
+ const uint8_t *s = *src, *last = *src;
+ uint32_t state = *state0;
+ uint32_t codepoint = *codepoint0;
  
-  while (s < srcend) {
+ while (s < srcend) {
  #if defined(__i386__) || defined(__x86_64__)
-    /*
-     * This code will only work on a little-endian system that
-     * supports unaligned loads.
-     *
-     * It gives a substantial speed win on data that is purely or
-     * partly ASCII (e.g. HTML), at only a slight cost on purely
-     * non-ASCII text.
-     */
-
-    if (state == UTF8_ACCEPT) {
-      while (s < srcend - 4) {
+   /*
+    * This code will only work on a little-endian system that
+    * supports unaligned loads.
+    *
+    * It gives a substantial speed win on data that is purely or
+    * partly ASCII (e.g. HTML), at only a slight cost on purely
+    * non-ASCII text.
+    */
+
+   if (state == UTF8_ACCEPT) {
+     while (s < srcend - 4) {
         codepoint = *((uint32_t *) s);
         if ((codepoint & 0x80808080) != 0)
           break;
@@ -117,35 +125,44 @@ _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
         *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
         *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
         *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
-      }
-      last = s;
-    }
+     }
+     last = s;
+   }
  #endif
  
-    if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
-      if (state != UTF8_REJECT)
+   if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
+     if (state != UTF8_REJECT)
         continue;
-      break;
-    }
-
-    if (codepoint <= 0xffff)
-      *d++ = (uint16_t) codepoint;
-    else {
-      *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
-      *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
-    }
-    last = s;
-  }
-
-  /* Invalid encoding, back up to the errant character */
-  if (state == UTF8_REJECT)
-    s -= 1;
-
-  *destoff = d - dest;
-  *codepoint0 = codepoint;
-  *state0 = state;
-  *src = last;
-
-  return s;
+     break;
+   }
+
+   if (codepoint <= 0xffff)
+     *d++ = (uint16_t) codepoint;
+   else {
+     *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
+     *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
+   }
+   last = s;
+ }
+
+ *destoff = d - dest;
+ *codepoint0 = codepoint;
+ *state0 = state;
+ *src = last;
+
+ return s;
+}
+
+uint8_t const *
+_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
+                          const uint8_t const **src,
+                          const uint8_t const *srcend,
+                          uint32_t *codepoint0, uint32_t *state0)
+{
+ uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
+                                               codepoint0, state0);
+ if (*state0 == UTF8_REJECT)
+   ret -=1;
+ return ret;
  }
  
diff --git a/pipes-text.cabal b/pipes-text.cabal

index 86fbab821593bcfcbd1a259ba815e280317ec753..b4388bef3b4c639841a2f9c81be5e1a2d48ef187 100644 (file)
--- a/pipes-text.cabal
+++ b/pipes-text.cabal
@@ -25,6 +25,9 @@ library
                         pipes-bytestring >= 1.0 && < 1.2,
                         transformers >= 0.3     && < 0.4,
                         text >=0.11             && < 0.12,
-                       bytestring >=0.10       && < 0.11
+                       bytestring >=0.10       && < 0.11,
+                       vector,
+                       void
    -- hs-source-dirs:      
-  default-language:    Haskell2010
-\ No newline at end of file
+  default-language:    Haskell2010
+  ghc-options: -O2 
diff --git a/test/Test.hs b/test/Test.hs

index f2bf17b43c09a8651bbbfd91b3ee910845ec7c37..373bafb861f99e7bed7390e4192df9eef369bb7c 100644 (file)
--- a/test/Test.hs
+++ b/test/Test.hs
@@ -27,11 +27,11 @@ import qualified Pipes as P
  main :: IO ()
  main = defaultMain [tests]
  -- >>> :main  -a 10000
-
  tests = testGroup "stream_decode" [
    -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
    testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed ,
-  testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe]
+  testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe,
+  testProperty "t_utf8_dec_some" t_utf8_dec_some]
  
  t_utf8_incr_valid  = do
          Positive n <- arbitrary
@@ -82,6 +82,19 @@ t_utf8_incr_pipe  = do
      chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
      appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
  
+--
+t_utf8_dec_some = do    
+       Positive  m <- arbitrary
+       txt         <- genUnicode
+       let bytesLength = mod 10 m :: Int
+       forAll (vector bytesLength) $ 
+              (roundtrip . appendBytes txt) 
+              `eq` 
+              appendBytes txt
+    where 
+    roundtrip bs = case PE.decodeSomeUtf8 bs of
+                        (txt,bys) -> E.encodeUtf8 txt <> bys
+    appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
author	michaelt <what_is_it_to_do_anything@yahoo.com>
	Wed, 15 Jan 2014 03:05:12 +0000 (22:05 -0500)
committer	michaelt <what_is_it_to_do_anything@yahoo.com>
	Wed, 15 Jan 2014 03:05:12 +0000 (22:05 -0500)
Pipes/Text.hs		patch \| blob \| blame \| history
Pipes/Text/Internal.hs		patch \| blob \| blame \| history
bench/IO.hs	[new file with mode: 0644]	patch \| blob
cbits/cbits.c		patch \| blob \| blame \| history
pipes-text.cabal		patch \| blob \| blame \| history
test/Test.hs		patch \| blob \| blame \| history