scrap character replacement; simplify

author michaelt <what_is_it_to_do_anything@yahoo.com>

Thu, 26 Dec 2013 03:25:07 +0000 (22:25 -0500)

committer michaelt <what_is_it_to_do_anything@yahoo.com>

Thu, 26 Dec 2013 03:25:07 +0000 (22:25 -0500)
author michaelt <what_is_it_to_do_anything@yahoo.com>
Thu, 26 Dec 2013 03:25:07 +0000 (22:25 -0500)
committer michaelt <what_is_it_to_do_anything@yahoo.com>
Thu, 26 Dec 2013 03:25:07 +0000 (22:25 -0500)
diff --git a/Pipes/Text.hs b/Pipes/Text.hs

index 6845dd3b533935ba54e03656008ac6923d17a69e..d62aee7af6eeb34d4651033223a56ccb0034c71a 100644 (file)
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE RankNTypes, TypeFamilies, CPP #-}
+{-# LANGUAGE RankNTypes, TypeFamilies, NoMonomorphismRestriction #-}
  
  {-| This module provides @pipes@ utilities for \"text streams\", which are
      streams of 'Text' chunks.  The individual chunks are uniformly @strict@, but 
@@ -116,7 +116,6 @@ module Pipes.Text  (
      lines,
      words,
      decodeUtf8,
-    decodeUtf8With,
      -- * Transformations
      intersperse,
      
@@ -209,7 +208,7 @@ fromLazy  = foldrChunks (\e a -> yield e >> a) (return ())
  {-# INLINABLE fromLazy #-}
  
  -- | Stream text from 'stdin'
-stdin :: MonadIO m => Producer' Text m (Producer ByteString m ())
+stdin :: MonadIO m => Producer Text m (Producer ByteString m ())
  stdin = fromHandle IO.stdin
  {-# INLINABLE stdin #-}
  
@@ -217,52 +216,17 @@ stdin = fromHandle IO.stdin
      determined by the good sense of the text library. 
  -}
  
-fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ())
--- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)`
--- if only so that mistakes can be concentrated in one place.
--- This modifies something that was faster on an earlier iteration.
--- Note also that the `text` replacement system is being ignored;
--- with a replacement scheme one could have `Producer Text m ()`
--- the relation to the replacement business needs to be thought out.
--- The complicated type seems overmuch for the toy stdin above
-fromHandle h = go PE.streamDecodeUtf8 B.empty where
-  act = B.hGetSome h defaultChunkSize
-  go dec old = do chunk <- liftIO act
-                  if B.null chunk 
-                    then if B.null old then return (return ())
-                                       else return (yield old >> return ())
-                    else case dec chunk of 
-                           PE.Some text bs dec' -> 
-                              if T.null text then go dec' (B.append old bs) 
-                                             else do yield text
-                                                     go dec' B.empty
-                           PE.Other text bs ->
-                              if T.null text then return (do yield old
-                                                             yield bs
-                                                             PB.fromHandle h)
-                                             else do yield text
-                                                     return (do yield bs
-                                                                PB.fromHandle h)
+fromHandle :: MonadIO m => IO.Handle -> Producer Text m (Producer ByteString m ())
+fromHandle h = decodeUtf8 (PB.fromHandle h)
  {-# INLINE fromHandle#-}
--- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as
--- the dedicated Text IO function 'hGetChunk' ;
--- this way "runEffect $ PT.fromHandle hIn  >->  PT.toHandle hOut"
--- runs the same as the conduit equivalent, only slightly slower 
--- than "runEffect $ PB.fromHandle hIn  >->  PB.toHandle hOut"
--- #else
--- fromHandle h = go where
---     go = do txt <- liftIO (T.hGetChunk h)
---             unless (T.null txt) $ do yield txt
---                                      go
--- {-# INLINABLE fromHandle#-}
--- #endif
+
  {-| Stream text from a file using Pipes.Safe
  
  >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
  MAIN = PUTSTRLN "HELLO WORLD"
  -}
  
-readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ())
+readFile :: (MonadSafe m) => FilePath -> Producer Text m (Producer ByteString m ())
  readFile file = Safe.withFile file IO.ReadMode fromHandle
  {-# INLINABLE readFile #-}
  
@@ -338,7 +302,7 @@ toHandle h = for cat (liftIO . T.hPutStr h)
  
  
  -- | Stream text into a file. Uses @pipes-safe@.
-writeFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Consumer' Text m ()
+writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
  writeFile file = Safe.withFile file IO.WriteMode toHandle
  
  -- | Apply a transformation to each 'Char' in the stream
@@ -624,42 +588,18 @@ count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
  
  -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
  -- into a Pipe of Text
-decodeUtf8
-  :: Monad m
-  => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
-decodeUtf8 = decodeUtf8With Nothing
-{-# INLINEABLE decodeUtf8 #-}
-
--- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
--- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@
--- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\"
-decodeUtf8With
-  :: Monad m  
-  => Maybe TE.OnDecodeError 
-  -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
-decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where 
-  go dec old p = do
-    x <- lift (next p)
-    case x of
-      Left r -> if B.null old then return (return r)
-                              else return (do yield old 
-                                              return r)
-      Right (chunk, p') -> 
-        case dec chunk of 
-          PE.Some text l dec' -> 
-            if T.null text then go dec' (B.append old l) p'
-                           else do yield text
-                                   go dec' B.empty p'
-          PE.Other text bs ->
-            if T.null text then return (do yield old 
-                                           yield bs
-                                           p')
-                           else do yield text
-                                   return (do yield bs
-                                              p')
-{-# INLINEABLE decodeUtf8With #-}
-
  
+decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf8 = go PE.streamDecodeUtf8 where
+  go dec0 p = do 
+     x <- lift (next p)
+     case x of Left r -> return (return r)
+               Right (chunk, p') -> 
+                 case dec0 chunk of PE.Some text _ dec -> do yield text
+                                                             go dec p'
+                                    PE.Other text bs -> do yield text
+                                                           return (do yield bs
+                                                                      p')
  
  -- | Splits a 'Producer' after the given number of characters
  splitAt
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs

index 05d9887c0459e2affc0b1cd34c43eef3ff7537f4..73d6fa487e4b4fb4491c7f680fd4dbc0f91b037c 100644 (file)
--- a/Pipes/Text/Internal.hs
+++ b/Pipes/Text/Internal.hs
@@ -1,11 +1,10 @@
  {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
      UnliftedFFITypes #-}
--- This module lifts material from Brian O'Sullivan's text package 
+-- This module lifts assorted materials from Brian O'Sullivan's text package 
  -- especially Data.Text.Encoding in order to define a pipes-appropriate
  -- streamDecodeUtf8
  module Pipes.Text.Internal 
      ( Decoding(..)
-    , streamDecodeUtf8With
      , streamDecodeUtf8
      ) where
  
@@ -20,6 +19,7 @@ import Data.Bits ((.&.))
  import Data.ByteString as B
  import Data.ByteString.Internal as B
  import Data.Text ()
+import qualified Data.Text as T
  import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
  import Data.Text.Internal (Text(..), safe, textP)
  import Data.Word (Word8, Word32)
@@ -56,94 +56,52 @@ instance Show Decoding where
  newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
  newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
  
--- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
--- encoded text that is known to be valid.
---
--- If the input contains any invalid UTF-8 data, an exception will be
--- thrown (either by this function or a continuation) that cannot be
--- caught in pure code.  For more control over the handling of invalid
--- data, use 'streamDecodeUtf8With'.
  streamDecodeUtf8 :: ByteString -> Decoding
-streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode)
+streamDecodeUtf8 = decodeChunk B.empty 0 0
  
--- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
--- encoded text.
-streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding
-streamDecodeUtf8With mErr = case mErr of 
-    Nothing    -> decodeWith False strictDecode 
-    Just onErr -> decodeWith True onErr 
- where
-  -- We create a slightly larger than necessary buffer to accommodate a
-  -- potential surrogate pair started in the last buffer
- decodeWith replace onErr = decodeChunk 0 0
-  where
-  decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
-  decodeChunk codepoint0 state0 bs@(PS fp off len) =
-    runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
+decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
+decodeChunk old codepoint0 state0 bs@(PS fp off len) = 
+                  runST $ do marray <- A.new (len+1) 
+                             unsafeIOToST (decodeChunkToBuffer marray)
     where
-    decodeChunkToBuffer :: A.MArray s -> IO Decoding
-    decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
-      with (0::CSize) $ \destOffPtr ->
-      with codepoint0 $ \codepointPtr ->
-      with state0 $ \statePtr ->
-      with nullPtr $ \curPtrPtr ->
-        let end = ptr `plusPtr` (off + len)
-            loop curPtr = do
-              poke curPtrPtr curPtr
-              curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
-                         curPtrPtr end codepointPtr statePtr
-              state <- peek statePtr
-              case state of
-                UTF8_REJECT ->  
-                  -- We encountered an encoding error
-                 if replace 
-                 then do 
-                  x <- peek curPtr'
-                  case onErr desc (Just x) of
-                    Nothing -> loop $ curPtr' `plusPtr` 1
-                    Just c -> do
-                      destOff <- peek destOffPtr
-                      w <- unsafeSTToIO $
-                           unsafeWrite dest (fromIntegral destOff) (safe c)
-                      poke destOffPtr (destOff + fromIntegral w)
-                      poke statePtr 0
-                      loop $ curPtr' `plusPtr` 1
-                 else do 
-                  n <- peek destOffPtr 
-                  chunkText <- unsafeSTToIO $ do
-                      arr <- A.unsafeFreeze dest
-                      return $! textP arr 0 (fromIntegral n)
-                  lastPtr <- peek curPtrPtr
-                  let left = lastPtr `minusPtr` curPtr
-                  return $ Other chunkText (B.drop left bs)
-                _ -> do
-                  -- We encountered the end of the buffer while decoding
-                  n <- peek destOffPtr
-                  codepoint <- peek codepointPtr
-                  chunkText <- unsafeSTToIO $ do
-                      arr <- A.unsafeFreeze dest
-                      return $! textP arr 0 (fromIntegral n)
-                  lastPtr <- peek curPtrPtr
-                  let left = lastPtr `minusPtr` curPtr
-                  return $ Some chunkText (B.drop left bs)
-                           (decodeChunk codepoint state)
-        in loop (ptr `plusPtr` off)
-  desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
+   decodeChunkToBuffer :: A.MArray s -> IO Decoding
+   decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+     with (0::CSize) $ \destOffPtr ->
+     with codepoint0 $ \codepointPtr ->
+     with state0     $ \statePtr ->
+     with nullPtr    $ \curPtrPtr ->
+       do let end = ptr `plusPtr` (off + len)
+              curPtr = ptr `plusPtr` off
+          poke curPtrPtr curPtr
+          c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+          state <- peek statePtr
+          lastPtr <- peek curPtrPtr
+          codepoint <- peek codepointPtr
+          n <- peek destOffPtr
+          chunkText <- mkText dest n
+          let left      = lastPtr `minusPtr` curPtr
+              remaining = B.drop left bs
+              accum = if T.null chunkText then B.append old remaining  else remaining 
+          return $ case state of 
+            UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
+            _ ->           Some  chunkText accum (decodeChunk accum codepoint state)
+
+
+mkText :: A.MArray s -> CSize -> IO Text
+mkText dest n =  unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+                                   return $! textP arr 0 (fromIntegral n)
  
  ord :: Char -> Int
  ord (C# c#) = I# (ord# c#)
  {-# INLINE ord #-}
  
-
  unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
  unsafeWrite marr i c
-    | n < 0x10000 = do 
-        A.unsafeWrite marr i (fromIntegral n)
-        return 1
-    | otherwise = do
-        A.unsafeWrite marr i lo
-        A.unsafeWrite marr (i+1) hi
-        return 2
+    | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
+                       return 1
+    | otherwise   = do A.unsafeWrite marr i lo
+                       A.unsafeWrite marr (i+1) hi
+                       return 2
      where n = ord c
            m = n - 0x10000
            lo = fromIntegral $ (m `shiftR` 10) + 0xD800
diff --git a/test/Test.hs b/test/Test.hs

index 1579f2b154172a80f92d3f25215f872db0e08a5b..66351d1611d567a54df6e4d7f29b221372008596 100644 (file)
--- a/test/Test.hs
+++ b/test/Test.hs
@@ -8,6 +8,7 @@ import Test.Framework.Providers.QuickCheck2 (testProperty)
  import Control.Exception (catch)
  import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord)
  import Data.Monoid (Monoid(..))
+import Control.Monad
  import Data.String (fromString)
  import Data.Text.Encoding.Error
  import qualified Data.List as L
@@ -19,15 +20,20 @@ import qualified Data.Text as T
  import qualified Data.Text.Lazy as TL
  import qualified Data.Text.Encoding as E
  import qualified Pipes.Text.Internal as PE
+import qualified Pipes.Text as TP
+import qualified Pipes.ByteString as BP 
+import qualified Pipes as P 
  
+
+import Debug.Trace
  main :: IO ()
  main = defaultMain [tests]
  -- >>> :main  -a 10000
  
  tests = testGroup "stream_decode" [
-
-  testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
-  testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed]
+  -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
+  testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed,
+   testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe]
  
  t_utf8_incr_valid  = do
          Positive n <- arbitrary
@@ -43,18 +49,46 @@ t_utf8_incr_valid  = do
                                PE.Some t _ f' -> t : feedChunksOf n f' b
                                _             -> []
  
-t_utf8_incr_mixed  = do    
-       Positive n <- arbitrary  
+t_utf8_incr_mixed  = do
+       Positive n <- arbitrary
         txt <- genUnicode
-       forAll (vector 9) $ (roundtrip . chunk (mod n 7 + 1) . appendBytes txt) `eq` appendBytes txt
+       let chunkSize = mod n 7 + 1
+       forAll (vector 9) $ 
+              (roundtrip . chunk chunkSize . appendBytes txt) `eq` (appendBytes txt)
      where 
      roundtrip :: [B.ByteString] -> B.ByteString
-    roundtrip bss = go (PE.streamDecodeUtf8With Nothing) B.empty B.empty bss where                                                      
-       go dec acc old [] = acc <> old
-       go dec acc old (bs:bss) = case dec bs of 
-         PE.Some t new dec' -> if T.null t then go dec' (acc <> E.encodeUtf8 t) (old <> new) bss
-                                           else go dec' (acc <> E.encodeUtf8 t) new bss
-         PE.Other t bs' -> if T.null t then acc <> old <> bs <> B.concat bss 
-                                       else acc <> E.encodeUtf8 t <> bs' <> B.concat bss 
+    roundtrip bss = go PE.streamDecodeUtf8 B.empty bss where    
+       go dec acc [] = acc   
+       go dec acc [bs]  = case dec bs of 
+          PE.Some t l dec' -> acc <> E.encodeUtf8 t <> l
+          PE.Other t bs'   -> acc <> E.encodeUtf8 t <> bs' 
+       go dec acc (bs:bss) = case dec bs of 
+         PE.Some t l dec' -> go dec' (acc <> E.encodeUtf8 t) bss
+         PE.Other t bs'   -> acc <> E.encodeUtf8 t <> bs' <> B.concat bss
+    chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
+    appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
+
+
+
+
+t_utf8_incr_pipe  = do    
+       Positive  m <- arbitrary
+       Positive n  <- arbitrary  
+       txt         <- genUnicode
+       let chunkSize = mod n 7 + 1
+           bytesLength = mod 20 m
+       forAll (vector bytesLength) $ 
+              (BL.toStrict . BP.toLazy . roundtrip . P.each . chunk chunkSize . appendBytes txt) 
+              `eq` 
+              appendBytes txt
+    where 
+    roundtrip :: Monad m => P.Producer B.ByteString m r -> P.Producer B.ByteString m r
+    roundtrip p = do pbs <- TP.decodeUtf8 p P.>-> TP.encodeUtf8
+                     pbs
      chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
      appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
+
+
+
+
+
author	michaelt <what_is_it_to_do_anything@yahoo.com>
	Thu, 26 Dec 2013 03:25:07 +0000 (22:25 -0500)
committer	michaelt <what_is_it_to_do_anything@yahoo.com>
	Thu, 26 Dec 2013 03:25:07 +0000 (22:25 -0500)
Pipes/Text.hs		patch \| blob \| blame \| history
Pipes/Text/Internal.hs		patch \| blob \| blame \| history
test/Test.hs		patch \| blob \| blame \| history