From c9d1c945a4343d756533b85060c35c04be0c8b02 Mon Sep 17 00:00:00 2001 From: michaelt Date: Wed, 25 Dec 2013 22:25:07 -0500 Subject: scrap character replacement; simplify --- Pipes/Text.hs | 96 ++++++++-------------------------------- Pipes/Text/Internal.hs | 118 ++++++++++++++++--------------------------------- test/Test.hs | 60 +++++++++++++++++++------ 3 files changed, 103 insertions(+), 171 deletions(-) diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 6845dd3..d62aee7 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE RankNTypes, TypeFamilies, CPP #-} +{-# LANGUAGE RankNTypes, TypeFamilies, NoMonomorphismRestriction #-} {-| This module provides @pipes@ utilities for \"text streams\", which are streams of 'Text' chunks. The individual chunks are uniformly @strict@, but @@ -116,7 +116,6 @@ module Pipes.Text ( lines, words, decodeUtf8, - decodeUtf8With, -- * Transformations intersperse, @@ -209,7 +208,7 @@ fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) {-# INLINABLE fromLazy #-} -- | Stream text from 'stdin' -stdin :: MonadIO m => Producer' Text m (Producer ByteString m ()) +stdin :: MonadIO m => Producer Text m (Producer ByteString m ()) stdin = fromHandle IO.stdin {-# INLINABLE stdin #-} @@ -217,52 +216,17 @@ stdin = fromHandle IO.stdin determined by the good sense of the text library. -} -fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ()) --- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)` --- if only so that mistakes can be concentrated in one place. --- This modifies something that was faster on an earlier iteration. --- Note also that the `text` replacement system is being ignored; --- with a replacement scheme one could have `Producer Text m ()` --- the relation to the replacement business needs to be thought out. --- The complicated type seems overmuch for the toy stdin above -fromHandle h = go PE.streamDecodeUtf8 B.empty where - act = B.hGetSome h defaultChunkSize - go dec old = do chunk <- liftIO act - if B.null chunk - then if B.null old then return (return ()) - else return (yield old >> return ()) - else case dec chunk of - PE.Some text bs dec' -> - if T.null text then go dec' (B.append old bs) - else do yield text - go dec' B.empty - PE.Other text bs -> - if T.null text then return (do yield old - yield bs - PB.fromHandle h) - else do yield text - return (do yield bs - PB.fromHandle h) +fromHandle :: MonadIO m => IO.Handle -> Producer Text m (Producer ByteString m ()) +fromHandle h = decodeUtf8 (PB.fromHandle h) {-# INLINE fromHandle#-} --- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as --- the dedicated Text IO function 'hGetChunk' ; --- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut" --- runs the same as the conduit equivalent, only slightly slower --- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut" --- #else --- fromHandle h = go where --- go = do txt <- liftIO (T.hGetChunk h) --- unless (T.null txt) $ do yield txt --- go --- {-# INLINABLE fromHandle#-} --- #endif + {-| Stream text from a file using Pipes.Safe >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout MAIN = PUTSTRLN "HELLO WORLD" -} -readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ()) +readFile :: (MonadSafe m) => FilePath -> Producer Text m (Producer ByteString m ()) readFile file = Safe.withFile file IO.ReadMode fromHandle {-# INLINABLE readFile #-} @@ -338,7 +302,7 @@ toHandle h = for cat (liftIO . T.hPutStr h) -- | Stream text into a file. Uses @pipes-safe@. -writeFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Consumer' Text m () +writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m () writeFile file = Safe.withFile file IO.WriteMode toHandle -- | Apply a transformation to each 'Char' in the stream @@ -624,42 +588,18 @@ count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded -- into a Pipe of Text -decodeUtf8 - :: Monad m - => Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeUtf8 = decodeUtf8With Nothing -{-# INLINEABLE decodeUtf8 #-} - --- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded --- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@ --- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\" -decodeUtf8With - :: Monad m - => Maybe TE.OnDecodeError - -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where - go dec old p = do - x <- lift (next p) - case x of - Left r -> if B.null old then return (return r) - else return (do yield old - return r) - Right (chunk, p') -> - case dec chunk of - PE.Some text l dec' -> - if T.null text then go dec' (B.append old l) p' - else do yield text - go dec' B.empty p' - PE.Other text bs -> - if T.null text then return (do yield old - yield bs - p') - else do yield text - return (do yield bs - p') -{-# INLINEABLE decodeUtf8With #-} - +decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf8 = go PE.streamDecodeUtf8 where + go dec0 p = do + x <- lift (next p) + case x of Left r -> return (return r) + Right (chunk, p') -> + case dec0 chunk of PE.Some text _ dec -> do yield text + go dec p' + PE.Other text bs -> do yield text + return (do yield bs + p') -- | Splits a 'Producer' after the given number of characters splitAt diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index 05d9887..73d6fa4 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs @@ -1,11 +1,10 @@ {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} --- This module lifts material from Brian O'Sullivan's text package +-- This module lifts assorted materials from Brian O'Sullivan's text package -- especially Data.Text.Encoding in order to define a pipes-appropriate -- streamDecodeUtf8 module Pipes.Text.Internal ( Decoding(..) - , streamDecodeUtf8With , streamDecodeUtf8 ) where @@ -20,6 +19,7 @@ import Data.Bits ((.&.)) import Data.ByteString as B import Data.ByteString.Internal as B import Data.Text () +import qualified Data.Text as T import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) import Data.Text.Internal (Text(..), safe, textP) import Data.Word (Word8, Word32) @@ -56,94 +56,52 @@ instance Show Decoding where newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) --- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 --- encoded text that is known to be valid. --- --- If the input contains any invalid UTF-8 data, an exception will be --- thrown (either by this function or a continuation) that cannot be --- caught in pure code. For more control over the handling of invalid --- data, use 'streamDecodeUtf8With'. streamDecodeUtf8 :: ByteString -> Decoding -streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode) +streamDecodeUtf8 = decodeChunk B.empty 0 0 --- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 --- encoded text. -streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding -streamDecodeUtf8With mErr = case mErr of - Nothing -> decodeWith False strictDecode - Just onErr -> decodeWith True onErr - where - -- We create a slightly larger than necessary buffer to accommodate a - -- potential surrogate pair started in the last buffer - decodeWith replace onErr = decodeChunk 0 0 - where - decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding - decodeChunk codepoint0 state0 bs@(PS fp off len) = - runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1) +decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding +decodeChunk old codepoint0 state0 bs@(PS fp off len) = + runST $ do marray <- A.new (len+1) + unsafeIOToST (decodeChunkToBuffer marray) where - decodeChunkToBuffer :: A.MArray s -> IO Decoding - decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> - with (0::CSize) $ \destOffPtr -> - with codepoint0 $ \codepointPtr -> - with state0 $ \statePtr -> - with nullPtr $ \curPtrPtr -> - let end = ptr `plusPtr` (off + len) - loop curPtr = do - poke curPtrPtr curPtr - curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr - curPtrPtr end codepointPtr statePtr - state <- peek statePtr - case state of - UTF8_REJECT -> - -- We encountered an encoding error - if replace - then do - x <- peek curPtr' - case onErr desc (Just x) of - Nothing -> loop $ curPtr' `plusPtr` 1 - Just c -> do - destOff <- peek destOffPtr - w <- unsafeSTToIO $ - unsafeWrite dest (fromIntegral destOff) (safe c) - poke destOffPtr (destOff + fromIntegral w) - poke statePtr 0 - loop $ curPtr' `plusPtr` 1 - else do - n <- peek destOffPtr - chunkText <- unsafeSTToIO $ do - arr <- A.unsafeFreeze dest - return $! textP arr 0 (fromIntegral n) - lastPtr <- peek curPtrPtr - let left = lastPtr `minusPtr` curPtr - return $ Other chunkText (B.drop left bs) - _ -> do - -- We encountered the end of the buffer while decoding - n <- peek destOffPtr - codepoint <- peek codepointPtr - chunkText <- unsafeSTToIO $ do - arr <- A.unsafeFreeze dest - return $! textP arr 0 (fromIntegral n) - lastPtr <- peek curPtrPtr - let left = lastPtr `minusPtr` curPtr - return $ Some chunkText (B.drop left bs) - (decodeChunk codepoint state) - in loop (ptr `plusPtr` off) - desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream" + decodeChunkToBuffer :: A.MArray s -> IO Decoding + decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> + with (0::CSize) $ \destOffPtr -> + with codepoint0 $ \codepointPtr -> + with state0 $ \statePtr -> + with nullPtr $ \curPtrPtr -> + do let end = ptr `plusPtr` (off + len) + curPtr = ptr `plusPtr` off + poke curPtrPtr curPtr + c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr + state <- peek statePtr + lastPtr <- peek curPtrPtr + codepoint <- peek codepointPtr + n <- peek destOffPtr + chunkText <- mkText dest n + let left = lastPtr `minusPtr` curPtr + remaining = B.drop left bs + accum = if T.null chunkText then B.append old remaining else remaining + return $ case state of + UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error + _ -> Some chunkText accum (decodeChunk accum codepoint state) + + +mkText :: A.MArray s -> CSize -> IO Text +mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest + return $! textP arr 0 (fromIntegral n) ord :: Char -> Int ord (C# c#) = I# (ord# c#) {-# INLINE ord #-} - unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int unsafeWrite marr i c - | n < 0x10000 = do - A.unsafeWrite marr i (fromIntegral n) - return 1 - | otherwise = do - A.unsafeWrite marr i lo - A.unsafeWrite marr (i+1) hi - return 2 + | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) + return 1 + | otherwise = do A.unsafeWrite marr i lo + A.unsafeWrite marr (i+1) hi + return 2 where n = ord c m = n - 0x10000 lo = fromIntegral $ (m `shiftR` 10) + 0xD800 diff --git a/test/Test.hs b/test/Test.hs index 1579f2b..66351d1 100644 --- a/test/Test.hs +++ b/test/Test.hs @@ -8,6 +8,7 @@ import Test.Framework.Providers.QuickCheck2 (testProperty) import Control.Exception (catch) import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord) import Data.Monoid (Monoid(..)) +import Control.Monad import Data.String (fromString) import Data.Text.Encoding.Error import qualified Data.List as L @@ -19,15 +20,20 @@ import qualified Data.Text as T import qualified Data.Text.Lazy as TL import qualified Data.Text.Encoding as E import qualified Pipes.Text.Internal as PE +import qualified Pipes.Text as TP +import qualified Pipes.ByteString as BP +import qualified Pipes as P + +import Debug.Trace main :: IO () main = defaultMain [tests] -- >>> :main -a 10000 tests = testGroup "stream_decode" [ - - testProperty "t_utf8_incr_valid" t_utf8_incr_valid, - testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed] + -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid, + testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed, + testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe] t_utf8_incr_valid = do Positive n <- arbitrary @@ -43,18 +49,46 @@ t_utf8_incr_valid = do PE.Some t _ f' -> t : feedChunksOf n f' b _ -> [] -t_utf8_incr_mixed = do - Positive n <- arbitrary +t_utf8_incr_mixed = do + Positive n <- arbitrary txt <- genUnicode - forAll (vector 9) $ (roundtrip . chunk (mod n 7 + 1) . appendBytes txt) `eq` appendBytes txt + let chunkSize = mod n 7 + 1 + forAll (vector 9) $ + (roundtrip . chunk chunkSize . appendBytes txt) `eq` (appendBytes txt) where roundtrip :: [B.ByteString] -> B.ByteString - roundtrip bss = go (PE.streamDecodeUtf8With Nothing) B.empty B.empty bss where - go dec acc old [] = acc <> old - go dec acc old (bs:bss) = case dec bs of - PE.Some t new dec' -> if T.null t then go dec' (acc <> E.encodeUtf8 t) (old <> new) bss - else go dec' (acc <> E.encodeUtf8 t) new bss - PE.Other t bs' -> if T.null t then acc <> old <> bs <> B.concat bss - else acc <> E.encodeUtf8 t <> bs' <> B.concat bss + roundtrip bss = go PE.streamDecodeUtf8 B.empty bss where + go dec acc [] = acc + go dec acc [bs] = case dec bs of + PE.Some t l dec' -> acc <> E.encodeUtf8 t <> l + PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs' + go dec acc (bs:bss) = case dec bs of + PE.Some t l dec' -> go dec' (acc <> E.encodeUtf8 t) bss + PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs' <> B.concat bss + chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b + appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append + + + + +t_utf8_incr_pipe = do + Positive m <- arbitrary + Positive n <- arbitrary + txt <- genUnicode + let chunkSize = mod n 7 + 1 + bytesLength = mod 20 m + forAll (vector bytesLength) $ + (BL.toStrict . BP.toLazy . roundtrip . P.each . chunk chunkSize . appendBytes txt) + `eq` + appendBytes txt + where + roundtrip :: Monad m => P.Producer B.ByteString m r -> P.Producer B.ByteString m r + roundtrip p = do pbs <- TP.decodeUtf8 p P.>-> TP.encodeUtf8 + pbs chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append + + + + + -- cgit v1.2.3