scrap character replacement; simplify

author: michaelt <what_is_it_to_do_anything@yahoo.com> 2013-12-25 22:25:07 -0500
committer: michaelt <what_is_it_to_do_anything@yahoo.com> 2013-12-25 22:25:07 -0500
commit: c9d1c945a4343d756533b85060c35c04be0c8b02 (patch)
tree: 0c31ae2e13002c1311ae3cc1e15608750c5a0a9c
parent: 8c48280926efffc0ca52a5d9ca796d639d053379 (diff)
download: text-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.tar.gz
text-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.tar.zst
text-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.zip
3 files changed, 103 insertions, 171 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs
index 6845dd3..d62aee7 100644
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE RankNTypes, TypeFamilies, CPP #-}
+{-# LANGUAGE RankNTypes, TypeFamilies, NoMonomorphismRestriction #-}
 {-| This module provides @pipes@ utilities for \"text streams\", which are
    streams of 'Text' chunks.  The individual chunks are uniformly @strict@, but 
@@ -116,7 +116,6 @@ module Pipes.Text  (
    lines,
    words,
    decodeUtf8,
-    decodeUtf8With,
    -- * Transformations
    intersperse,
    
@@ -209,7 +208,7 @@ fromLazy  = foldrChunks (\e a -> yield e >> a) (return ())
 {-# INLINABLE fromLazy #-}
 -- | Stream text from 'stdin'
-stdin :: MonadIO m => Producer' Text m (Producer ByteString m ())
+stdin :: MonadIO m => Producer Text m (Producer ByteString m ())
 stdin = fromHandle IO.stdin
 {-# INLINABLE stdin #-}
@@ -217,52 +216,17 @@ stdin = fromHandle IO.stdin
    determined by the good sense of the text library. 
 -}
-fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ())
+fromHandle :: MonadIO m => IO.Handle -> Producer Text m (Producer ByteString m ())
-- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)`
+fromHandle h = decodeUtf8 (PB.fromHandle h)
-- if only so that mistakes can be concentrated in one place.
-- This modifies something that was faster on an earlier iteration.
-- Note also that the `text` replacement system is being ignored;
-- with a replacement scheme one could have `Producer Text m ()`
-- the relation to the replacement business needs to be thought out.
-- The complicated type seems overmuch for the toy stdin above
-fromHandle h = go PE.streamDecodeUtf8 B.empty where
-  act = B.hGetSome h defaultChunkSize
-  go dec old = do chunk <- liftIO act
-                  if B.null chunk 
-                    then if B.null old then return (return ())
-                                       else return (yield old >> return ())
-                    else case dec chunk of 
-                           PE.Some text bs dec' -> 
-                              if T.null text then go dec' (B.append old bs) 
-                                             else do yield text
-                                                     go dec' B.empty
-                           PE.Other text bs ->
-                              if T.null text then return (do yield old
-                                                             yield bs
-                                                             PB.fromHandle h)
-                                             else do yield text
-                                                     return (do yield bs
-                                                                PB.fromHandle h)
 {-# INLINE fromHandle#-}
-- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as
-- the dedicated Text IO function 'hGetChunk' ;
-- this way "runEffect $ PT.fromHandle hIn  >->  PT.toHandle hOut"
-- runs the same as the conduit equivalent, only slightly slower 
-- than "runEffect $ PB.fromHandle hIn  >->  PB.toHandle hOut"
-- #else
-- fromHandle h = go where
--     go = do txt <- liftIO (T.hGetChunk h)
--             unless (T.null txt) $ do yield txt
--                                      go
-- {-# INLINABLE fromHandle#-}
-- #endif
 {-| Stream text from a file using Pipes.Safe
 >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
 MAIN = PUTSTRLN "HELLO WORLD"
 -}
-readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ())
+readFile :: (MonadSafe m) => FilePath -> Producer Text m (Producer ByteString m ())
 readFile file = Safe.withFile file IO.ReadMode fromHandle
 {-# INLINABLE readFile #-}
@@ -338,7 +302,7 @@ toHandle h = for cat (liftIO . T.hPutStr h)
 -- | Stream text into a file. Uses @pipes-safe@.
-writeFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Consumer' Text m ()
+writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
 writeFile file = Safe.withFile file IO.WriteMode toHandle
 -- | Apply a transformation to each 'Char' in the stream
@@ -624,42 +588,18 @@ count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
 -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
 -- into a Pipe of Text
-decodeUtf8
-  :: Monad m
-  => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
-decodeUtf8 = decodeUtf8With Nothing
-{-# INLINEABLE decodeUtf8 #-}
-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
-- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@
-- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\"
-decodeUtf8With
-  :: Monad m  
-  => Maybe TE.OnDecodeError 
-  -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
-decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where 
-  go dec old p = do
-    x <- lift (next p)
-    case x of
-      Left r -> if B.null old then return (return r)
-                              else return (do yield old 
-                                              return r)
-      Right (chunk, p') -> 
-        case dec chunk of 
-          PE.Some text l dec' -> 
-            if T.null text then go dec' (B.append old l) p'
-                           else do yield text
-                                   go dec' B.empty p'
-          PE.Other text bs ->
-            if T.null text then return (do yield old 
-                                           yield bs
-                                           p')
-                           else do yield text
-                                   return (do yield bs
-                                              p')
-{-# INLINEABLE decodeUtf8With #-}
+decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf8 = go PE.streamDecodeUtf8 where
+  go dec0 p = do 
+     x <- lift (next p)
+     case x of Left r -> return (return r)
+               Right (chunk, p') -> 
+                 case dec0 chunk of PE.Some text _ dec -> do yield text
+                                                             go dec p'
+                                    PE.Other text bs -> do yield text
+                                                           return (do yield bs
+                                                                      p')
 -- | Splits a 'Producer' after the given number of characters
 splitAt
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
index 05d9887..73d6fa4 100644
--- a/Pipes/Text/Internal.hs
+++ b/Pipes/Text/Internal.hs
@@ -1,11 +1,10 @@
 {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
    UnliftedFFITypes #-}
-- This module lifts material from Brian O'Sullivan's text package 
+-- This module lifts assorted materials from Brian O'Sullivan's text package 
 -- especially Data.Text.Encoding in order to define a pipes-appropriate
 -- streamDecodeUtf8
 module Pipes.Text.Internal 
    ( Decoding(..)
-    , streamDecodeUtf8With
    , streamDecodeUtf8
    ) where
@@ -20,6 +19,7 @@ import Data.Bits ((.&.))
 import Data.ByteString as B
 import Data.ByteString.Internal as B
 import Data.Text ()
+import qualified Data.Text as T
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
 import Data.Text.Internal (Text(..), safe, textP)
 import Data.Word (Word8, Word32)
@@ -56,94 +56,52 @@ instance Show Decoding where
 newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
 newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
-- encoded text that is known to be valid.
--
-- If the input contains any invalid UTF-8 data, an exception will be
-- thrown (either by this function or a continuation) that cannot be
-- caught in pure code.  For more control over the handling of invalid
-- data, use 'streamDecodeUtf8With'.
 streamDecodeUtf8 :: ByteString -> Decoding
-streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode)
+streamDecodeUtf8 = decodeChunk B.empty 0 0
-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
+decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
-- encoded text.
+decodeChunk old codepoint0 state0 bs@(PS fp off len) = 
-streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding
+                  runST $ do marray <- A.new (len+1) 
-streamDecodeUtf8With mErr = case mErr of 
+                             unsafeIOToST (decodeChunkToBuffer marray)
-    Nothing    -> decodeWith False strictDecode 
-    Just onErr -> decodeWith True onErr 
- where
-  -- We create a slightly larger than necessary buffer to accommodate a
-  -- potential surrogate pair started in the last buffer
- decodeWith replace onErr = decodeChunk 0 0
-  where
-  decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
-  decodeChunk codepoint0 state0 bs@(PS fp off len) =
-    runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
   where
-    decodeChunkToBuffer :: A.MArray s -> IO Decoding
+   decodeChunkToBuffer :: A.MArray s -> IO Decoding
-    decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+   decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
-      with (0::CSize) $ \destOffPtr ->
+     with (0::CSize) $ \destOffPtr ->
-      with codepoint0 $ \codepointPtr ->
+     with codepoint0 $ \codepointPtr ->
-      with state0 $ \statePtr ->
+     with state0     $ \statePtr ->
-      with nullPtr $ \curPtrPtr ->
+     with nullPtr    $ \curPtrPtr ->
-        let end = ptr `plusPtr` (off + len)
+       do let end = ptr `plusPtr` (off + len)
-            loop curPtr = do
+              curPtr = ptr `plusPtr` off
-              poke curPtrPtr curPtr
+          poke curPtrPtr curPtr
-              curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
+          c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
-                         curPtrPtr end codepointPtr statePtr
+          state <- peek statePtr
-              state <- peek statePtr
+          lastPtr <- peek curPtrPtr
-              case state of
+          codepoint <- peek codepointPtr
-                UTF8_REJECT ->  
+          n <- peek destOffPtr
-                  -- We encountered an encoding error
+          chunkText <- mkText dest n
-                 if replace 
+          let left      = lastPtr `minusPtr` curPtr
-                 then do 
+              remaining = B.drop left bs
-                  x <- peek curPtr'
+              accum = if T.null chunkText then B.append old remaining  else remaining 
-                  case onErr desc (Just x) of
+          return $ case state of 
-                    Nothing -> loop $ curPtr' `plusPtr` 1
+            UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
-                    Just c -> do
+            _ ->           Some  chunkText accum (decodeChunk accum codepoint state)
-                      destOff <- peek destOffPtr
-                      w <- unsafeSTToIO $
-                           unsafeWrite dest (fromIntegral destOff) (safe c)
+mkText :: A.MArray s -> CSize -> IO Text
-                      poke destOffPtr (destOff + fromIntegral w)
+mkText dest n =  unsafeSTToIO $ do arr <- A.unsafeFreeze dest
-                      poke statePtr 0
+                                   return $! textP arr 0 (fromIntegral n)
-                      loop $ curPtr' `plusPtr` 1
-                 else do 
-                  n <- peek destOffPtr 
-                  chunkText <- unsafeSTToIO $ do
-                      arr <- A.unsafeFreeze dest
-                      return $! textP arr 0 (fromIntegral n)
-                  lastPtr <- peek curPtrPtr
-                  let left = lastPtr `minusPtr` curPtr
-                  return $ Other chunkText (B.drop left bs)
-                _ -> do
-                  -- We encountered the end of the buffer while decoding
-                  n <- peek destOffPtr
-                  codepoint <- peek codepointPtr
-                  chunkText <- unsafeSTToIO $ do
-                      arr <- A.unsafeFreeze dest
-                      return $! textP arr 0 (fromIntegral n)
-                  lastPtr <- peek curPtrPtr
-                  let left = lastPtr `minusPtr` curPtr
-                  return $ Some chunkText (B.drop left bs)
-                           (decodeChunk codepoint state)
-        in loop (ptr `plusPtr` off)
-  desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
 ord :: Char -> Int
 ord (C# c#) = I# (ord# c#)
 {-# INLINE ord #-}
 unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
 unsafeWrite marr i c
-    | n < 0x10000 = do 
+    | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
-        A.unsafeWrite marr i (fromIntegral n)
+                       return 1
-        return 1
+    | otherwise   = do A.unsafeWrite marr i lo
-    | otherwise = do
+                       A.unsafeWrite marr (i+1) hi
-        A.unsafeWrite marr i lo
+                       return 2
-        A.unsafeWrite marr (i+1) hi
-        return 2
    where n = ord c
          m = n - 0x10000
          lo = fromIntegral $ (m `shiftR` 10) + 0xD800
diff --git a/test/Test.hs b/test/Test.hs
index 1579f2b..66351d1 100644
--- a/test/Test.hs
+++ b/test/Test.hs
@@ -8,6 +8,7 @@ import Test.Framework.Providers.QuickCheck2 (testProperty)
 import Control.Exception (catch)
 import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord)
 import Data.Monoid (Monoid(..))
+import Control.Monad
 import Data.String (fromString)
 import Data.Text.Encoding.Error
 import qualified Data.List as L
@@ -19,15 +20,20 @@ import qualified Data.Text as T
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Encoding as E
 import qualified Pipes.Text.Internal as PE
+import qualified Pipes.Text as TP
+import qualified Pipes.ByteString as BP 
+import qualified Pipes as P 
+import Debug.Trace
 main :: IO ()
 main = defaultMain [tests]
 -- >>> :main  -a 10000
 tests = testGroup "stream_decode" [
+  -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
-  testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
+  testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed,
-  testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed]
+   testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe]
 t_utf8_incr_valid  = do
        Positive n <- arbitrary
@@ -43,18 +49,46 @@ t_utf8_incr_valid  = do
                              PE.Some t _ f' -> t : feedChunksOf n f' b
                              _             -> []
-t_utf8_incr_mixed  = do    
+t_utf8_incr_mixed  = do
-       Positive n <- arbitrary  
+       Positive n <- arbitrary
       txt <- genUnicode
-       forAll (vector 9) $ (roundtrip . chunk (mod n 7 + 1) . appendBytes txt) `eq` appendBytes txt
+       let chunkSize = mod n 7 + 1
+       forAll (vector 9) $ 
+              (roundtrip . chunk chunkSize . appendBytes txt) `eq` (appendBytes txt)
    where 
    roundtrip :: [B.ByteString] -> B.ByteString
-    roundtrip bss = go (PE.streamDecodeUtf8With Nothing) B.empty B.empty bss where                                                      
+    roundtrip bss = go PE.streamDecodeUtf8 B.empty bss where    
-       go dec acc old [] = acc <> old
+       go dec acc [] = acc   
-       go dec acc old (bs:bss) = case dec bs of 
+       go dec acc [bs]  = case dec bs of 
-         PE.Some t new dec' -> if T.null t then go dec' (acc <> E.encodeUtf8 t) (old <> new) bss
+          PE.Some t l dec' -> acc <> E.encodeUtf8 t <> l
-                                           else go dec' (acc <> E.encodeUtf8 t) new bss
+          PE.Other t bs'   -> acc <> E.encodeUtf8 t <> bs' 
-         PE.Other t bs' -> if T.null t then acc <> old <> bs <> B.concat bss 
+       go dec acc (bs:bss) = case dec bs of 
-                                       else acc <> E.encodeUtf8 t <> bs' <> B.concat bss 
+         PE.Some t l dec' -> go dec' (acc <> E.encodeUtf8 t) bss
+         PE.Other t bs'   -> acc <> E.encodeUtf8 t <> bs' <> B.concat bss
+    chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
+    appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
+t_utf8_incr_pipe  = do    
+       Positive  m <- arbitrary
+       Positive n  <- arbitrary  
+       txt         <- genUnicode
+       let chunkSize = mod n 7 + 1
+           bytesLength = mod 20 m
+       forAll (vector bytesLength) $ 
+              (BL.toStrict . BP.toLazy . roundtrip . P.each . chunk chunkSize . appendBytes txt) 
+              `eq` 
+              appendBytes txt
+    where 
+    roundtrip :: Monad m => P.Producer B.ByteString m r -> P.Producer B.ByteString m r
+    roundtrip p = do pbs <- TP.decodeUtf8 p P.>-> TP.encodeUtf8
+                     pbs
    chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
    appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
author	michaelt <what_is_it_to_do_anything@yahoo.com>	2013-12-25 22:25:07 -0500
committer	michaelt <what_is_it_to_do_anything@yahoo.com>	2013-12-25 22:25:07 -0500
commit	c9d1c945a4343d756533b85060c35c04be0c8b02 (patch)
tree	0c31ae2e13002c1311ae3cc1e15608750c5a0a9c
parent	8c48280926efffc0ca52a5d9ca796d639d053379 (diff)
download	text-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.tar.gz text-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.tar.zst text-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.zip

diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 6845dd3..d62aee7 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs
@@ -1,4 +1,4 @@
1	{-# LANGUAGE RankNTypes, TypeFamilies, CPP #-}	1	{-# LANGUAGE RankNTypes, TypeFamilies, NoMonomorphismRestriction #-}
2		2
3	{-\| This module provides @pipes@ utilities for \"text streams\", which are	3	{-\| This module provides @pipes@ utilities for \"text streams\", which are
4	streams of 'Text' chunks. The individual chunks are uniformly @strict@, but	4	streams of 'Text' chunks. The individual chunks are uniformly @strict@, but
@@ -116,7 +116,6 @@ module Pipes.Text (
116	lines,	116	lines,
117	words,	117	words,
118	decodeUtf8,	118	decodeUtf8,
119	decodeUtf8With,
120	-- * Transformations	119	-- * Transformations
121	intersperse,	120	intersperse,
122		121
@@ -209,7 +208,7 @@ fromLazy = foldrChunks (\e a -> yield e >> a) (return ())
209	{-# INLINABLE fromLazy #-}	208	{-# INLINABLE fromLazy #-}
210		209
211	-- \| Stream text from 'stdin'	210	-- \| Stream text from 'stdin'
212	stdin :: MonadIO m => Producer' Text m (Producer ByteString m ())	211	stdin :: MonadIO m => Producer Text m (Producer ByteString m ())
213	stdin = fromHandle IO.stdin	212	stdin = fromHandle IO.stdin
214	{-# INLINABLE stdin #-}	213	{-# INLINABLE stdin #-}
215		214
@@ -217,52 +216,17 @@ stdin = fromHandle IO.stdin
217	determined by the good sense of the text library.	216	determined by the good sense of the text library.
218	-}	217	-}
219		218
220	fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ())	219	fromHandle :: MonadIO m => IO.Handle -> Producer Text m (Producer ByteString m ())
221	-- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)`	220	fromHandle h = decodeUtf8 (PB.fromHandle h)
222	-- if only so that mistakes can be concentrated in one place.
223	-- This modifies something that was faster on an earlier iteration.
224	-- Note also that the `text` replacement system is being ignored;
225	-- with a replacement scheme one could have `Producer Text m ()`
226	-- the relation to the replacement business needs to be thought out.
227	-- The complicated type seems overmuch for the toy stdin above
228	fromHandle h = go PE.streamDecodeUtf8 B.empty where
229	act = B.hGetSome h defaultChunkSize
230	go dec old = do chunk <- liftIO act
231	if B.null chunk
232	then if B.null old then return (return ())
233	else return (yield old >> return ())
234	else case dec chunk of
235	PE.Some text bs dec' ->
236	if T.null text then go dec' (B.append old bs)
237	else do yield text
238	go dec' B.empty
239	PE.Other text bs ->
240	if T.null text then return (do yield old
241	yield bs
242	PB.fromHandle h)
243	else do yield text
244	return (do yield bs
245	PB.fromHandle h)
246	{-# INLINE fromHandle#-}	221	{-# INLINE fromHandle#-}
247	-- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as	222
248	-- the dedicated Text IO function 'hGetChunk' ;
249	-- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut"
250	-- runs the same as the conduit equivalent, only slightly slower
251	-- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut"
252	-- #else
253	-- fromHandle h = go where
254	-- go = do txt <- liftIO (T.hGetChunk h)
255	-- unless (T.null txt) $ do yield txt
256	-- go
257	-- {-# INLINABLE fromHandle#-}
258	-- #endif
259	{-\| Stream text from a file using Pipes.Safe	223	{-\| Stream text from a file using Pipes.Safe
260		224
261	>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout	225	>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
262	MAIN = PUTSTRLN "HELLO WORLD"	226	MAIN = PUTSTRLN "HELLO WORLD"
263	-}	227	-}
264		228
265	readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ())	229	readFile :: (MonadSafe m) => FilePath -> Producer Text m (Producer ByteString m ())
266	readFile file = Safe.withFile file IO.ReadMode fromHandle	230	readFile file = Safe.withFile file IO.ReadMode fromHandle
267	{-# INLINABLE readFile #-}	231	{-# INLINABLE readFile #-}
268		232
@@ -338,7 +302,7 @@ toHandle h = for cat (liftIO . T.hPutStr h)
338		302
339		303
340	-- \| Stream text into a file. Uses @pipes-safe@.	304	-- \| Stream text into a file. Uses @pipes-safe@.
341	writeFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Consumer' Text m ()	305	writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
342	writeFile file = Safe.withFile file IO.WriteMode toHandle	306	writeFile file = Safe.withFile file IO.WriteMode toHandle
343		307
344	-- \| Apply a transformation to each 'Char' in the stream	308	-- \| Apply a transformation to each 'Char' in the stream
@@ -624,42 +588,18 @@ count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
624		588
625	-- \| Transform a Pipe of 'ByteString's expected to be UTF-8 encoded	589	-- \| Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
626	-- into a Pipe of Text	590	-- into a Pipe of Text
627	decodeUtf8
628	:: Monad m
629	=> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
630	decodeUtf8 = decodeUtf8With Nothing
631	{-# INLINEABLE decodeUtf8 #-}
632
633	-- \| Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
634	-- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@
635	-- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\"
636	decodeUtf8With
637	:: Monad m
638	=> Maybe TE.OnDecodeError
639	-> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
640	decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where
641	go dec old p = do
642	x <- lift (next p)
643	case x of
644	Left r -> if B.null old then return (return r)
645	else return (do yield old
646	return r)
647	Right (chunk, p') ->
648	case dec chunk of
649	PE.Some text l dec' ->
650	if T.null text then go dec' (B.append old l) p'
651	else do yield text
652	go dec' B.empty p'
653	PE.Other text bs ->
654	if T.null text then return (do yield old
655	yield bs
656	p')
657	else do yield text
658	return (do yield bs
659	p')
660	{-# INLINEABLE decodeUtf8With #-}
661
662		591
		592	decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
		593	decodeUtf8 = go PE.streamDecodeUtf8 where
		594	go dec0 p = do
		595	x <- lift (next p)
		596	case x of Left r -> return (return r)
		597	Right (chunk, p') ->
		598	case dec0 chunk of PE.Some text _ dec -> do yield text
		599	go dec p'
		600	PE.Other text bs -> do yield text
		601	return (do yield bs
		602	p')
663		603
664	-- \| Splits a 'Producer' after the given number of characters	604	-- \| Splits a 'Producer' after the given number of characters
665	splitAt	605	splitAt


diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index 05d9887..73d6fa4 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs
@@ -1,11 +1,10 @@
1	{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,	1	{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
2	UnliftedFFITypes #-}	2	UnliftedFFITypes #-}
3	-- This module lifts material from Brian O'Sullivan's text package	3	-- This module lifts assorted materials from Brian O'Sullivan's text package
4	-- especially Data.Text.Encoding in order to define a pipes-appropriate	4	-- especially Data.Text.Encoding in order to define a pipes-appropriate
5	-- streamDecodeUtf8	5	-- streamDecodeUtf8
6	module Pipes.Text.Internal	6	module Pipes.Text.Internal
7	( Decoding(..)	7	( Decoding(..)
8	, streamDecodeUtf8With
9	, streamDecodeUtf8	8	, streamDecodeUtf8
10	) where	9	) where
11		10
@@ -20,6 +19,7 @@ import Data.Bits ((.&.))
20	import Data.ByteString as B	19	import Data.ByteString as B
21	import Data.ByteString.Internal as B	20	import Data.ByteString.Internal as B
22	import Data.Text ()	21	import Data.Text ()
		22	import qualified Data.Text as T
23	import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)	23	import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
24	import Data.Text.Internal (Text(..), safe, textP)	24	import Data.Text.Internal (Text(..), safe, textP)
25	import Data.Word (Word8, Word32)	25	import Data.Word (Word8, Word32)
@@ -56,94 +56,52 @@ instance Show Decoding where
56	newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)	56	newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
57	newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)	57	newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
58		58
59	-- \| Decode, in a stream oriented way, a 'ByteString' containing UTF-8
60	-- encoded text that is known to be valid.
61	--
62	-- If the input contains any invalid UTF-8 data, an exception will be
63	-- thrown (either by this function or a continuation) that cannot be
64	-- caught in pure code. For more control over the handling of invalid
65	-- data, use 'streamDecodeUtf8With'.
66	streamDecodeUtf8 :: ByteString -> Decoding	59	streamDecodeUtf8 :: ByteString -> Decoding
67	streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode)	60	streamDecodeUtf8 = decodeChunk B.empty 0 0
68		61
69	-- \| Decode, in a stream oriented way, a 'ByteString' containing UTF-8	62	decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
70	-- encoded text.	63	decodeChunk old codepoint0 state0 bs@(PS fp off len) =
71	streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding	64	runST $ do marray <- A.new (len+1)
72	streamDecodeUtf8With mErr = case mErr of	65	unsafeIOToST (decodeChunkToBuffer marray)
73	Nothing -> decodeWith False strictDecode
74	Just onErr -> decodeWith True onErr
75	where
76	-- We create a slightly larger than necessary buffer to accommodate a
77	-- potential surrogate pair started in the last buffer
78	decodeWith replace onErr = decodeChunk 0 0
79	where
80	decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
81	decodeChunk codepoint0 state0 bs@(PS fp off len) =
82	runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
83	where	66	where
84	decodeChunkToBuffer :: A.MArray s -> IO Decoding	67	decodeChunkToBuffer :: A.MArray s -> IO Decoding
85	decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->	68	decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
86	with (0::CSize) $ \destOffPtr ->	69	with (0::CSize) $ \destOffPtr ->
87	with codepoint0 $ \codepointPtr ->	70	with codepoint0 $ \codepointPtr ->
88	with state0 $ \statePtr ->	71	with state0 $ \statePtr ->
89	with nullPtr $ \curPtrPtr ->	72	with nullPtr $ \curPtrPtr ->
90	let end = ptr `plusPtr` (off + len)	73	do let end = ptr `plusPtr` (off + len)
91	loop curPtr = do	74	curPtr = ptr `plusPtr` off
92	poke curPtrPtr curPtr	75	poke curPtrPtr curPtr
93	curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr	76	c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
94	curPtrPtr end codepointPtr statePtr	77	state <- peek statePtr
95	state <- peek statePtr	78	lastPtr <- peek curPtrPtr
96	case state of	79	codepoint <- peek codepointPtr
97	UTF8_REJECT ->	80	n <- peek destOffPtr
98	-- We encountered an encoding error	81	chunkText <- mkText dest n
99	if replace	82	let left = lastPtr `minusPtr` curPtr
100	then do	83	remaining = B.drop left bs
101	x <- peek curPtr'	84	accum = if T.null chunkText then B.append old remaining else remaining
102	case onErr desc (Just x) of	85	return $ case state of
103	Nothing -> loop $ curPtr' `plusPtr` 1	86	UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
104	Just c -> do	87	_ -> Some chunkText accum (decodeChunk accum codepoint state)
105	destOff <- peek destOffPtr	88
106	w <- unsafeSTToIO $	89
107	unsafeWrite dest (fromIntegral destOff) (safe c)	90	mkText :: A.MArray s -> CSize -> IO Text
108	poke destOffPtr (destOff + fromIntegral w)	91	mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
109	poke statePtr 0	92	return $! textP arr 0 (fromIntegral n)
110	loop $ curPtr' `plusPtr` 1
111	else do
112	n <- peek destOffPtr
113	chunkText <- unsafeSTToIO $ do
114	arr <- A.unsafeFreeze dest
115	return $! textP arr 0 (fromIntegral n)
116	lastPtr <- peek curPtrPtr
117	let left = lastPtr `minusPtr` curPtr
118	return $ Other chunkText (B.drop left bs)
119	_ -> do
120	-- We encountered the end of the buffer while decoding
121	n <- peek destOffPtr
122	codepoint <- peek codepointPtr
123	chunkText <- unsafeSTToIO $ do
124	arr <- A.unsafeFreeze dest
125	return $! textP arr 0 (fromIntegral n)
126	lastPtr <- peek curPtrPtr
127	let left = lastPtr `minusPtr` curPtr
128	return $ Some chunkText (B.drop left bs)
129	(decodeChunk codepoint state)
130	in loop (ptr `plusPtr` off)
131	desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
132		93
133	ord :: Char -> Int	94	ord :: Char -> Int
134	ord (C# c#) = I# (ord# c#)	95	ord (C# c#) = I# (ord# c#)
135	{-# INLINE ord #-}	96	{-# INLINE ord #-}
136		97
137
138	unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int	98	unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
139	unsafeWrite marr i c	99	unsafeWrite marr i c
140	\| n < 0x10000 = do	100	\| n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
141	A.unsafeWrite marr i (fromIntegral n)	101	return 1
142	return 1	102	\| otherwise = do A.unsafeWrite marr i lo
143	\| otherwise = do	103	A.unsafeWrite marr (i+1) hi
144	A.unsafeWrite marr i lo	104	return 2
145	A.unsafeWrite marr (i+1) hi
146	return 2
147	where n = ord c	105	where n = ord c
148	m = n - 0x10000	106	m = n - 0x10000
149	lo = fromIntegral $ (m `shiftR` 10) + 0xD800	107	lo = fromIntegral $ (m `shiftR` 10) + 0xD800


diff --git a/test/Test.hs b/test/Test.hs index 1579f2b..66351d1 100644 --- a/test/Test.hs +++ b/test/Test.hs
@@ -8,6 +8,7 @@ import Test.Framework.Providers.QuickCheck2 (testProperty)
8	import Control.Exception (catch)	8	import Control.Exception (catch)
9	import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord)	9	import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord)
10	import Data.Monoid (Monoid(..))	10	import Data.Monoid (Monoid(..))
		11	import Control.Monad
11	import Data.String (fromString)	12	import Data.String (fromString)
12	import Data.Text.Encoding.Error	13	import Data.Text.Encoding.Error
13	import qualified Data.List as L	14	import qualified Data.List as L
@@ -19,15 +20,20 @@ import qualified Data.Text as T
19	import qualified Data.Text.Lazy as TL	20	import qualified Data.Text.Lazy as TL
20	import qualified Data.Text.Encoding as E	21	import qualified Data.Text.Encoding as E
21	import qualified Pipes.Text.Internal as PE	22	import qualified Pipes.Text.Internal as PE
		23	import qualified Pipes.Text as TP
		24	import qualified Pipes.ByteString as BP
		25	import qualified Pipes as P
22		26
		27
		28	import Debug.Trace
23	main :: IO ()	29	main :: IO ()
24	main = defaultMain [tests]	30	main = defaultMain [tests]
25	-- >>> :main -a 10000	31	-- >>> :main -a 10000
26		32
27	tests = testGroup "stream_decode" [	33	tests = testGroup "stream_decode" [
28		34	-- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
29	testProperty "t_utf8_incr_valid" t_utf8_incr_valid,	35	testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed,
30	testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed]	36	testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe]
31		37
32	t_utf8_incr_valid = do	38	t_utf8_incr_valid = do
33	Positive n <- arbitrary	39	Positive n <- arbitrary
@@ -43,18 +49,46 @@ t_utf8_incr_valid = do
43	PE.Some t _ f' -> t : feedChunksOf n f' b	49	PE.Some t _ f' -> t : feedChunksOf n f' b
44	_ -> []	50	_ -> []
45		51
46	t_utf8_incr_mixed = do	52	t_utf8_incr_mixed = do
47	Positive n <- arbitrary	53	Positive n <- arbitrary
48	txt <- genUnicode	54	txt <- genUnicode
49	forAll (vector 9) $ (roundtrip . chunk (mod n 7 + 1) . appendBytes txt) `eq` appendBytes txt	55	let chunkSize = mod n 7 + 1
		56	forAll (vector 9) $
		57	(roundtrip . chunk chunkSize . appendBytes txt) `eq` (appendBytes txt)
50	where	58	where
51	roundtrip :: [B.ByteString] -> B.ByteString	59	roundtrip :: [B.ByteString] -> B.ByteString
52	roundtrip bss = go (PE.streamDecodeUtf8With Nothing) B.empty B.empty bss where	60	roundtrip bss = go PE.streamDecodeUtf8 B.empty bss where
53	go dec acc old [] = acc <> old	61	go dec acc [] = acc
54	go dec acc old (bs:bss) = case dec bs of	62	go dec acc [bs] = case dec bs of
55	PE.Some t new dec' -> if T.null t then go dec' (acc <> E.encodeUtf8 t) (old <> new) bss	63	PE.Some t l dec' -> acc <> E.encodeUtf8 t <> l
56	else go dec' (acc <> E.encodeUtf8 t) new bss	64	PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs'
57	PE.Other t bs' -> if T.null t then acc <> old <> bs <> B.concat bss	65	go dec acc (bs:bss) = case dec bs of
58	else acc <> E.encodeUtf8 t <> bs' <> B.concat bss	66	PE.Some t l dec' -> go dec' (acc <> E.encodeUtf8 t) bss
		67	PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs' <> B.concat bss
		68	chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
		69	appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
		70
		71
		72
		73
		74	t_utf8_incr_pipe = do
		75	Positive m <- arbitrary
		76	Positive n <- arbitrary
		77	txt <- genUnicode
		78	let chunkSize = mod n 7 + 1
		79	bytesLength = mod 20 m
		80	forAll (vector bytesLength) $
		81	(BL.toStrict . BP.toLazy . roundtrip . P.each . chunk chunkSize . appendBytes txt)
		82	`eq`
		83	appendBytes txt
		84	where
		85	roundtrip :: Monad m => P.Producer B.ByteString m r -> P.Producer B.ByteString m r
		86	roundtrip p = do pbs <- TP.decodeUtf8 p P.>-> TP.encodeUtf8
		87	pbs
59	chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b	88	chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
60	appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append	89	appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
		90
		91
		92
		93
		94