-{-# LANGUAGE RankNTypes, TypeFamilies, CPP #-}
+{-# LANGUAGE RankNTypes, TypeFamilies, NoMonomorphismRestriction #-}
{-| This module provides @pipes@ utilities for \"text streams\", which are
streams of 'Text' chunks. The individual chunks are uniformly @strict@, but
lines,
words,
decodeUtf8,
- decodeUtf8With,
-- * Transformations
intersperse,
{-# INLINABLE fromLazy #-}
-- | Stream text from 'stdin'
-stdin :: MonadIO m => Producer' Text m (Producer ByteString m ())
+stdin :: MonadIO m => Producer Text m (Producer ByteString m ())
stdin = fromHandle IO.stdin
{-# INLINABLE stdin #-}
determined by the good sense of the text library.
-}
-fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ())
--- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)`
--- if only so that mistakes can be concentrated in one place.
--- This modifies something that was faster on an earlier iteration.
--- Note also that the `text` replacement system is being ignored;
--- with a replacement scheme one could have `Producer Text m ()`
--- the relation to the replacement business needs to be thought out.
--- The complicated type seems overmuch for the toy stdin above
-fromHandle h = go PE.streamDecodeUtf8 B.empty where
- act = B.hGetSome h defaultChunkSize
- go dec old = do chunk <- liftIO act
- if B.null chunk
- then if B.null old then return (return ())
- else return (yield old >> return ())
- else case dec chunk of
- PE.Some text bs dec' ->
- if T.null text then go dec' (B.append old bs)
- else do yield text
- go dec' B.empty
- PE.Other text bs ->
- if T.null text then return (do yield old
- yield bs
- PB.fromHandle h)
- else do yield text
- return (do yield bs
- PB.fromHandle h)
+fromHandle :: MonadIO m => IO.Handle -> Producer Text m (Producer ByteString m ())
+fromHandle h = decodeUtf8 (PB.fromHandle h)
{-# INLINE fromHandle#-}
--- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as
--- the dedicated Text IO function 'hGetChunk' ;
--- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut"
--- runs the same as the conduit equivalent, only slightly slower
--- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut"
--- #else
--- fromHandle h = go where
--- go = do txt <- liftIO (T.hGetChunk h)
--- unless (T.null txt) $ do yield txt
--- go
--- {-# INLINABLE fromHandle#-}
--- #endif
+
{-| Stream text from a file using Pipes.Safe
>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
MAIN = PUTSTRLN "HELLO WORLD"
-}
-readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ())
+readFile :: (MonadSafe m) => FilePath -> Producer Text m (Producer ByteString m ())
readFile file = Safe.withFile file IO.ReadMode fromHandle
{-# INLINABLE readFile #-}
-- | Stream text into a file. Uses @pipes-safe@.
-writeFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Consumer' Text m ()
+writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
writeFile file = Safe.withFile file IO.WriteMode toHandle
-- | Apply a transformation to each 'Char' in the stream
-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
-- into a Pipe of Text
-decodeUtf8
- :: Monad m
- => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
-decodeUtf8 = decodeUtf8With Nothing
-{-# INLINEABLE decodeUtf8 #-}
-
--- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
--- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@
--- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\"
-decodeUtf8With
- :: Monad m
- => Maybe TE.OnDecodeError
- -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
-decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where
- go dec old p = do
- x <- lift (next p)
- case x of
- Left r -> if B.null old then return (return r)
- else return (do yield old
- return r)
- Right (chunk, p') ->
- case dec chunk of
- PE.Some text l dec' ->
- if T.null text then go dec' (B.append old l) p'
- else do yield text
- go dec' B.empty p'
- PE.Other text bs ->
- if T.null text then return (do yield old
- yield bs
- p')
- else do yield text
- return (do yield bs
- p')
-{-# INLINEABLE decodeUtf8With #-}
-
+decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf8 = go PE.streamDecodeUtf8 where
+ go dec0 p = do
+ x <- lift (next p)
+ case x of Left r -> return (return r)
+ Right (chunk, p') ->
+ case dec0 chunk of PE.Some text _ dec -> do yield text
+ go dec p'
+ PE.Other text bs -> do yield text
+ return (do yield bs
+ p')
-- | Splits a 'Producer' after the given number of characters
splitAt
{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
UnliftedFFITypes #-}
--- This module lifts material from Brian O'Sullivan's text package
+-- This module lifts assorted materials from Brian O'Sullivan's text package
-- especially Data.Text.Encoding in order to define a pipes-appropriate
-- streamDecodeUtf8
module Pipes.Text.Internal
( Decoding(..)
- , streamDecodeUtf8With
, streamDecodeUtf8
) where
import Data.ByteString as B
import Data.ByteString.Internal as B
import Data.Text ()
+import qualified Data.Text as T
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
import Data.Text.Internal (Text(..), safe, textP)
import Data.Word (Word8, Word32)
newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
--- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
--- encoded text that is known to be valid.
---
--- If the input contains any invalid UTF-8 data, an exception will be
--- thrown (either by this function or a continuation) that cannot be
--- caught in pure code. For more control over the handling of invalid
--- data, use 'streamDecodeUtf8With'.
streamDecodeUtf8 :: ByteString -> Decoding
-streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode)
+streamDecodeUtf8 = decodeChunk B.empty 0 0
--- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
--- encoded text.
-streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding
-streamDecodeUtf8With mErr = case mErr of
- Nothing -> decodeWith False strictDecode
- Just onErr -> decodeWith True onErr
- where
- -- We create a slightly larger than necessary buffer to accommodate a
- -- potential surrogate pair started in the last buffer
- decodeWith replace onErr = decodeChunk 0 0
- where
- decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
- decodeChunk codepoint0 state0 bs@(PS fp off len) =
- runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
+decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
+decodeChunk old codepoint0 state0 bs@(PS fp off len) =
+ runST $ do marray <- A.new (len+1)
+ unsafeIOToST (decodeChunkToBuffer marray)
where
- decodeChunkToBuffer :: A.MArray s -> IO Decoding
- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
- with (0::CSize) $ \destOffPtr ->
- with codepoint0 $ \codepointPtr ->
- with state0 $ \statePtr ->
- with nullPtr $ \curPtrPtr ->
- let end = ptr `plusPtr` (off + len)
- loop curPtr = do
- poke curPtrPtr curPtr
- curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
- curPtrPtr end codepointPtr statePtr
- state <- peek statePtr
- case state of
- UTF8_REJECT ->
- -- We encountered an encoding error
- if replace
- then do
- x <- peek curPtr'
- case onErr desc (Just x) of
- Nothing -> loop $ curPtr' `plusPtr` 1
- Just c -> do
- destOff <- peek destOffPtr
- w <- unsafeSTToIO $
- unsafeWrite dest (fromIntegral destOff) (safe c)
- poke destOffPtr (destOff + fromIntegral w)
- poke statePtr 0
- loop $ curPtr' `plusPtr` 1
- else do
- n <- peek destOffPtr
- chunkText <- unsafeSTToIO $ do
- arr <- A.unsafeFreeze dest
- return $! textP arr 0 (fromIntegral n)
- lastPtr <- peek curPtrPtr
- let left = lastPtr `minusPtr` curPtr
- return $ Other chunkText (B.drop left bs)
- _ -> do
- -- We encountered the end of the buffer while decoding
- n <- peek destOffPtr
- codepoint <- peek codepointPtr
- chunkText <- unsafeSTToIO $ do
- arr <- A.unsafeFreeze dest
- return $! textP arr 0 (fromIntegral n)
- lastPtr <- peek curPtrPtr
- let left = lastPtr `minusPtr` curPtr
- return $ Some chunkText (B.drop left bs)
- (decodeChunk codepoint state)
- in loop (ptr `plusPtr` off)
- desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
+ decodeChunkToBuffer :: A.MArray s -> IO Decoding
+ decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+ with (0::CSize) $ \destOffPtr ->
+ with codepoint0 $ \codepointPtr ->
+ with state0 $ \statePtr ->
+ with nullPtr $ \curPtrPtr ->
+ do let end = ptr `plusPtr` (off + len)
+ curPtr = ptr `plusPtr` off
+ poke curPtrPtr curPtr
+ c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+ state <- peek statePtr
+ lastPtr <- peek curPtrPtr
+ codepoint <- peek codepointPtr
+ n <- peek destOffPtr
+ chunkText <- mkText dest n
+ let left = lastPtr `minusPtr` curPtr
+ remaining = B.drop left bs
+ accum = if T.null chunkText then B.append old remaining else remaining
+ return $ case state of
+ UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
+ _ -> Some chunkText accum (decodeChunk accum codepoint state)
+
+
+mkText :: A.MArray s -> CSize -> IO Text
+mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+ return $! textP arr 0 (fromIntegral n)
ord :: Char -> Int
ord (C# c#) = I# (ord# c#)
{-# INLINE ord #-}
-
unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
unsafeWrite marr i c
- | n < 0x10000 = do
- A.unsafeWrite marr i (fromIntegral n)
- return 1
- | otherwise = do
- A.unsafeWrite marr i lo
- A.unsafeWrite marr (i+1) hi
- return 2
+ | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
+ return 1
+ | otherwise = do A.unsafeWrite marr i lo
+ A.unsafeWrite marr (i+1) hi
+ return 2
where n = ord c
m = n - 0x10000
lo = fromIntegral $ (m `shiftR` 10) + 0xD800
import Control.Exception (catch)
import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord)
import Data.Monoid (Monoid(..))
+import Control.Monad
import Data.String (fromString)
import Data.Text.Encoding.Error
import qualified Data.List as L
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Encoding as E
import qualified Pipes.Text.Internal as PE
+import qualified Pipes.Text as TP
+import qualified Pipes.ByteString as BP
+import qualified Pipes as P
+
+import Debug.Trace
main :: IO ()
main = defaultMain [tests]
-- >>> :main -a 10000
tests = testGroup "stream_decode" [
-
- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
- testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed]
+ -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
+ testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed,
+ testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe]
t_utf8_incr_valid = do
Positive n <- arbitrary
PE.Some t _ f' -> t : feedChunksOf n f' b
_ -> []
-t_utf8_incr_mixed = do
- Positive n <- arbitrary
+t_utf8_incr_mixed = do
+ Positive n <- arbitrary
txt <- genUnicode
- forAll (vector 9) $ (roundtrip . chunk (mod n 7 + 1) . appendBytes txt) `eq` appendBytes txt
+ let chunkSize = mod n 7 + 1
+ forAll (vector 9) $
+ (roundtrip . chunk chunkSize . appendBytes txt) `eq` (appendBytes txt)
where
roundtrip :: [B.ByteString] -> B.ByteString
- roundtrip bss = go (PE.streamDecodeUtf8With Nothing) B.empty B.empty bss where
- go dec acc old [] = acc <> old
- go dec acc old (bs:bss) = case dec bs of
- PE.Some t new dec' -> if T.null t then go dec' (acc <> E.encodeUtf8 t) (old <> new) bss
- else go dec' (acc <> E.encodeUtf8 t) new bss
- PE.Other t bs' -> if T.null t then acc <> old <> bs <> B.concat bss
- else acc <> E.encodeUtf8 t <> bs' <> B.concat bss
+ roundtrip bss = go PE.streamDecodeUtf8 B.empty bss where
+ go dec acc [] = acc
+ go dec acc [bs] = case dec bs of
+ PE.Some t l dec' -> acc <> E.encodeUtf8 t <> l
+ PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs'
+ go dec acc (bs:bss) = case dec bs of
+ PE.Some t l dec' -> go dec' (acc <> E.encodeUtf8 t) bss
+ PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs' <> B.concat bss
+ chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
+ appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
+
+
+
+
+t_utf8_incr_pipe = do
+ Positive m <- arbitrary
+ Positive n <- arbitrary
+ txt <- genUnicode
+ let chunkSize = mod n 7 + 1
+ bytesLength = mod 20 m
+ forAll (vector bytesLength) $
+ (BL.toStrict . BP.toLazy . roundtrip . P.each . chunk chunkSize . appendBytes txt)
+ `eq`
+ appendBytes txt
+ where
+ roundtrip :: Monad m => P.Producer B.ByteString m r -> P.Producer B.ByteString m r
+ roundtrip p = do pbs <- TP.decodeUtf8 p P.>-> TP.encodeUtf8
+ pbs
chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
+
+
+
+
+