X-Git-Url: https://git.immae.eu/?a=blobdiff_plain;f=Pipes%2FText.hs;h=f2b4aacf5e09892d010ab1273e39b1b2ea3f9bb8;hb=b4d21c025864357dab2b0df07091d953c6da98f3;hp=1092491092d25b078b2ef4299b7e70135dbbae49;hpb=acc6868f63bdbede411874f4cfdbbb2d4bfa41da;p=github%2Ffretlink%2Ftext-pipes.git diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 1092491..f2b4aac 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs @@ -2,9 +2,10 @@ {-| This module provides @pipes@ utilities for \"text streams\", which are streams of 'Text' chunks. The individual chunks are uniformly @strict@, but - can interact lazy 'Text's and 'IO.Handle's. + a 'Producer' can be converted to and from lazy 'Text's; an 'IO.Handle' can + be associated with a 'Producer' or 'Consumer' according as it is read or written to. - To stream to or from 'IO.Handle's, use 'fromHandle' or 'toHandle'. For + To stream to or from 'IO.Handle's, one can use 'fromHandle' or 'toHandle'. For example, the following program copies a document from one file to another: > import Pipes @@ -80,6 +81,10 @@ module Pipes.Text ( filter, scan, encodeUtf8, +#if MIN_VERSION_text(0,11,4) + pipeDecodeUtf8, + pipeDecodeUtf8With, +#endif pack, unpack, toCaseFold, @@ -101,8 +106,6 @@ module Pipes.Text ( minimum, find, index, --- elemIndex, --- findIndex, count, -- * Splitters @@ -118,6 +121,7 @@ module Pipes.Text ( words, #if MIN_VERSION_text(0,11,4) decodeUtf8, + decodeUtf8With, #endif -- * Transformations intersperse, @@ -147,6 +151,7 @@ import Control.Monad.Trans.State.Strict (StateT(..)) import qualified Data.Text as T import qualified Data.Text.IO as T import qualified Data.Text.Encoding as TE +import qualified Data.Text.Encoding.Error as TE import Data.Text (Text) import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.IO as TL @@ -154,7 +159,7 @@ import Data.Text.Lazy.Internal (foldrChunks, defaultChunkSize) import Data.ByteString.Unsafe (unsafeTake, unsafeDrop) import Data.ByteString (ByteString) import qualified Data.ByteString as B -import Data.Char (ord) +import Data.Char (ord, isSpace) import Data.Functor.Identity (Identity) import qualified Data.List as List import Foreign.C.Error (Errno(Errno), ePIPE) @@ -172,6 +177,7 @@ import Pipes.Safe (MonadSafe(..), Base(..)) import qualified Pipes.Prelude as P import qualified System.IO as IO import Data.Char (isSpace) +import Data.Word (Word8) import Prelude hiding ( all, any, @@ -316,7 +322,8 @@ concatMap f = P.map (T.concatMap f) -- | Transform a Pipe of 'Text' into a Pipe of 'ByteString's using UTF-8 --- encoding +-- encoding; @encodeUtf8 = Pipes.Prelude.map TE.encodeUtf8@ so more complex +-- encoding pipes can easily be constructed with the functions in @Data.Text.Encoding@ encodeUtf8 :: Monad m => Pipe Text ByteString m r encodeUtf8 = P.map TE.encodeUtf8 {-# INLINEABLE encodeUtf8 #-} @@ -542,19 +549,7 @@ index index n p = head (p >-> drop n) {-# INLINABLE index #-} --- | Find the index of an element that matches the given 'Char' --- elemIndex --- :: (Monad m, Num n) => Char -> Producer Text m () -> m (Maybe n) --- elemIndex w8 = findIndex (w8 ==) --- {-# INLINABLE elemIndex #-} - --- | Store the first index of an element that satisfies the predicate --- findIndex --- :: (Monad m, Num n) --- => (Char -> Bool) -> Producer Text m () -> m (Maybe n) --- findIndex predicate p = P.head (p >-> findIndices predicate) --- {-# INLINABLE findIndex #-} --- + -- | Store a tally of how many segments match the given 'Text' count :: (Monad m, Num n) => Text -> Producer Text m () -> m n count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) @@ -581,6 +576,52 @@ decodeUtf8 = go TE.streamDecodeUtf8 yield l p' {-# INLINEABLE decodeUtf8 #-} + +-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded +-- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@ +-- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\" +decodeUtf8With + :: Monad m + => TE.OnDecodeError + -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) + where go dec p = do + x <- lift (next p) + case x of + Left r -> return (return r) + Right (chunk, p') -> do + let TE.Some text l dec' = dec chunk + if B.null l + then do + yield text + go dec' p' + else return $ do + yield l + p' +{-# INLINEABLE decodeUtf8With #-} + +-- | A simple pipe from 'ByteString' to 'Text'; a decoding error will arise +-- with any chunk that contains a sequence of bytes that is unreadable. Otherwise +-- only few bytes will only be moved from one chunk to the next before decoding. +pipeDecodeUtf8 :: Monad m => Pipe ByteString Text m r +pipeDecodeUtf8 = go TE.streamDecodeUtf8 + where go dec = do chunk <- await + case dec chunk of + TE.Some text l dec' -> do yield text + go dec' +{-# INLINEABLE pipeDecodeUtf8 #-} + +-- | A simple pipe from 'ByteString' to 'Text' using a replacement function. +pipeDecodeUtf8With + :: Monad m + => TE.OnDecodeError + -> Pipe ByteString Text m r +pipeDecodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) + where go dec = do chunk <- await + case dec chunk of + TE.Some text l dec' -> do yield text + go dec' +{-# INLINEABLE pipeDecodeUtf8With #-} #endif -- | Splits a 'Producer' after the given number of characters @@ -740,12 +781,11 @@ lines p0 = PP.FreeT (go0 p0) else return $ PP.Free $ go1 (yield txt >> p') go1 p = do p' <- break ('\n' ==) p - return $ PP.FreeT (go2 p') - go2 p = do - x <- nextChar p - return $ case x of - Left r -> PP.Pure r - Right (_, p') -> PP.Free (go1 p') + return $ PP.FreeT $ do + x <- nextChar p' + case x of + Left r -> return $ PP.Pure r + Right (_, p'') -> go0 p'' {-# INLINABLE lines #-} @@ -753,26 +793,18 @@ lines p0 = PP.FreeT (go0 p0) -- | Split a text stream into 'FreeT'-delimited words words :: (Monad m) => Producer Text m r -> FreeT (Producer Text m) m r -words p0 = removeEmpty (splitWith isSpace p0) +words = go where - removeEmpty f = PP.FreeT $ do - x <- PP.runFreeT f - case x of - PP.Pure r -> return (PP.Pure r) - PP.Free p -> loop p - loop p = do - y <- next p - case y of - Left f' -> PP.runFreeT (removeEmpty f') - Right (txt, p') -> - if T.null txt - then loop p' - else return $ PP.Free $ do - yield txt - f' <- p' - return (removeEmpty f') + go p = PP.FreeT $ do + x <- next (p >-> dropWhile isSpace) + return $ case x of + Left r -> PP.Pure r + Right (bs, p') -> PP.Free $ do + p'' <- break isSpace (yield bs >> p') + return (go p'') {-# INLINABLE words #-} + -- | Intersperse a 'Char' in between the characters of the text stream intersperse :: (Monad m) => Char -> Producer Text m r -> Producer Text m r