X-Git-Url: https://git.immae.eu/?a=blobdiff_plain;f=Pipes%2FText.hs;h=f2b4aacf5e09892d010ab1273e39b1b2ea3f9bb8;hb=b4d21c025864357dab2b0df07091d953c6da98f3;hp=022855f9a043ddd550ea03bcf57a4e3bef730dc8;hpb=c0343bc936515472b7218f7f8d6d0f4cf4e3d1c1;p=github%2Ffretlink%2Ftext-pipes.git diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 022855f..f2b4aac 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs @@ -2,9 +2,10 @@ {-| This module provides @pipes@ utilities for \"text streams\", which are streams of 'Text' chunks. The individual chunks are uniformly @strict@, but - can interact lazy 'Text's and 'IO.Handle's. + a 'Producer' can be converted to and from lazy 'Text's; an 'IO.Handle' can + be associated with a 'Producer' or 'Consumer' according as it is read or written to. - To stream to or from 'IO.Handle's, use 'fromHandle' or 'toHandle'. For + To stream to or from 'IO.Handle's, one can use 'fromHandle' or 'toHandle'. For example, the following program copies a document from one file to another: > import Pipes @@ -80,8 +81,15 @@ module Pipes.Text ( filter, scan, encodeUtf8, +#if MIN_VERSION_text(0,11,4) + pipeDecodeUtf8, + pipeDecodeUtf8With, +#endif pack, unpack, + toCaseFold, + toLower, + toUpper, stripStart, -- * Folds @@ -98,8 +106,6 @@ module Pipes.Text ( minimum, find, index, --- elemIndex, --- findIndex, count, -- * Splitters @@ -115,6 +121,7 @@ module Pipes.Text ( words, #if MIN_VERSION_text(0,11,4) decodeUtf8, + decodeUtf8With, #endif -- * Transformations intersperse, @@ -140,10 +147,11 @@ module Pipes.Text ( import Control.Exception (throwIO, try) import Control.Monad (liftM, unless) -import Control.Monad.Trans.State.Strict (StateT) +import Control.Monad.Trans.State.Strict (StateT(..)) import qualified Data.Text as T import qualified Data.Text.IO as T import qualified Data.Text.Encoding as TE +import qualified Data.Text.Encoding.Error as TE import Data.Text (Text) import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.IO as TL @@ -151,7 +159,7 @@ import Data.Text.Lazy.Internal (foldrChunks, defaultChunkSize) import Data.ByteString.Unsafe (unsafeTake, unsafeDrop) import Data.ByteString (ByteString) import qualified Data.ByteString as B -import Data.Char (ord) +import Data.Char (ord, isSpace) import Data.Functor.Identity (Identity) import qualified Data.List as List import Foreign.C.Error (Errno(Errno), ePIPE) @@ -169,6 +177,7 @@ import Pipes.Safe (MonadSafe(..), Base(..)) import qualified Pipes.Prelude as P import qualified System.IO as IO import Data.Char (isSpace) +import Data.Word (Word8) import Prelude hiding ( all, any, @@ -203,7 +212,7 @@ fromLazy :: (Monad m) => TL.Text -> Producer' Text m () fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) {-# INLINABLE fromLazy #-} --- | Stream bytes from 'stdin' +-- | Stream text from 'stdin' stdin :: MonadIO m => Producer' Text m () stdin = fromHandle IO.stdin {-# INLINABLE stdin #-} @@ -313,7 +322,8 @@ concatMap f = P.map (T.concatMap f) -- | Transform a Pipe of 'Text' into a Pipe of 'ByteString's using UTF-8 --- encoding +-- encoding; @encodeUtf8 = Pipes.Prelude.map TE.encodeUtf8@ so more complex +-- encoding pipes can easily be constructed with the functions in @Data.Text.Encoding@ encodeUtf8 :: Monad m => Pipe Text ByteString m r encodeUtf8 = P.map TE.encodeUtf8 {-# INLINEABLE encodeUtf8 #-} @@ -486,7 +496,7 @@ null :: (Monad m) => Producer Text m () -> m Bool null = P.all T.null {-# INLINABLE null #-} --- | Count the number of bytes +-- | Count the number of characters in the stream length :: (Monad m, Num n) => Producer Text m () -> m n length = P.fold (\n txt -> n + fromIntegral (T.length txt)) 0 id {-# INLINABLE length #-} @@ -501,7 +511,7 @@ all :: (Monad m) => (Char -> Bool) -> Producer Text m () -> m Bool all predicate = P.all (T.all predicate) {-# INLINABLE all #-} --- | Return the maximum 'Char' within a byte stream +-- | Return the maximum 'Char' within a text stream maximum :: (Monad m) => Producer Text m () -> m (Maybe Char) maximum = P.fold step Nothing id where @@ -513,7 +523,7 @@ maximum = P.fold step Nothing id Just c -> max c (T.maximum txt) {-# INLINABLE maximum #-} --- | Return the minimum 'Char' within a byte stream +-- | Return the minimum 'Char' within a text stream (surely very useful!) minimum :: (Monad m) => Producer Text m () -> m (Maybe Char) minimum = P.fold step Nothing id where @@ -532,26 +542,14 @@ find find predicate p = head (p >-> filter predicate) {-# INLINABLE find #-} --- | Index into a byte stream +-- | Index into a text stream index :: (Monad m, Integral a) => a-> Producer Text m () -> m (Maybe Char) index n p = head (p >-> drop n) {-# INLINABLE index #-} --- | Find the index of an element that matches the given 'Char' --- elemIndex --- :: (Monad m, Num n) => Char -> Producer Text m () -> m (Maybe n) --- elemIndex w8 = findIndex (w8 ==) --- {-# INLINABLE elemIndex #-} - --- | Store the first index of an element that satisfies the predicate --- findIndex --- :: (Monad m, Num n) --- => (Char -> Bool) -> Producer Text m () -> m (Maybe n) --- findIndex predicate p = P.head (p >-> findIndices predicate) --- {-# INLINABLE findIndex #-} --- + -- | Store a tally of how many segments match the given 'Text' count :: (Monad m, Num n) => Text -> Producer Text m () -> m n count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) @@ -578,6 +576,52 @@ decodeUtf8 = go TE.streamDecodeUtf8 yield l p' {-# INLINEABLE decodeUtf8 #-} + +-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded +-- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@ +-- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\" +decodeUtf8With + :: Monad m + => TE.OnDecodeError + -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) + where go dec p = do + x <- lift (next p) + case x of + Left r -> return (return r) + Right (chunk, p') -> do + let TE.Some text l dec' = dec chunk + if B.null l + then do + yield text + go dec' p' + else return $ do + yield l + p' +{-# INLINEABLE decodeUtf8With #-} + +-- | A simple pipe from 'ByteString' to 'Text'; a decoding error will arise +-- with any chunk that contains a sequence of bytes that is unreadable. Otherwise +-- only few bytes will only be moved from one chunk to the next before decoding. +pipeDecodeUtf8 :: Monad m => Pipe ByteString Text m r +pipeDecodeUtf8 = go TE.streamDecodeUtf8 + where go dec = do chunk <- await + case dec chunk of + TE.Some text l dec' -> do yield text + go dec' +{-# INLINEABLE pipeDecodeUtf8 #-} + +-- | A simple pipe from 'ByteString' to 'Text' using a replacement function. +pipeDecodeUtf8With + :: Monad m + => TE.OnDecodeError + -> Pipe ByteString Text m r +pipeDecodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) + where go dec = do chunk <- await + case dec chunk of + TE.Some text l dec' -> do yield text + go dec' +{-# INLINEABLE pipeDecodeUtf8With #-} #endif -- | Splits a 'Producer' after the given number of characters @@ -645,8 +689,8 @@ span predicate = go return (yield suffix >> p') {-# INLINABLE span #-} -{-| Split a byte stream in two, where the first byte stream is the longest - consecutive group of bytes that don't satisfy the predicate +{-| Split a text stream in two, where the first text stream is the longest + consecutive group of characters that don't satisfy the predicate -} break :: (Monad m) @@ -656,7 +700,7 @@ break break predicate = span (not . predicate) {-# INLINABLE break #-} -{-| Split a byte stream into sub-streams delimited by bytes that satisfy the +{-| Split a text stream into sub-streams delimited by characters that satisfy the predicate -} splitWith @@ -693,7 +737,7 @@ split :: (Monad m) split c = splitWith (c ==) {-# INLINABLE split #-} -{-| Group a text stream into 'FreeT'-delimited byte streams using the supplied +{-| Group a text stream into 'FreeT'-delimited text streams using the supplied equality predicate -} groupBy @@ -715,17 +759,13 @@ groupBy equal p0 = PP.FreeT (go p0) return $ PP.FreeT (go p'') {-# INLINABLE groupBy #-} --- | Group a byte stream into 'FreeT'-delimited byte streams of identical bytes +-- | Group a text stream into 'FreeT'-delimited text streams of identical characters group :: (Monad m) => Producer Text m r -> FreeT (Producer Text m) m r group = groupBy (==) {-# INLINABLE group #-} -{-| Split a byte stream into 'FreeT'-delimited lines - - Note: This function is purely for demonstration purposes since it assumes a - particular encoding. You should prefer the 'Data.Text.Text' equivalent of - this function from the upcoming @pipes-text@ library. +{-| Split a text stream into 'FreeT'-delimited lines -} lines :: (Monad m) => Producer Text m r -> FreeT (Producer Text m) m r @@ -741,12 +781,11 @@ lines p0 = PP.FreeT (go0 p0) else return $ PP.Free $ go1 (yield txt >> p') go1 p = do p' <- break ('\n' ==) p - return $ PP.FreeT (go2 p') - go2 p = do - x <- nextChar p - return $ case x of - Left r -> PP.Pure r - Right (_, p') -> PP.Free (go1 p') + return $ PP.FreeT $ do + x <- nextChar p' + case x of + Left r -> return $ PP.Pure r + Right (_, p'') -> go0 p'' {-# INLINABLE lines #-} @@ -754,23 +793,19 @@ lines p0 = PP.FreeT (go0 p0) -- | Split a text stream into 'FreeT'-delimited words words :: (Monad m) => Producer Text m r -> FreeT (Producer Text m) m r -words p0 = removeEmpty (splitWith isSpace p0) +words = go where - removeEmpty f = PP.FreeT $ do - x <- PP.runFreeT f - case x of - PP.Pure r -> return (PP.Pure r) - PP.Free p -> do - y <- next p - case y of - Left f' -> PP.runFreeT (removeEmpty f') - Right (bs, p') -> return $ PP.Free $ do - yield bs - f' <- p' - return (removeEmpty f') + go p = PP.FreeT $ do + x <- next (p >-> dropWhile isSpace) + return $ case x of + Left r -> PP.Pure r + Right (bs, p') -> PP.Free $ do + p'' <- break isSpace (yield bs >> p') + return (go p'') {-# INLINABLE words #-} --- | Intersperse a 'Char' in between the bytes of the byte stream + +-- | Intersperse a 'Char' in between the characters of the text stream intersperse :: (Monad m) => Char -> Producer Text m r -> Producer Text m r intersperse c = go0 @@ -819,7 +854,7 @@ intercalate p0 = go0 go1 f' {-# INLINABLE intercalate #-} -{-| Join 'FreeT'-delimited lines into a byte stream +{-| Join 'FreeT'-delimited lines into a text stream -} unlines :: (Monad m) => FreeT (Producer Text m) m r -> Producer Text m r