X-Git-Url: https://git.immae.eu/?a=blobdiff_plain;f=Pipes%2FText.hs;h=58b9c26d2158fa78eb04871dba25866e62b7fc72;hb=79917d53aa8a1e2c8332e330337f74440859306d;hp=bdd706a8bf5f52e7188f819c5445d37112ec7202;hpb=63ea9ffd3b32d1b4816e5b1e183d942df3d0de33;p=github%2Ffretlink%2Ftext-pipes.git diff --git a/Pipes/Text.hs b/Pipes/Text.hs index bdd706a..58b9c26 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs @@ -1,179 +1,125 @@ -{-# LANGUAGE RankNTypes, TypeFamilies, CPP #-} +{-# LANGUAGE RankNTypes, TypeFamilies, BangPatterns, Trustworthy #-} -{-| This module provides @pipes@ utilities for \"text streams\", which are - streams of 'Text' chunks. The individual chunks are uniformly @strict@, but - a 'Producer' can be converted to and from lazy 'Text's; an 'IO.Handle' can - be associated with a 'Producer' or 'Consumer' according as it is read or written to. - - To stream to or from 'IO.Handle's, one can use 'fromHandle' or 'toHandle'. For - example, the following program copies a document from one file to another: - -> import Pipes -> import qualified Data.Text.Pipes as Text -> import System.IO -> -> main = -> withFile "inFile.txt" ReadMode $ \hIn -> -> withFile "outFile.txt" WriteMode $ \hOut -> -> runEffect $ Text.fromHandle hIn >-> Text.toHandle hOut - -To stream from files, the following is perhaps more Prelude-like (note that it uses Pipes.Safe): - -> import Pipes -> import qualified Data.Text.Pipes as Text -> import Pipes.Safe -> -> main = runSafeT $ runEffect $ Text.readFile "inFile.txt" >-> Text.writeFile "outFile.txt" - - You can stream to and from 'stdin' and 'stdout' using the predefined 'stdin' - and 'stdout' proxies, as with the following \"echo\" program: - -> main = runEffect $ Text.stdin >-> Text.stdout - - You can also translate pure lazy 'TL.Text's to and from proxies: - -> main = runEffect $ Text.fromLazy (TL.pack "Hello, world!\n") >-> Text.stdout - - In addition, this module provides many functions equivalent to lazy - 'Text' functions so that you can transform or fold text streams. For - example, to stream only the first three lines of 'stdin' to 'stdout' you - might write: - -> import Pipes -> import qualified Pipes.Text as Text -> import qualified Pipes.Parse as Parse -> -> main = runEffect $ takeLines 3 Text.stdin >-> Text.stdout -> where -> takeLines n = Text.unlines . Parse.takeFree n . Text.lines - - The above program will never bring more than one chunk of text (~ 32 KB) into - memory, no matter how long the lines are. - - Note that functions in this library are designed to operate on streams that - are insensitive to text boundaries. This means that they may freely split - text into smaller texts and /discard empty texts/. However, they will - /never concatenate texts/ in order to provide strict upper bounds on memory - usage. --} module Pipes.Text ( - -- * Producers - fromLazy, - stdin, - fromHandle, - readFile, - stdinLn, + -- * Effectful Text + -- $intro + + -- * Lenses + -- $lenses + + -- ** @view@ \/ @(^.)@ + -- $view - -- * Consumers - stdout, - stdoutLn, - toHandle, - writeFile, + -- ** @over@ \/ @(%~)@ + -- $over + + -- ** @zoom@ + -- $zoom + + -- * Special types: @Producer Text m (Producer Text m r)@ and @FreeT (Producer Text m) m r@ + -- $special + + -- * Producers + fromLazy -- * Pipes - map, - concatMap, - take, - drop, - takeWhile, - dropWhile, - filter, - scan, - encodeUtf8, - pack, - unpack, - toCaseFold, - toLower, - toUpper, - stripStart, + , map + , concatMap + , take + , drop + , takeWhile + , dropWhile + , filter + , scan + , pack + , unpack + , toCaseFold + , toLower + , toUpper + , stripStart -- * Folds - toLazy, - toLazyM, - fold, - head, - last, - null, - length, - any, - all, - maximum, - minimum, - find, - index, - count, + , toLazy + , toLazyM + , foldChars + , head + , last + , null + , length + , any + , all + , maximum + , minimum + , find + , index + , count + + -- * Primitive Character Parsers + , nextChar + , drawChar + , unDrawChar + , peekChar + , isEndOfChars + + -- * Parsing Lenses + , splitAt + , span + , break + , groupBy + , group + , word + , line + + -- * FreeT Splitters + , chunksOf + , splitsWith + , splits + , groupsBy + , groups + , lines + , words - -- * Splitters - splitAt, - chunksOf, - span, - break, - splitWith, - split, - groupBy, - group, - lines, - words, -#if MIN_VERSION_text(0,11,4) - decodeUtf8, - decodeUtf8With, -#endif -- * Transformations - intersperse, + , intersperse + , packChars -- * Joiners - intercalate, - unlines, - unwords, - - -- * Character Parsers - -- $parse - nextChar, - drawChar, - unDrawChar, - peekChar, - isEndOfChars, + , intercalate + , unlines + , unwords -- * Re-exports -- $reexports - module Data.Text, - module Pipes.Parse + , module Data.ByteString + , module Data.Text + , module Data.Profunctor + , module Pipes.Parse + , module Pipes.Group ) where -import Control.Exception (throwIO, try) -import Control.Monad (liftM, unless) -import Control.Monad.Trans.State.Strict (StateT(..)) +import Control.Applicative ((<*)) +import Control.Monad (liftM, join) +import Control.Monad.Trans.State.Strict (StateT(..), modify) import qualified Data.Text as T -import qualified Data.Text.IO as T -import qualified Data.Text.Encoding as TE -import qualified Data.Text.Encoding.Error as TE import Data.Text (Text) import qualified Data.Text.Lazy as TL -import qualified Data.Text.Lazy.IO as TL -import Data.Text.Lazy.Internal (foldrChunks, defaultChunkSize) -import Data.ByteString.Unsafe (unsafeTake, unsafeDrop) import Data.ByteString (ByteString) -import qualified Data.ByteString as B -import Data.Char (ord) +import Data.Functor.Constant (Constant(Constant, getConstant)) import Data.Functor.Identity (Identity) -import qualified Data.List as List -import Foreign.C.Error (Errno(Errno), ePIPE) -import qualified GHC.IO.Exception as G +import Data.Profunctor (Profunctor) +import qualified Data.Profunctor import Pipes -import qualified Pipes.ByteString.Parse as PBP -import Pipes.Text.Parse ( - nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) -import Pipes.Core (respond, Server') +import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) +import qualified Pipes.Group as PG import qualified Pipes.Parse as PP -import Pipes.Parse (input, concat, FreeT) -import qualified Pipes.Safe.Prelude as Safe -import qualified Pipes.Safe as Safe -import Pipes.Safe (MonadSafe(..), Base(..)) +import Pipes.Parse (Parser) +import Pipes.Text.Encoding (Lens'_, Iso'_) import qualified Pipes.Prelude as P -import qualified System.IO as IO import Data.Char (isSpace) import Data.Word (Word8) +import Foreign.Storable (sizeOf) +import Data.Bits (shiftL) import Prelude hiding ( all, any, @@ -203,152 +149,352 @@ import Prelude hiding ( words, writeFile ) --- | Convert a lazy 'TL.Text' into a 'Producer' of strict 'Text's -fromLazy :: (Monad m) => TL.Text -> Producer' Text m () -fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) -{-# INLINABLE fromLazy #-} +{- $intro + This package provides @pipes@ utilities for /text streams/ or /character streams/, + realized as streams of 'Text' chunks. The individual chunks are uniformly /strict/, + and thus you will generally want @Data.Text@ in scope. But the type + @Producer Text m r@ ,as we are using it, is a sort of /pipes/ equivalent of the lazy @Text@ type. + + This particular module provides many functions equivalent in one way or another to + the pure functions in + . + They transform, divide, group and fold text streams. Though @Producer Text m r@ + is the type of \'effectful Text\', the functions in this module are \'pure\' + in the sense that they are uniformly monad-independent. + Simple /IO/ operations are defined in @Pipes.Text.IO@ -- as lazy IO @Text@ + operations are in @Data.Text.Lazy.IO@. Inter-operation with @ByteString@ + is provided in @Pipes.Text.Encoding@, which parallels @Data.Text.Lazy.Encoding@. + + The Text type exported by @Data.Text.Lazy@ is basically that of a lazy list of + strict Text: the implementation is arranged so that the individual strict 'Text' + chunks are kept to a reasonable size; the user is not aware of the divisions + between the connected 'Text' chunks. + So also here: the functions in this module are designed to operate on streams that + are insensitive to text boundaries. This means that they may freely split + text into smaller texts and /discard empty texts/. The objective, though, is + that they should /never concatenate texts/ in order to provide strict upper + bounds on memory usage. + + For example, to stream only the first three lines of 'stdin' to 'stdout' you + might write: --- | Stream text from 'stdin' -stdin :: MonadIO m => Producer' Text m () -stdin = fromHandle IO.stdin -{-# INLINABLE stdin #-} +> import Pipes +> import qualified Pipes.Text as Text +> import qualified Pipes.Text.IO as Text +> import Pipes.Group (takes') +> import Lens.Family +> +> main = runEffect $ takeLines 3 Text.stdin >-> Text.stdout +> where +> takeLines n = Text.unlines . takes' n . view Text.lines -{-| Convert a 'IO.Handle' into a text stream using a text size - determined by the good sense of the text library. + The above program will never bring more than one chunk of text (~ 32 KB) into + memory, no matter how long the lines are. -} +{- $lenses + As this example shows, one superficial difference from @Data.Text.Lazy@ + is that many of the operations, like 'lines', are \'lensified\'; this has a + number of advantages (where it is possible); in particular it facilitates their + use with 'Parser's of Text (in the general + sense.) The disadvantage, famously, is that the messages you get for type errors can be + a little alarming. The remarks that follow in this section are for non-lens adepts. + + Each lens exported here, e.g. 'lines', 'chunksOf' or 'splitAt', reduces to the + intuitively corresponding function when used with @view@ or @(^.)@. Instead of + writing: + + > splitAt 17 producer + + as we would with the Prelude or Text functions, we write + + > view (splitAt 17) producer + + or equivalently + + > producer ^. splitAt 17 + + This may seem a little indirect, but note that many equivalents of + @Text -> Text@ functions are exported here as 'Pipe's. Here too we recover the intuitively + corresponding functions by prefixing them with @(>->)@. Thus something like -fromHandle :: MonadIO m => IO.Handle -> Producer' Text m () -fromHandle h = go where - go = do txt <- liftIO (T.hGetChunk h) - unless (T.null txt) $ do yield txt - go -{-# INLINABLE fromHandle#-} +> stripLines = Text.unlines . Group.maps (>-> Text.stripStart) . view Text.lines -{-| Stream text from a file using Pipes.Safe + would drop the leading white space from each line. + + The lenses in this library are marked as /improper/; this just means that + they don't admit all the operations of an ideal lens, but only /getting/ and /focusing/. + Just for this reason, though, the magnificent complexities of the lens libraries + are a distraction. The lens combinators to keep in mind, the ones that make sense for + our lenses, are @view@ \/ @(^.)@), @over@ \/ @(%~)@ , and @zoom@. + + One need only keep in mind that if @l@ is a @Lens'_ a b@, then: ->>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout -MAIN = PUTSTRLN "HELLO WORLD" -} +{- $view + @view l@ is a function @a -> b@ . Thus @view l a@ (also written @a ^. l@ ) + is the corresponding @b@; as was said above, this function will be exactly the + function you think it is, given its name. Thus to uppercase the first n characters + of a Producer, leaving the rest the same, we could write: -readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m () -readFile file = Safe.withFile file IO.ReadMode fromHandle -{-# INLINABLE readFile #-} -{-| Stream lines of text from stdin (for testing in ghci etc.) + > upper n p = do p' <- p ^. Text.splitAt n >-> Text.toUpper + > p' +-} +{- $over + @over l@ is a function @(b -> b) -> a -> a@. Thus, given a function that modifies + @b@s, the lens lets us modify an @a@ by applying @f :: b -> b@ to + the @b@ that we can \"see\" through the lens. So @over l f :: a -> a@ + (it can also be written @l %~ f@). + For any particular @a@, then, @over l f a@ or @(l %~ f) a@ is a revised @a@. + So above we might have written things like these: + + > stripLines = Text.lines %~ maps (>-> Text.stripStart) + > stripLines = over Text.lines (maps (>-> Text.stripStart)) + > upper n = Text.splitAt n %~ (>-> Text.toUpper) ->>> let safely = runSafeT . runEffect ->>> safely $ for Text.stdinLn (lift . lift . print . T.length) -hello -5 -world -5 +-} +{- $zoom + @zoom l@, finally, is a function from a @Parser b m r@ + to a @Parser a m r@ (or more generally a @StateT (Producer b m x) m r@). + Its use is easiest to see with an decoding lens like 'utf8', which + \"sees\" a Text producer hidden inside a ByteString producer: + @drawChar@ is a Text parser, returning a @Maybe Char@, @zoom utf8 drawChar@ is + a /ByteString/ parser, returning a @Maybe Char@. @drawAll@ is a Parser that returns + a list of everything produced from a Producer, leaving only the return value; it would + usually be unreasonable to use it. But @zoom (splitAt 17) drawAll@ + returns a list of Text chunks containing the first seventeen Chars, and returns the rest of + the Text Producer for further parsing. Suppose that we want, inexplicably, to + modify the casing of a Text Producer according to any instruction it might + contain at the start. Then we might write something like this: + +> obey :: Monad m => Producer Text m b -> Producer Text m b +> obey p = do (ts, p') <- lift $ runStateT (zoom (Text.splitAt 7) drawAll) p +> let seven = T.concat ts +> case T.toUpper seven of +> "TOUPPER" -> p' >-> Text.toUpper +> "TOLOWER" -> p' >-> Text.toLower +> _ -> do yield seven +> p' + + +> >>> let doc = each ["toU","pperTh","is document.\n"] +> >>> runEffect $ obey doc >-> Text.stdout +> THIS DOCUMENT. + + The purpose of exporting lenses is the mental economy achieved with this three-way + applicability. That one expression, e.g. @lines@ or @splitAt 17@ can have these + three uses is no more surprising than that a pipe can act as a function modifying + the output of a producer, namely by using @>->@ to its left: @producer >-> pipe@ + -- but can /also/ modify the inputs to a consumer by using @>->@ to its right: + @pipe >-> consumer@ + + The three functions, @view@ \/ @(^.)@, @over@ \/ @(%~)@ and @zoom@ are supplied by + both and + The use of 'zoom' is explained + in + and to some extent in the @Pipes.Text.Encoding@ module here. -} -stdinLn :: MonadIO m => Producer' Text m () -stdinLn = go where - go = do - eof <- liftIO (IO.hIsEOF IO.stdin) - unless eof $ do - txt <- liftIO (T.hGetLine IO.stdin) - yield txt - go +{- $special + These simple 'lines' examples reveal a more important difference from @Data.Text.Lazy@ . + This is in the types that are most closely associated with our central text type, + @Producer Text m r@. In @Data.Text@ and @Data.Text.Lazy@ we find functions like +> splitAt :: Int -> Text -> (Text, Text) +> lines :: Text -> [Text] +> chunksOf :: Int -> Text -> [Text] -{-| Stream text to 'stdout' + which relate a Text with a pair of Texts or a list of Texts. + The corresponding functions here (taking account of \'lensification\') are - Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe. +> view . splitAt :: (Monad m, Integral n) => n -> Producer Text m r -> Producer Text m (Producer Text m r) +> view lines :: Monad m => Producer Text m r -> FreeT (Producer Text m) m r +> view . chunksOf :: (Monad m, Integral n) => n -> Producer Text m r -> FreeT (Producer Text m) m r - Note: For best performance, use @(for source (liftIO . putStr))@ instead of - @(source >-> stdout)@ in suitable cases. --} -stdout :: MonadIO m => Consumer' Text m () -stdout = go - where - go = do - txt <- await - x <- liftIO $ try (T.putStr txt) - case x of - Left (G.IOError { G.ioe_type = G.ResourceVanished - , G.ioe_errno = Just ioe }) - | Errno ioe == ePIPE - -> return () - Left e -> liftIO (throwIO e) - Right () -> go -{-# INLINABLE stdout #-} - -stdoutLn :: (MonadIO m) => Consumer' Text m () -stdoutLn = go - where - go = do - str <- await - x <- liftIO $ try (T.putStrLn str) - case x of - Left (G.IOError { G.ioe_type = G.ResourceVanished - , G.ioe_errno = Just ioe }) - | Errno ioe == ePIPE - -> return () - Left e -> liftIO (throwIO e) - Right () -> go -{-# INLINABLE stdoutLn #-} - -{-| Convert a text stream into a 'Handle' - - Note: again, for best performance, where possible use - @(for source (liftIO . hPutStr handle))@ instead of @(source >-> toHandle handle)@. + Some of the types may be more readable if you imagine that we have introduced + our own type synonyms + +> type Text m r = Producer T.Text m r +> type Texts m r = FreeT (Producer T.Text m) m r + + Then we would think of the types above as + +> view . splitAt :: (Monad m, Integral n) => n -> Text m r -> Text m (Text m r) +> view lines :: (Monad m) => Text m r -> Texts m r +> view . chunksOf :: (Monad m, Integral n) => n -> Text m r -> Texts m r + + which brings one closer to the types of the similar functions in @Data.Text.Lazy@ + + In the type @Producer Text m (Producer Text m r)@ the second + element of the \'pair\' of effectful Texts cannot simply be retrieved + with something like 'snd'. This is an \'effectful\' pair, and one must work + through the effects of the first element to arrive at the second Text stream, even + if you are proposing to throw the Text in the first element away. + Note that we use Control.Monad.join to fuse the pair back together, since it specializes to + +> join :: Monad m => Producer Text m (Producer m r) -> Producer m r + + The return type of 'lines', 'words', 'chunksOf' and the other /splitter/ functions, + @FreeT (Producer m Text) m r@ -- our @Texts m r@ -- is the type of (effectful) + lists of (effectful) texts. The type @([Text],r)@ might be seen to gather + together things of the forms: + +> r +> (Text,r) +> (Text, (Text, r)) +> (Text, (Text, (Text, r))) +> (Text, (Text, (Text, (Text, r)))) +> ... + + (We might also have identified the sum of those types with @Free ((,) Text) r@ + -- or, more absurdly, @FreeT ((,) Text) Identity r@.) + + Similarly, our type @Texts m r@, or @FreeT (Text m) m r@ -- in fact called + @FreeT (Producer Text m) m r@ here -- encompasses all the members of the sequence: + +> m r +> Text m r +> Text m (Text m r) +> Text m (Text m (Text m r)) +> Text m (Text m (Text m (Text m r))) +> ... + + We might have used a more specialized type in place of @FreeT (Producer a m) m r@, + or indeed of @FreeT (Producer Text m) m r@, but it is clear that the correct + result type of 'lines' will be isomorphic to @FreeT (Producer Text m) m r@ . + + One might think that + +> lines :: Monad m => Lens'_ (Producer Text m r) (FreeT (Producer Text m) m r) +> view . lines :: Monad m => Producer Text m r -> FreeT (Producer Text m) m r + + should really have the type + +> lines :: Monad m => Pipe Text Text m r + + as e.g. 'toUpper' does. But this would spoil the control we are + attempting to maintain over the size of chunks. It is in fact just + as unreasonable to want such a pipe as to want + +> Data.Text.Lazy.lines :: Text -> Text + + to 'rechunk' the strict Text chunks inside the lazy Text to respect + line boundaries. In fact we have + +> Data.Text.Lazy.lines :: Text -> [Text] +> Prelude.lines :: String -> [String] + + where the elements of the list are themselves lazy Texts or Strings; the use + of @FreeT (Producer Text m) m r@ is simply the 'effectful' version of this. + + The @Pipes.Group@ module, which can generally be imported without qualification, + provides many functions for working with things of type @FreeT (Producer a m) m r@. + In particular it conveniently exports the constructors for @FreeT@ and the associated + @FreeF@ type -- a fancy form of @Either@, namely + +> data FreeF f a b = Pure a | Free (f b) + + for pattern-matching. Consider the implementation of the 'words' function, or + of the part of the lens that takes us to the words; it is compact but exhibits many + of the points under discussion, including explicit handling of the @FreeT@ and @FreeF@ + constuctors. Keep in mind that + +> newtype FreeT f m a = FreeT (m (FreeF f a (FreeT f m a))) +> next :: Monad m => Producer a m r -> m (Either r (a, Producer a m r)) + + Thus the @do@ block after the @FreeT@ constructor is in the base monad, e.g. 'IO' or 'Identity'; + the later subordinate block, opened by the @Free@ constructor, is in the @Producer@ monad: + +> words :: Monad m => Producer Text m r -> FreeT (Producer Text m) m r +> words p = FreeT $ do -- With 'next' we will inspect p's first chunk, excluding spaces; +> x <- next (p >-> dropWhile isSpace) -- note that 'dropWhile isSpace' is a pipe, and is thus *applied* with '>->'. +> return $ case x of -- We use 'return' and so need something of type 'FreeF (Text m) r (Texts m r)' +> Left r -> Pure r -- 'Left' means we got no Text chunk, but only the return value; so we are done. +> Right (txt, p') -> Free $ do -- If we get a chunk and the rest of the producer, p', we enter the 'Producer' monad +> p'' <- view (break isSpace) -- When we apply 'break isSpace', we get a Producer that returns a Producer; +> (yield txt >> p') -- so here we yield everything up to the next space, and get the rest back. +> return (words p'') -- We then carry on with the rest, which is likely to begin with space. + -} -toHandle :: MonadIO m => IO.Handle -> Consumer' Text m r -toHandle h = for cat (liftIO . T.hPutStr h) -{-# INLINABLE toHandle #-} --- | Stream text into a file. Uses @pipes-safe@. -writeFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Consumer' Text m () -writeFile file = Safe.withFile file IO.WriteMode toHandle +-- | Convert a lazy 'TL.Text' into a 'Producer' of strict 'Text's +fromLazy :: (Monad m) => TL.Text -> Producer' Text m () +fromLazy = TL.foldrChunks (\e a -> yield e >> a) (return ()) +{-# INLINE fromLazy #-} + + +(^.) :: a -> ((b -> Constant b b) -> (a -> Constant b a)) -> b +a ^. lens = getConstant (lens Constant a) + -- | Apply a transformation to each 'Char' in the stream map :: (Monad m) => (Char -> Char) -> Pipe Text Text m r map f = P.map (T.map f) {-# INLINABLE map #-} +{-# RULES "p >-> map f" forall p f . + p >-> map f = for p (\txt -> yield (T.map f txt)) + #-} + -- | Map a function over the characters of a text stream and concatenate the results concatMap :: (Monad m) => (Char -> Text) -> Pipe Text Text m r concatMap f = P.map (T.concatMap f) {-# INLINABLE concatMap #-} +{-# RULES "p >-> concatMap f" forall p f . + p >-> concatMap f = for p (\txt -> yield (T.concatMap f txt)) + #-} --- | Transform a Pipe of 'Text' into a Pipe of 'ByteString's using UTF-8 --- encoding -encodeUtf8 :: Monad m => Pipe Text ByteString m r -encodeUtf8 = P.map TE.encodeUtf8 -{-# INLINEABLE encodeUtf8 #-} -- | Transform a Pipe of 'String's into one of 'Text' chunks pack :: Monad m => Pipe String Text m r pack = P.map T.pack {-# INLINEABLE pack #-} --- | Transforma a Pipes of 'Text' chunks into one of 'String's +{-# RULES "p >-> pack" forall p . + p >-> pack = for p (\txt -> yield (T.pack txt)) + #-} + +-- | Transform a Pipes of 'Text' chunks into one of 'String's unpack :: Monad m => Pipe Text String m r -unpack = P.map T.unpack +unpack = for cat (\t -> yield (T.unpack t)) {-# INLINEABLE unpack #-} --- | @toCaseFold@, @toLower@, @toUpper@ and @stripStart@ are standard 'Text' utility, --- here acting on a 'Text' pipe, rather as they would on a lazy text -toCaseFold :: Monad m => Pipe Text Text m () +{-# RULES "p >-> unpack" forall p . + p >-> unpack = for p (\txt -> yield (T.unpack txt)) + #-} + +-- | @toCaseFold@, @toLower@, @toUpper@ and @stripStart@ are standard 'Text' utilities, +-- here acting as 'Text' pipes, rather as they would on a lazy text +toCaseFold :: Monad m => Pipe Text Text m r toCaseFold = P.map T.toCaseFold {-# INLINEABLE toCaseFold #-} +{-# RULES "p >-> toCaseFold" forall p . + p >-> toCaseFold = for p (\txt -> yield (T.toCaseFold txt)) + #-} + + -- | lowercase incoming 'Text' -toLower :: Monad m => Pipe Text Text m () +toLower :: Monad m => Pipe Text Text m r toLower = P.map T.toLower {-# INLINEABLE toLower #-} +{-# RULES "p >-> toLower" forall p . + p >-> toLower = for p (\txt -> yield (T.toLower txt)) + #-} + -- | uppercase incoming 'Text' -toUpper :: Monad m => Pipe Text Text m () +toUpper :: Monad m => Pipe Text Text m r toUpper = P.map T.toUpper {-# INLINEABLE toUpper #-} +{-# RULES "p >-> toUpper" forall p . + p >-> toUpper = for p (\txt -> yield (T.toUpper txt)) + #-} + -- | Remove leading white space from an incoming succession of 'Text's stripStart :: Monad m => Pipe Text Text m r stripStart = do @@ -356,7 +502,8 @@ stripStart = do let text = T.stripStart chunk if T.null text then stripStart - else cat + else do yield text + cat {-# INLINEABLE stripStart #-} -- | @(take n)@ only allows @n@ individual characters to pass; @@ -421,18 +568,23 @@ filter :: (Monad m) => (Char -> Bool) -> Pipe Text Text m r filter predicate = P.map (T.filter predicate) {-# INLINABLE filter #-} - +{-# RULES "p >-> filter q" forall p q . + p >-> filter q = for p (\txt -> yield (T.filter q txt)) + #-} + -- | Strict left scan over the characters scan :: (Monad m) => (Char -> Char -> Char) -> Char -> Pipe Text Text m r -scan step begin = go begin +scan step begin = do + yield (T.singleton begin) + go begin where go c = do txt <- await let txt' = T.scanl step c txt c' = T.last txt' - yield txt' + yield (T.tail txt') go c' {-# INLINABLE scan #-} @@ -455,11 +607,11 @@ toLazyM = liftM TL.fromChunks . P.toListM {-# INLINABLE toLazyM #-} -- | Reduce the text stream using a strict left fold over characters -fold +foldChars :: Monad m => (x -> Char -> x) -> x -> (x -> r) -> Producer Text m () -> m r -fold step begin done = P.fold (T.foldl' step) begin done -{-# INLINABLE fold #-} +foldChars step begin done = P.fold (T.foldl' step) begin done +{-# INLINABLE foldChars #-} -- | Retrieve the first 'Char' head :: (Monad m) => Producer Text m () -> m (Maybe Char) @@ -550,59 +702,92 @@ count :: (Monad m, Num n) => Text -> Producer Text m () -> m n count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) {-# INLINABLE count #-} -#if MIN_VERSION_text(0,11,4) --- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded --- into a Pipe of Text -decodeUtf8 - :: Monad m - => Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeUtf8 = go TE.streamDecodeUtf8 - where go dec p = do - x <- lift (next p) - case x of - Left r -> return (return r) - Right (chunk, p') -> do - let TE.Some text l dec' = dec chunk - if B.null l - then do - yield text - go dec' p' - else return $ do - yield l - p' -{-# INLINEABLE decodeUtf8 #-} - --- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded --- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@ --- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\" -decodeUtf8With - :: Monad m - => TE.OnDecodeError - -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) - where go dec p = do - x <- lift (next p) - case x of - Left r -> return (return r) - Right (chunk, p') -> do - let TE.Some text l dec' = dec chunk - if B.null l - then do - yield text - go dec' p' - else return $ do - yield l - p' -{-# INLINEABLE decodeUtf8With #-} -#endif + +-- | Consume the first character from a stream of 'Text' +-- +-- 'next' either fails with a 'Left' if the 'Producer' has no more characters or +-- succeeds with a 'Right' providing the next character and the remainder of the +-- 'Producer'. + +nextChar + :: (Monad m) + => Producer Text m r + -> m (Either r (Char, Producer Text m r)) +nextChar = go + where + go p = do + x <- next p + case x of + Left r -> return (Left r) + Right (txt, p') -> case (T.uncons txt) of + Nothing -> go p' + Just (c, txt') -> return (Right (c, yield txt' >> p')) +{-# INLINABLE nextChar #-} + +-- | Draw one 'Char' from a stream of 'Text', returning 'Left' if the 'Producer' is empty + +drawChar :: (Monad m) => Parser Text m (Maybe Char) +drawChar = do + x <- PP.draw + case x of + Nothing -> return Nothing + Just txt -> case (T.uncons txt) of + Nothing -> drawChar + Just (c, txt') -> do + PP.unDraw txt' + return (Just c) +{-# INLINABLE drawChar #-} + +-- | Push back a 'Char' onto the underlying 'Producer' +unDrawChar :: (Monad m) => Char -> Parser Text m () +unDrawChar c = modify (yield (T.singleton c) >>) +{-# INLINABLE unDrawChar #-} + +{-| 'peekChar' checks the first 'Char' in the stream, but uses 'unDrawChar' to + push the 'Char' back + +> peekChar = do +> x <- drawChar +> case x of +> Left _ -> return () +> Right c -> unDrawChar c +> return x + +-} + +peekChar :: (Monad m) => Parser Text m (Maybe Char) +peekChar = do + x <- drawChar + case x of + Nothing -> return () + Just c -> unDrawChar c + return x +{-# INLINABLE peekChar #-} + +{-| Check if the underlying 'Producer' has no more characters + + Note that this will skip over empty 'Text' chunks, unlike + 'PP.isEndOfInput' from @pipes-parse@, which would consider + an empty 'Text' a valid bit of input. + +> isEndOfChars = liftM isLeft peekChar +-} +isEndOfChars :: (Monad m) => Parser Text m Bool +isEndOfChars = do + x <- peekChar + return (case x of + Nothing -> True + Just _-> False ) +{-# INLINABLE isEndOfChars #-} + -- | Splits a 'Producer' after the given number of characters splitAt :: (Monad m, Integral n) => n - -> Producer Text m r - -> Producer' Text m (Producer Text m r) -splitAt = go + -> Lens'_ (Producer Text m r) + (Producer Text m (Producer Text m r)) +splitAt n0 k p0 = fmap join (k (go n0 p0)) where go 0 p = return p go n p = do @@ -621,30 +806,17 @@ splitAt = go return (yield suffix >> p') {-# INLINABLE splitAt #-} --- | Split a text stream into 'FreeT'-delimited text streams of fixed size -chunksOf - :: (Monad m, Integral n) - => n -> Producer Text m r -> FreeT (Producer Text m) m r -chunksOf n p0 = PP.FreeT (go p0) - where - go p = do - x <- next p - return $ case x of - Left r -> PP.Pure r - Right (txt, p') -> PP.Free $ do - p'' <- splitAt n (yield txt >> p') - return $ PP.FreeT (go p'') -{-# INLINABLE chunksOf #-} -{-| Split a text stream in two, where the first text stream is the longest - consecutive group of text that satisfy the predicate --} +-- | Split a text stream in two, producing the longest +-- consecutive group of characters that satisfies the predicate +-- and returning the rest + span :: (Monad m) => (Char -> Bool) - -> Producer Text m r - -> Producer' Text m (Producer Text m r) -span predicate = go + -> Lens'_ (Producer Text m r) + (Producer Text m (Producer Text m r)) +span predicate k p0 = fmap join (k (go p0)) where go p = do x <- lift (next p) @@ -661,132 +833,66 @@ span predicate = go return (yield suffix >> p') {-# INLINABLE span #-} -{-| Split a text stream in two, where the first text stream is the longest +{-| Split a text stream in two, producing the longest consecutive group of characters that don't satisfy the predicate -} break :: (Monad m) => (Char -> Bool) - -> Producer Text m r - -> Producer Text m (Producer Text m r) + -> Lens'_ (Producer Text m r) + (Producer Text m (Producer Text m r)) break predicate = span (not . predicate) {-# INLINABLE break #-} -{-| Split a text stream into sub-streams delimited by characters that satisfy the - predicate --} -splitWith - :: (Monad m) - => (Char -> Bool) - -> Producer Text m r - -> PP.FreeT (Producer Text m) m r -splitWith predicate p0 = PP.FreeT (go0 p0) - where - go0 p = do - x <- next p - case x of - Left r -> return (PP.Pure r) - Right (txt, p') -> - if (T.null txt) - then go0 p' - else return $ PP.Free $ do - p'' <- span (not . predicate) (yield txt >> p') - return $ PP.FreeT (go1 p'') - go1 p = do - x <- nextChar p - return $ case x of - Left r -> PP.Pure r - Right (_, p') -> PP.Free $ do - p'' <- span (not . predicate) p' - return $ PP.FreeT (go1 p'') -{-# INLINABLE splitWith #-} - --- | Split a text stream using the given 'Char' as the delimiter -split :: (Monad m) - => Char - -> Producer Text m r - -> FreeT (Producer Text m) m r -split c = splitWith (c ==) -{-# INLINABLE split #-} - -{-| Group a text stream into 'FreeT'-delimited text streams using the supplied - equality predicate +{-| Improper lens that splits after the first group of equivalent Chars, as + defined by the given equivalence relation -} groupBy :: (Monad m) => (Char -> Char -> Bool) - -> Producer Text m r - -> FreeT (Producer Text m) m r -groupBy equal p0 = PP.FreeT (go p0) - where + -> Lens'_ (Producer Text m r) + (Producer Text m (Producer Text m r)) +groupBy equals k p0 = fmap join (k ((go p0))) where go p = do - x <- next p + x <- lift (next p) case x of - Left r -> return (PP.Pure r) - Right (txt, p') -> case (T.uncons txt) of + Left r -> return (return r) + Right (txt, p') -> case T.uncons txt of Nothing -> go p' - Just (c, _) -> do - return $ PP.Free $ do - p'' <- span (equal c) (yield txt >> p') - return $ PP.FreeT (go p'') + Just (c, _) -> (yield txt >> p') ^. span (equals c) {-# INLINABLE groupBy #-} --- | Group a text stream into 'FreeT'-delimited text streams of identical characters -group - :: (Monad m) => Producer Text m r -> FreeT (Producer Text m) m r +-- | Improper lens that splits after the first succession of identical 'Char' s +group :: Monad m + => Lens'_ (Producer Text m r) + (Producer Text m (Producer Text m r)) group = groupBy (==) {-# INLINABLE group #-} -{-| Split a text stream into 'FreeT'-delimited lines +{-| Improper lens that splits a 'Producer' after the first word + + Unlike 'words', this does not drop leading whitespace -} -lines - :: (Monad m) => Producer Text m r -> FreeT (Producer Text m) m r -lines p0 = PP.FreeT (go0 p0) +word :: (Monad m) + => Lens'_ (Producer Text m r) + (Producer Text m (Producer Text m r)) +word k p0 = fmap join (k (to p0)) where - go0 p = do - x <- next p - case x of - Left r -> return (PP.Pure r) - Right (txt, p') -> - if (T.null txt) - then go0 p' - else return $ PP.Free $ go1 (yield txt >> p') - go1 p = do - p' <- break ('\n' ==) p - return $ PP.FreeT (go2 p') - go2 p = do - x <- nextChar p - return $ case x of - Left r -> PP.Pure r - Right (_, p') -> PP.Free (go1 p') -{-# INLINABLE lines #-} + to p = do + p' <- p^.span isSpace + p'^.break isSpace +{-# INLINABLE word #-} +line :: (Monad m) + => Lens'_ (Producer Text m r) + (Producer Text m (Producer Text m r)) +line = break (== '\n') --- | Split a text stream into 'FreeT'-delimited words -words - :: (Monad m) => Producer Text m r -> FreeT (Producer Text m) m r -words p0 = removeEmpty (splitWith isSpace p0) - where - removeEmpty f = PP.FreeT $ do - x <- PP.runFreeT f - case x of - PP.Pure r -> return (PP.Pure r) - PP.Free p -> loop p - loop p = do - y <- next p - case y of - Left f' -> PP.runFreeT (removeEmpty f') - Right (txt, p') -> - if T.null txt - then loop p' - else return $ PP.Free $ do - yield txt - f' <- p' - return (removeEmpty f') -{-# INLINABLE words #-} +{-# INLINABLE line #-} --- | Intersperse a 'Char' in between the characters of the text stream + +-- | Intersperse a 'Char' in between the characters of stream of 'Text' intersperse :: (Monad m) => Char -> Producer Text m r -> Producer Text m r intersperse c = go0 @@ -808,6 +914,158 @@ intersperse c = go0 go1 p' {-# INLINABLE intersperse #-} + + +-- | Improper isomorphism between a 'Producer' of 'ByteString's and 'Word8's +packChars :: Monad m => Iso'_ (Producer Char m x) (Producer Text m x) +packChars = Data.Profunctor.dimap to (fmap from) + where + -- to :: Monad m => Producer Char m x -> Producer Text m x + to p = PG.folds step id done (p^.PG.chunksOf defaultChunkSize) + + step diffAs c = diffAs . (c:) + + done diffAs = T.pack (diffAs []) + + -- from :: Monad m => Producer Text m x -> Producer Char m x + from p = for p (each . T.unpack) + +{-# INLINABLE packChars #-} + +defaultChunkSize :: Int +defaultChunkSize = 16384 - (sizeOf (undefined :: Int) `shiftL` 1) + +-- | Split a text stream into 'FreeT'-delimited text streams of fixed size +chunksOf + :: (Monad m, Integral n) + => n -> Lens'_ (Producer Text m r) + (FreeT (Producer Text m) m r) +chunksOf n k p0 = fmap concats (k (FreeT (go p0))) + where + go p = do + x <- next p + return $ case x of + Left r -> Pure r + Right (txt, p') -> Free $ do + p'' <- (yield txt >> p') ^. splitAt n + return $ FreeT (go p'') +{-# INLINABLE chunksOf #-} + + +{-| Split a text stream into sub-streams delimited by characters that satisfy the + predicate +-} +splitsWith + :: (Monad m) + => (Char -> Bool) + -> Producer Text m r + -> FreeT (Producer Text m) m r +splitsWith predicate p0 = FreeT (go0 p0) + where + go0 p = do + x <- next p + case x of + Left r -> return (Pure r) + Right (txt, p') -> + if (T.null txt) + then go0 p' + else return $ Free $ do + p'' <- (yield txt >> p') ^. span (not . predicate) + return $ FreeT (go1 p'') + go1 p = do + x <- nextChar p + return $ case x of + Left r -> Pure r + Right (_, p') -> Free $ do + p'' <- p' ^. span (not . predicate) + return $ FreeT (go1 p'') +{-# INLINABLE splitsWith #-} + +-- | Split a text stream using the given 'Char' as the delimiter +splits :: (Monad m) + => Char + -> Lens'_ (Producer Text m r) + (FreeT (Producer Text m) m r) +splits c k p = + fmap (PG.intercalates (yield (T.singleton c))) (k (splitsWith (c ==) p)) +{-# INLINABLE splits #-} + +{-| Isomorphism between a stream of 'Text' and groups of equivalent 'Char's , using the + given equivalence relation +-} +groupsBy + :: Monad m + => (Char -> Char -> Bool) + -> Lens'_ (Producer Text m x) (FreeT (Producer Text m) m x) +groupsBy equals k p0 = fmap concats (k (FreeT (go p0))) where + go p = do x <- next p + case x of Left r -> return (Pure r) + Right (bs, p') -> case T.uncons bs of + Nothing -> go p' + Just (c, _) -> do return $ Free $ do + p'' <- (yield bs >> p')^.span (equals c) + return $ FreeT (go p'') +{-# INLINABLE groupsBy #-} + + +-- | Like 'groupsBy', where the equality predicate is ('==') +groups + :: Monad m + => Lens'_ (Producer Text m x) (FreeT (Producer Text m) m x) +groups = groupsBy (==) +{-# INLINABLE groups #-} + + + +{-| Split a text stream into 'FreeT'-delimited lines +-} +lines + :: (Monad m) => Iso'_ (Producer Text m r) (FreeT (Producer Text m) m r) +lines = Data.Profunctor.dimap _lines (fmap _unlines) + where + _lines p0 = FreeT (go0 p0) + where + go0 p = do + x <- next p + case x of + Left r -> return (Pure r) + Right (txt, p') -> + if (T.null txt) + then go0 p' + else return $ Free $ go1 (yield txt >> p') + go1 p = do + p' <- p ^. break ('\n' ==) + return $ FreeT $ do + x <- nextChar p' + case x of + Left r -> return $ Pure r + Right (_, p'') -> go0 p'' + -- _unlines + -- :: Monad m + -- => FreeT (Producer Text m) m x -> Producer Text m x + _unlines = concats . PG.maps (<* yield (T.singleton '\n')) + + +{-# INLINABLE lines #-} + + +-- | Split a text stream into 'FreeT'-delimited words +words + :: (Monad m) => Iso'_ (Producer Text m r) (FreeT (Producer Text m) m r) +words = Data.Profunctor.dimap go (fmap _unwords) + where + go p = FreeT $ do + x <- next (p >-> dropWhile isSpace) + return $ case x of + Left r -> Pure r + Right (bs, p') -> Free $ do + p'' <- (yield bs >> p') ^. break isSpace + return (go p'') + _unwords = PG.intercalates (yield $ T.singleton ' ') + +{-# INLINABLE words #-} + + {-| 'intercalate' concatenates the 'FreeT'-delimited text streams after interspersing a text stream in between them -} @@ -819,17 +1077,17 @@ intercalate intercalate p0 = go0 where go0 f = do - x <- lift (PP.runFreeT f) + x <- lift (runFreeT f) case x of - PP.Pure r -> return r - PP.Free p -> do + Pure r -> return r + Free p -> do f' <- p go1 f' go1 f = do - x <- lift (PP.runFreeT f) + x <- lift (runFreeT f) case x of - PP.Pure r -> return r - PP.Free p -> do + Pure r -> return r + Free p -> do p0 f' <- p go1 f' @@ -842,10 +1100,10 @@ unlines unlines = go where go f = do - x <- lift (PP.runFreeT f) + x <- lift (runFreeT f) case x of - PP.Pure r -> return r - PP.Free p -> do + Pure r -> return r + Free p -> do f' <- p yield $ T.singleton '\n' go f' @@ -855,18 +1113,15 @@ unlines = go -} unwords :: (Monad m) => FreeT (Producer Text m) m r -> Producer Text m r -unwords = intercalate (yield $ T.pack " ") +unwords = intercalate (yield $ T.singleton ' ') {-# INLINABLE unwords #-} -{- $parse - The following parsing utilities are single-character analogs of the ones found - @pipes-parse@. --} {- $reexports - @Pipes.Text.Parse@ re-exports 'nextChar', 'drawChar', 'unDrawChar', 'peekChar', and 'isEndOfChars'. @Data.Text@ re-exports the 'Text' type. - @Pipes.Parse@ re-exports 'input', 'concat', and 'FreeT' (the type). --} \ No newline at end of file + @Pipes.Parse@ re-exports 'input', 'concat', 'FreeT' (the type) and the 'Parse' synonym. +-} + +