From: michaelt Date: Sun, 9 Nov 2014 03:34:33 +0000 (-0500) Subject: tutorial X-Git-Url: https://git.immae.eu/?a=commitdiff_plain;h=04f81dff0ebebd3df971ee8b456dd3328c65ff84;hp=e158530cf869b1921f25b58c2614b121b4f156d7;p=github%2Ffretlink%2Ftext-pipes.git tutorial --- diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 45b9299..7722f7f 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs @@ -1,24 +1,11 @@ {-# LANGUAGE RankNTypes, TypeFamilies, BangPatterns, Trustworthy #-} -module Pipes.Text ( - -- * Effectful Text - -- $intro - - -- * Lenses - -- $lenses - - -- ** @view@ \/ @(^.)@ - -- $view - - -- ** @over@ \/ @(%~)@ - -- $over - - -- ** @zoom@ - -- $zoom - - -- * Special types: @Producer Text m (Producer Text m r)@ and @FreeT (Producer Text m) m r@ - -- $special +{-| The module @Pipes.Text@ closely follows @Pipes.ByteString@ from + the @pipes-bytestring@ package. A draft tutorial can be found in + @Pipes.Text.Tutorial@. +-} +module Pipes.Text ( -- * Producers fromLazy @@ -141,274 +128,6 @@ import Prelude hiding ( words, writeFile ) -{- $intro - This package provides @pipes@ utilities for /text streams/ or /character streams/, - realized as streams of 'Text' chunks. The individual chunks are uniformly /strict/, - and thus you will generally want @Data.Text@ in scope. But the type - @Producer Text m r@ ,as we are using it, is a sort of /pipes/ equivalent of the lazy @Text@ type. - - This particular module provides many functions equivalent in one way or another to - the pure functions in - . - They transform, divide, group and fold text streams. Though @Producer Text m r@ - is the type of \'effectful Text\', the functions in this module are \'pure\' - in the sense that they are uniformly monad-independent. - Simple /IO/ operations are defined in @Pipes.Text.IO@ -- as lazy IO @Text@ - operations are in @Data.Text.Lazy.IO@. Inter-operation with @ByteString@ - is provided in @Pipes.Text.Encoding@, which parallels @Data.Text.Lazy.Encoding@. - - The Text type exported by @Data.Text.Lazy@ is basically that of a lazy list of - strict Text: the implementation is arranged so that the individual strict 'Text' - chunks are kept to a reasonable size; the user is not aware of the divisions - between the connected 'Text' chunks. - So also here: the functions in this module are designed to operate on streams that - are insensitive to text boundaries. This means that they may freely split - text into smaller texts and /discard empty texts/. The objective, though, is - that they should /never concatenate texts/ in order to provide strict upper - bounds on memory usage. - - For example, to stream only the first three lines of 'stdin' to 'stdout' you - might write: - -> import Pipes -> import qualified Pipes.Text as Text -> import qualified Pipes.Text.IO as Text -> import Pipes.Group (takes') -> import Lens.Family -> -> main = runEffect $ takeLines 3 Text.stdin >-> Text.stdout -> where -> takeLines n = Text.unlines . takes' n . view Text.lines - - The above program will never bring more than one chunk of text (~ 32 KB) into - memory, no matter how long the lines are. - --} -{- $lenses - As this example shows, one superficial difference from @Data.Text.Lazy@ - is that many of the operations, like 'lines', are \'lensified\'; this has a - number of advantages (where it is possible); in particular it facilitates their - use with 'Parser's of Text (in the general - sense.) The disadvantage, famously, is that the messages you get for type errors can be - a little alarming. The remarks that follow in this section are for non-lens adepts. - - Each lens exported here, e.g. 'lines', 'chunksOf' or 'splitAt', reduces to the - intuitively corresponding function when used with @view@ or @(^.)@. Instead of - writing: - - > splitAt 17 producer - - as we would with the Prelude or Text functions, we write - - > view (splitAt 17) producer - - or equivalently - - > producer ^. splitAt 17 - - This may seem a little indirect, but note that many equivalents of - @Text -> Text@ functions are exported here as 'Pipe's. Here too we recover the intuitively - corresponding functions by prefixing them with @(>->)@. Thus something like - -> stripLines = Text.unlines . Group.maps (>-> Text.stripStart) . view Text.lines - - would drop the leading white space from each line. - - The lenses in this library are marked as /improper/; this just means that - they don't admit all the operations of an ideal lens, but only /getting/ and /focusing/. - Just for this reason, though, the magnificent complexities of the lens libraries - are a distraction. The lens combinators to keep in mind, the ones that make sense for - our lenses, are @view@ \/ @(^.)@), @over@ \/ @(%~)@ , and @zoom@. - - One need only keep in mind that if @l@ is a @Lens' a b@, then: - --} -{- $view - @view l@ is a function @a -> b@ . Thus @view l a@ (also written @a ^. l@ ) - is the corresponding @b@; as was said above, this function will be exactly the - function you think it is, given its name. Thus to uppercase the first n characters - of a Producer, leaving the rest the same, we could write: - - - > upper n p = do p' <- p ^. Text.splitAt n >-> Text.toUpper - > p' --} -{- $over - @over l@ is a function @(b -> b) -> a -> a@. Thus, given a function that modifies - @b@s, the lens lets us modify an @a@ by applying @f :: b -> b@ to - the @b@ that we can \"see\" through the lens. So @over l f :: a -> a@ - (it can also be written @l %~ f@). - For any particular @a@, then, @over l f a@ or @(l %~ f) a@ is a revised @a@. - So above we might have written things like these: - - > stripLines = Text.lines %~ maps (>-> Text.stripStart) - > stripLines = over Text.lines (maps (>-> Text.stripStart)) - > upper n = Text.splitAt n %~ (>-> Text.toUpper) - --} -{- $zoom - @zoom l@, finally, is a function from a @Parser b m r@ - to a @Parser a m r@ (or more generally a @StateT (Producer b m x) m r@). - Its use is easiest to see with an decoding lens like 'utf8', which - \"sees\" a Text producer hidden inside a ByteString producer: - @drawChar@ is a Text parser, returning a @Maybe Char@, @zoom utf8 drawChar@ is - a /ByteString/ parser, returning a @Maybe Char@. @drawAll@ is a Parser that returns - a list of everything produced from a Producer, leaving only the return value; it would - usually be unreasonable to use it. But @zoom (splitAt 17) drawAll@ - returns a list of Text chunks containing the first seventeen Chars, and returns the rest of - the Text Producer for further parsing. Suppose that we want, inexplicably, to - modify the casing of a Text Producer according to any instruction it might - contain at the start. Then we might write something like this: - -> obey :: Monad m => Producer Text m b -> Producer Text m b -> obey p = do (ts, p') <- lift $ runStateT (zoom (Text.splitAt 7) drawAll) p -> let seven = T.concat ts -> case T.toUpper seven of -> "TOUPPER" -> p' >-> Text.toUpper -> "TOLOWER" -> p' >-> Text.toLower -> _ -> do yield seven -> p' - - -> >>> let doc = each ["toU","pperTh","is document.\n"] -> >>> runEffect $ obey doc >-> Text.stdout -> THIS DOCUMENT. - - The purpose of exporting lenses is the mental economy achieved with this three-way - applicability. That one expression, e.g. @lines@ or @splitAt 17@ can have these - three uses is no more surprising than that a pipe can act as a function modifying - the output of a producer, namely by using @>->@ to its left: @producer >-> pipe@ - -- but can /also/ modify the inputs to a consumer by using @>->@ to its right: - @pipe >-> consumer@ - - The three functions, @view@ \/ @(^.)@, @over@ \/ @(%~)@ and @zoom@ are supplied by - both and - The use of 'zoom' is explained - in - and to some extent in the @Pipes.Text.Encoding@ module here. - --} -{- $special - These simple 'lines' examples reveal a more important difference from @Data.Text.Lazy@ . - This is in the types that are most closely associated with our central text type, - @Producer Text m r@. In @Data.Text@ and @Data.Text.Lazy@ we find functions like - -> splitAt :: Int -> Text -> (Text, Text) -> lines :: Text -> [Text] -> chunksOf :: Int -> Text -> [Text] - - which relate a Text with a pair of Texts or a list of Texts. - The corresponding functions here (taking account of \'lensification\') are - -> view . splitAt :: (Monad m, Integral n) => n -> Producer Text m r -> Producer Text m (Producer Text m r) -> view lines :: Monad m => Producer Text m r -> FreeT (Producer Text m) m r -> view . chunksOf :: (Monad m, Integral n) => n -> Producer Text m r -> FreeT (Producer Text m) m r - - Some of the types may be more readable if you imagine that we have introduced - our own type synonyms - -> type Text m r = Producer T.Text m r -> type Texts m r = FreeT (Producer T.Text m) m r - - Then we would think of the types above as - -> view . splitAt :: (Monad m, Integral n) => n -> Text m r -> Text m (Text m r) -> view lines :: (Monad m) => Text m r -> Texts m r -> view . chunksOf :: (Monad m, Integral n) => n -> Text m r -> Texts m r - - which brings one closer to the types of the similar functions in @Data.Text.Lazy@ - - In the type @Producer Text m (Producer Text m r)@ the second - element of the \'pair\' of effectful Texts cannot simply be retrieved - with something like 'snd'. This is an \'effectful\' pair, and one must work - through the effects of the first element to arrive at the second Text stream, even - if you are proposing to throw the Text in the first element away. - Note that we use Control.Monad.join to fuse the pair back together, since it specializes to - -> join :: Monad m => Producer Text m (Producer m r) -> Producer m r - - The return type of 'lines', 'words', 'chunksOf' and the other /splitter/ functions, - @FreeT (Producer m Text) m r@ -- our @Texts m r@ -- is the type of (effectful) - lists of (effectful) texts. The type @([Text],r)@ might be seen to gather - together things of the forms: - -> r -> (Text,r) -> (Text, (Text, r)) -> (Text, (Text, (Text, r))) -> (Text, (Text, (Text, (Text, r)))) -> ... - - (We might also have identified the sum of those types with @Free ((,) Text) r@ - -- or, more absurdly, @FreeT ((,) Text) Identity r@.) - - Similarly, our type @Texts m r@, or @FreeT (Text m) m r@ -- in fact called - @FreeT (Producer Text m) m r@ here -- encompasses all the members of the sequence: - -> m r -> Text m r -> Text m (Text m r) -> Text m (Text m (Text m r)) -> Text m (Text m (Text m (Text m r))) -> ... - - We might have used a more specialized type in place of @FreeT (Producer a m) m r@, - or indeed of @FreeT (Producer Text m) m r@, but it is clear that the correct - result type of 'lines' will be isomorphic to @FreeT (Producer Text m) m r@ . - - One might think that - -> lines :: Monad m => Lens' (Producer Text m r) (FreeT (Producer Text m) m r) -> view . lines :: Monad m => Producer Text m r -> FreeT (Producer Text m) m r - - should really have the type - -> lines :: Monad m => Pipe Text Text m r - - as e.g. 'toUpper' does. But this would spoil the control we are - attempting to maintain over the size of chunks. It is in fact just - as unreasonable to want such a pipe as to want - -> Data.Text.Lazy.lines :: Text -> Text - - to 'rechunk' the strict Text chunks inside the lazy Text to respect - line boundaries. In fact we have - -> Data.Text.Lazy.lines :: Text -> [Text] -> Prelude.lines :: String -> [String] - - where the elements of the list are themselves lazy Texts or Strings; the use - of @FreeT (Producer Text m) m r@ is simply the 'effectful' version of this. - - The @Pipes.Group@ module, which can generally be imported without qualification, - provides many functions for working with things of type @FreeT (Producer a m) m r@. - In particular it conveniently exports the constructors for @FreeT@ and the associated - @FreeF@ type -- a fancy form of @Either@, namely - -> data FreeF f a b = Pure a | Free (f b) - - for pattern-matching. Consider the implementation of the 'words' function, or - of the part of the lens that takes us to the words; it is compact but exhibits many - of the points under discussion, including explicit handling of the @FreeT@ and @FreeF@ - constuctors. Keep in mind that - -> newtype FreeT f m a = FreeT (m (FreeF f a (FreeT f m a))) -> next :: Monad m => Producer a m r -> m (Either r (a, Producer a m r)) - - Thus the @do@ block after the @FreeT@ constructor is in the base monad, e.g. 'IO' or 'Identity'; - the later subordinate block, opened by the @Free@ constructor, is in the @Producer@ monad: - -> words :: Monad m => Producer Text m r -> FreeT (Producer Text m) m r -> words p = FreeT $ do -- With 'next' we will inspect p's first chunk, excluding spaces; -> x <- next (p >-> dropWhile isSpace) -- note that 'dropWhile isSpace' is a pipe, and is thus *applied* with '>->'. -> return $ case x of -- We use 'return' and so need something of type 'FreeF (Text m) r (Texts m r)' -> Left r -> Pure r -- 'Left' means we got no Text chunk, but only the return value; so we are done. -> Right (txt, p') -> Free $ do -- If we get a chunk and the rest of the producer, p', we enter the 'Producer' monad -> p'' <- view (break isSpace) -- When we apply 'break isSpace', we get a Producer that returns a Producer; -> (yield txt >> p') -- so here we yield everything up to the next space, and get the rest back. -> return (words p'') -- We then carry on with the rest, which is likely to begin with space. - --} -- | Convert a lazy 'TL.Text' into a 'Producer' of strict 'Text's fromLazy :: (Monad m) => TL.Text -> Producer' Text m () diff --git a/Pipes/Text/Tutorial.hs b/Pipes/Text/Tutorial.hs new file mode 100644 index 0000000..07b8751 --- /dev/null +++ b/Pipes/Text/Tutorial.hs @@ -0,0 +1,310 @@ +{-# OPTIONS_GHC -fno-warn-unused-imports #-} + +module Pipes.Text.Tutorial ( + -- * Effectful Text + -- $intro + -- ** @Pipes.Text@ + -- $pipestext + -- ** @Pipes.Text.IO@ + -- $pipestextio + -- ** @Pipes.Text.Encoding@ + -- $pipestextencoding + -- * Lenses + -- $lenses + + -- ** @view@ \/ @(^.)@ + -- $view + + -- ** @over@ \/ @(%~)@ + -- $over + + -- ** @zoom@ + -- $zoom + + -- * Special types: @Producer Text m (Producer Text m r)@ and @FreeT (Producer Text m) m r@ + -- $special + ) where + +import Pipes +import Pipes.Text +import Pipes.Text.IO +import Pipes.Text.Encoding + +{- $intro + This package provides @pipes@ utilities for /character streams/, + realized as streams of 'Text' chunks. The individual chunks are uniformly /strict/, + and thus the @Text@ type we are using is the one from @Data.Text@, not @Data.Text.Lazy@ + But the type @Producer Text m r@, as we are using it, is a sort of /pipes/ equivalent of + the lazy @Text@ type. + + The main @Pipes.Text@ module provides many functions equivalent + in one way or another to the pure functions in + + (and the corresponding @Prelude@ functions for @String@ s): they transform, + divide, group and fold text streams. Though @Producer Text m r@ + is the type of \'effectful Text\', the functions in @Pipes.Text@ are \'pure\' + in the sense that they are uniformly monad-independent. + Simple /IO/ operations are defined in @Pipes.Text.IO@ - as lazy IO @Text@ + operations are in @Data.Text.Lazy.IO@. Similarly, as @Data.Text.Lazy.Encoding@ + handles inter-operation with @Data.ByteString.Lazy@, @Pipes.Text.Encoding@ provides for + interoperation with the \'effectful ByteStrings\' of @Pipes.ByteString@. + + Remember that the @Text@ type exported by @Data.Text.Lazy@ is basically + that of a lazy list of strict @Text@: the implementation is arranged so that + the individual strict 'Text' chunks are kept to a reasonable size; the user + is not aware of the divisions between the connected 'Text' chunks, but uses + operations akin to those for strict text. + So also here: the functions in this module are designed to operate on character streams that + in a way that is independent of the boundaries of the underlying @Text@ chunks. + This means that they may freely split text into smaller texts and /discard empty texts/. + The objective, though, is that they should not /concatenate texts/ in order to provide strict upper + bounds on memory usage. + + For example, to stream only the first three lines of 'stdin' to 'stdout' you + might write: + +> import Pipes +> import qualified Pipes.Text as Text +> import qualified Pipes.Text.IO as Text +> import Pipes.Group (takes') +> import Lens.Family (view) +> +> main = runEffect $ takeLines 3 Text.stdin >-> Text.stdout +> where +> takeLines n = view Text.unlines . takes' n . view Text.lines + + This program will never bring more into memory than what @Text.stdin@ considers + one chunk of text (~ 32 KB), even if individual lines are split across many chunks. + +-} +{- $lenses + As the use of @view@ in this example shows, one superficial difference from @Data.Text.Lazy@ + is that many of the operations, like 'lines', are \'lensified\'; this has a + number of advantages; in particular it facilitates their use with 'Parser's of Text + (in the general + sense.) The remarks that follow in this section are for non-lens adepts. + + Each lens exported here, e.g. 'lines', 'chunksOf' or 'splitAt', reduces to the + intuitively corresponding function when used with @view@ or @(^.)@. Instead of + writing: + + > splitAt 17 producer + + as we would with the Prelude or Text functions, we write + + > view (splitAt 17) producer + + or equivalently + + > producer ^. splitAt 17 + + This may seem a little indirect, but note that many equivalents of + @Text -> Text@ functions are exported here as 'Pipe's. Here too we recover the intuitively + corresponding functions by prefixing them with @(>->)@. Thus something like + +> stripLines = view Text.unlines . Group.maps (>-> Text.stripStart) . view Text.lines + + would drop the leading white space from each line. + + The lenses in this library are marked as /improper/; this just means that + they don't admit all the operations of an ideal lens, but only /getting/ and /focusing/. + Just for this reason, though, the magnificent complexities of the lens libraries + are a distraction. The lens combinators to keep in mind, the ones that make sense for + our lenses, are @view@ \/ @(^.)@), @over@ \/ @(%~)@ , and @zoom@. + + One need only keep in mind that if @l@ is a @Lens' a b@, then: + +-} +{- $view + @view l@ is a function @a -> b@ . Thus @view l a@ (also written @a ^. l@ ) + is the corresponding @b@; as was said above, this function will typically be + the pipes equivalent of the function you think it is, given its name. So for example + + > view (Text.drop) + > view (Text.splitAt 300) :: Producer Text m r -> Producer Text (Producer Text m r) + > Text.stdin ^. splitAt 300 :: Producer Text IO (Producer Text IO r) + + I.e., it produces the first 300 characters, and returns the rest of the producer. + Thus to uppercase the first n characters + of a Producer, leaving the rest the same, we could write: + + + > upper n p = do p' <- p ^. Text.splitAt n >-> Text.toUpper + > p' +-} +{- $over + @over l@ is a function @(b -> b) -> a -> a@. Thus, given a function that modifies + @b@s, the lens lets us modify an @a@ by applying @f :: b -> b@ to + the @b@ that we can \"see\" through the lens. So @over l f :: a -> a@ + (it can also be written @l %~ f@). + For any particular @a@, then, @over l f a@ or @(l %~ f) a@ is a revised @a@. + So above we might have written things like these: + + > stripLines = Text.lines %~ maps (>-> Text.stripStart) + > stripLines = over Text.lines (maps (>-> Text.stripStart)) + > upper n = Text.splitAt n %~ (>-> Text.toUpper) + +-} +{- $zoom + @zoom l@, finally, is a function from a @Parser b m r@ + to a @Parser a m r@ (or more generally a @StateT (Producer b m x) m r@). + Its use is easiest to see with an decoding lens like 'utf8', which + \"sees\" a Text producer hidden inside a ByteString producer: + @drawChar@ is a Text parser, returning a @Maybe Char@, @zoom utf8 drawChar@ is + a /ByteString/ parser, returning a @Maybe Char@. @drawAll@ is a Parser that returns + a list of everything produced from a Producer, leaving only the return value; it would + usually be unreasonable to use it. But @zoom (splitAt 17) drawAll@ + returns a list of Text chunks containing the first seventeen Chars, and returns the rest of + the Text Producer for further parsing. Suppose that we want, inexplicably, to + modify the casing of a Text Producer according to any instruction it might + contain at the start. Then we might write something like this: + +> obey :: Monad m => Producer Text m b -> Producer Text m b +> obey p = do (ts, p') <- lift $ runStateT (zoom (Text.splitAt 7) drawAll) p +> let seven = T.concat ts +> case T.toUpper seven of +> "TOUPPER" -> p' >-> Text.toUpper +> "TOLOWER" -> p' >-> Text.toLower +> _ -> do yield seven +> p' + + +> >>> let doc = each ["toU","pperTh","is document.\n"] +> >>> runEffect $ obey doc >-> Text.stdout +> THIS DOCUMENT. + + The purpose of exporting lenses is the mental economy achieved with this three-way + applicability. That one expression, e.g. @lines@ or @splitAt 17@ can have these + three uses is no more surprising than that a pipe can act as a function modifying + the output of a producer, namely by using @>->@ to its left: @producer >-> pipe@ + -- but can /also/ modify the inputs to a consumer by using @>->@ to its right: + @pipe >-> consumer@ + + The three functions, @view@ \/ @(^.)@, @over@ \/ @(%~)@ and @zoom@ are supplied by + both and + The use of 'zoom' is explained + in + and to some extent in the @Pipes.Text.Encoding@ module here. + +-} +{- $special + These simple 'lines' examples reveal a more important difference from @Data.Text.Lazy@ . + This is in the types that are most closely associated with our central text type, + @Producer Text m r@. In @Data.Text@ and @Data.Text.Lazy@ we find functions like + +> splitAt :: Int -> Text -> (Text, Text) +> lines :: Text -> [Text] +> chunksOf :: Int -> Text -> [Text] + + which relate a Text with a pair of Texts or a list of Texts. + The corresponding functions here (taking account of \'lensification\') are + +> view . splitAt :: (Monad m, Integral n) => n -> Producer Text m r -> Producer Text m (Producer Text m r) +> view lines :: Monad m => Producer Text m r -> FreeT (Producer Text m) m r +> view . chunksOf :: (Monad m, Integral n) => n -> Producer Text m r -> FreeT (Producer Text m) m r + + Some of the types may be more readable if you imagine that we have introduced + our own type synonyms + +> type Text m r = Producer T.Text m r +> type Texts m r = FreeT (Producer T.Text m) m r + + Then we would think of the types above as + +> view . splitAt :: (Monad m, Integral n) => n -> Text m r -> Text m (Text m r) +> view lines :: (Monad m) => Text m r -> Texts m r +> view . chunksOf :: (Monad m, Integral n) => n -> Text m r -> Texts m r + + which brings one closer to the types of the similar functions in @Data.Text.Lazy@ + + In the type @Producer Text m (Producer Text m r)@ the second + element of the \'pair\' of effectful Texts cannot simply be retrieved + with something like 'snd'. This is an \'effectful\' pair, and one must work + through the effects of the first element to arrive at the second Text stream, even + if you are proposing to throw the Text in the first element away. + Note that we use Control.Monad.join to fuse the pair back together, since it specializes to + +> join :: Monad m => Producer Text m (Producer m r) -> Producer m r + + The return type of 'lines', 'words', 'chunksOf' and the other /splitter/ functions, + @FreeT (Producer m Text) m r@ -- our @Texts m r@ -- is the type of (effectful) + lists of (effectful) texts. The type @([Text],r)@ might be seen to gather + together things of the forms: + +> r +> (Text,r) +> (Text, (Text, r)) +> (Text, (Text, (Text, r))) +> (Text, (Text, (Text, (Text, r)))) +> ... + + (We might also have identified the sum of those types with @Free ((,) Text) r@ + -- or, more absurdly, @FreeT ((,) Text) Identity r@.) + + Similarly, our type @Texts m r@, or @FreeT (Text m) m r@ -- in fact called + @FreeT (Producer Text m) m r@ here -- encompasses all the members of the sequence: + +> m r +> Text m r +> Text m (Text m r) +> Text m (Text m (Text m r)) +> Text m (Text m (Text m (Text m r))) +> ... + + We might have used a more specialized type in place of @FreeT (Producer a m) m r@, + or indeed of @FreeT (Producer Text m) m r@, but it is clear that the correct + result type of 'lines' will be isomorphic to @FreeT (Producer Text m) m r@ . + + One might think that + +> lines :: Monad m => Lens' (Producer Text m r) (FreeT (Producer Text m) m r) +> view . lines :: Monad m => Producer Text m r -> FreeT (Producer Text m) m r + + should really have the type + +> lines :: Monad m => Pipe Text Text m r + + as e.g. 'toUpper' does. But this would spoil the control we are + attempting to maintain over the size of chunks. It is in fact just + as unreasonable to want such a pipe as to want + +> Data.Text.Lazy.lines :: Text -> Text + + to 'rechunk' the strict Text chunks inside the lazy Text to respect + line boundaries. In fact we have + +> Data.Text.Lazy.lines :: Text -> [Text] +> Prelude.lines :: String -> [String] + + where the elements of the list are themselves lazy Texts or Strings; the use + of @FreeT (Producer Text m) m r@ is simply the 'effectful' version of this. + + The @Pipes.Group@ module, which can generally be imported without qualification, + provides many functions for working with things of type @FreeT (Producer a m) m r@. + In particular it conveniently exports the constructors for @FreeT@ and the associated + @FreeF@ type -- a fancy form of @Either@, namely + +> data FreeF f a b = Pure a | Free (f b) + + for pattern-matching. Consider the implementation of the 'words' function, or + of the part of the lens that takes us to the words; it is compact but exhibits many + of the points under discussion, including explicit handling of the @FreeT@ and @FreeF@ + constuctors. Keep in mind that + +> newtype FreeT f m a = FreeT (m (FreeF f a (FreeT f m a))) +> next :: Monad m => Producer a m r -> m (Either r (a, Producer a m r)) + + Thus the @do@ block after the @FreeT@ constructor is in the base monad, e.g. 'IO' or 'Identity'; + the later subordinate block, opened by the @Free@ constructor, is in the @Producer@ monad: + +> words :: Monad m => Producer Text m r -> FreeT (Producer Text m) m r +> words p = FreeT $ do -- With 'next' we will inspect p's first chunk, excluding spaces; +> x <- next (p >-> dropWhile isSpace) -- note that 'dropWhile isSpace' is a pipe, and is thus *applied* with '>->'. +> return $ case x of -- We use 'return' and so need something of type 'FreeF (Text m) r (Texts m r)' +> Left r -> Pure r -- 'Left' means we got no Text chunk, but only the return value; so we are done. +> Right (txt, p') -> Free $ do -- If we get a chunk and the rest of the producer, p', we enter the 'Producer' monad +> p'' <- view (break isSpace) -- When we apply 'break isSpace', we get a Producer that returns a Producer; +> (yield txt >> p') -- so here we yield everything up to the next space, and get the rest back. +> return (words p'') -- We then carry on with the rest, which is likely to begin with space. + +-} diff --git a/examples/attoparser.hs b/examples/attoparser.hs new file mode 100644 index 0000000..ddf770d --- /dev/null +++ b/examples/attoparser.hs @@ -0,0 +1,21 @@ +import Pipes +import Pipes.Text.IO (fromHandle) +import Pipes.Attoparsec (parsed) +import qualified System.IO as IO + +data Test = Test { + a :: Int, + b :: Int + } deriving (Show) + +testParser :: Parser Test +testParser = do + a <- decimal + space + b <- decimal + endOfLine + return $ Test a b + +main = IO.withFile "./testfile" IO.ReadMode $ \handle -> runEffect $ + for test_parser (lift . print) + where (parsed (testParser <* endOfLine) (fromHandle handle)) \ No newline at end of file diff --git a/examples/decode.hs b/examples/decode.hs new file mode 100644 index 0000000..8cb44f8 --- /dev/null +++ b/examples/decode.hs @@ -0,0 +1,30 @@ +-- http://www.haskellforall.com/2014/02/pipes-parse-30-lens-based-parsing.html + +import Data.ByteString (ByteString) +import Data.Text (Text) +import Lens.Family.State.Strict (zoom) +import Pipes +import Pipes.Parse +import qualified Pipes.ByteString as ByteString +import qualified Pipes.Text as Text + +-- Retrieve all `Text` chunks up to 10 characters +parser :: Monad m => Parser ByteString m [Text] +parser = zoom (Text.decodeUtf8 . Text.splitAt 10) drawAll + +main = do + (textChunks, leftovers) <- runStateT parser ByteString.stdin + print textChunks + + -- Now print the remaining `ByteString` chunks + byteChunks <- evalStateT drawAll leftovers + print byteChunks +{- +$ ./decode +Hello, 世界!!! +["Hello, \19990\30028!"] +abcdefg + +["!!\n","abcdefg\n"] + +-} \ No newline at end of file diff --git a/examples/lines_url.hs b/examples/lines_url.hs new file mode 100644 index 0000000..b676656 --- /dev/null +++ b/examples/lines_url.hs @@ -0,0 +1,37 @@ +{-# LANGUAGE OverloadedStrings #-} +-- https://gist.github.com/michaelt/88e1fac12876857deefe +-- following +-- https://gist.github.com/gelisam/c769d186493221d7ebbe and associated controversy. + +module Main where + +import Prelude hiding (lines) +import Lens.Family +import Pipes +import Pipes.Group +import Pipes.HTTP +import Pipes.Text +import Pipes.Text.Encoding +import Pipes.Text.IO (toHandle,stdout) +import qualified System.IO as IO +import Data.Functor (void) +import qualified Data.Text as T + +main = do + req <- parseUrl "https://gist.github.com/gelisam/c769d186493221d7ebbe" + -- "http://www.example.com" + -- "http://www.gutenberg.org/files/10/10-h/10-h.htm" + withManager tlsManagerSettings $ \m -> + withHTTP req m $ \resp -> void $ runEffect $ + number_lines_of (responseBody resp ^. utf8 . lines) >-> toHandle IO.stdout + +number_lines_of :: Monad m => FreeT (Producer Text m) m bad -> Producer Text m bad +number_lines_of = number_loop (1 :: Int) where + number_loop n freeProducers = do + freeProducer <- lift $ runFreeT freeProducers + case freeProducer of + Pure badbytes -> do yield $ T.pack "\n" + return badbytes -- these could be inspected ... + Free p -> do yield $ T.pack ("\n" ++ show n ++ " ") + nextFreeProducers <- p + number_loop (n+1) nextFreeProducers diff --git a/examples/zoom.hs b/examples/zoom.hs new file mode 100644 index 0000000..3442dc8 --- /dev/null +++ b/examples/zoom.hs @@ -0,0 +1,152 @@ +-- this file illustrates several uses of `zoom` +-- one of them is quadratic in the length of the file +-- since it has to decode and encode repeatedly, +-- and is thus no good on long files. + +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE BangPatterns#-} +{-# LANGUAGE RankNTypes #-} +import Blaze.ByteString.Builder (Builder, fromByteString, toByteString) +import Control.Exception (Exception) +import Control.Monad.Trans.Class (lift) +import Data.ByteString (ByteString) +import qualified Data.ByteString as S +import qualified Data.ByteString.Lazy as L +import Data.Monoid +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.Encoding as TEE +import qualified Data.Text.Lazy as TL +import qualified Data.Text.Lazy.Encoding as TLE + +import Pipes +import Pipes.Parse +import qualified Pipes.Prelude as PP +import qualified Pipes.ByteString as Bytes +import qualified Pipes.Text as Txt +import Pipes.Text.Encoding (utf8) + +import Control.Lens +import Control.Lens.Internal.Zoom +import Control.Monad +import qualified System.IO as IO +import Control.Monad.Trans.Maybe +import Control.Monad.State.Class + +main :: IO () +main = do -- S.writeFile fp $ contents 10000 -- 10000 cannot be handled fileParser0 and 1 + -- parse_file fileParser0 -- pathological + -- parse_file fileParser1 -- programs + parse_file fileParser2 -- good program + + where + parse_file parser = IO.withBinaryFile fp IO.ReadMode $ \h -> + do p' <- runEffect $ parseWith parser ( Bytes.fromHandle h ) >-> PP.print + runEffect $ p' >-> PP.print + parseWith parser = loop where + loop p = do (m,p') <- lift (runStateT (runMaybeT parser) p) + case m of Nothing -> return p' + Just file -> do yield file + loop p' + fp = "encoded.fileformat" + contents n = (toByteString . mconcat . replicate n . encodeFiles) input + <> S.pack (replicate 10 250) + + + +fileParser0, fileParser1, fileParser2 :: Monad m => MaybeT (StateT (Producer ByteString m x) m) File +fileParser0 = do (name, len) <- zoom utf8 parseText + contents <- zoom (Bytes.splitAt len) (lift drawAll) + return (File name (S.concat contents)) + where + -- this parser aggregates all Text parsing into one preliminary parser + -- which is then applied with `zoom utf8` + -- we cannot tell in advance how long, e.g. the file name will be + parseText :: Monad m => MaybeT (StateT (Producer Text m x) m) (Text, Int) + parseText = do nameLength <- parseNumber + names <- zoom (Txt.splitAt nameLength) $ (lift drawAll) + contentLength <- parseNumber + return $! (T.concat names, contentLength) + +-- here we disaggregate the little Text parsers but still apply them with `zoom utf8` +-- this makes no difference +fileParser1 = do nameLength <- zoom utf8 parseNumber + names <- zoom (utf8 . Txt.splitAt nameLength) (lift drawAll) + contentLength <- zoom utf8 parseNumber + contents <- zoom (Bytes.splitAt contentLength) (lift drawAll) + return (File (T.concat names) (S.concat contents)) + +-- this is the good program; be reflecting on the fact that file names +-- should not be a 1000 bytes long, and binary files longer than e.g. 10 ^ 10 +-- we can restrict the length of the byte stream to which we apply `zoom utf8` +fileParser2 = do nameLength <- Bytes.splitAt 3 ~~> utf8 ~~> parseNumber + names <- Bytes.splitAt nameLength ~~> utf8 ~~> lift drawAll + len <- Bytes.splitAt 10 ~~> utf8 ~~> parseNumber + contents <- Bytes.splitAt len ~~> lift drawAll + return (File (T.concat names) (S.concat contents)) + +-- infix lens nonsense +infixr 1 ~~> +(~~>) :: Zoom m n s t + => ((s -> Zoomed n c s) -> t -> Zoomed n c t) + -> m c -> n c +(~~>) = zoom +{-# INLINE (~~>) #-} + +parseNumber :: Monad m => MaybeT (StateT (Producer Text m x) m) Int +parseNumber = loop 0 where + loop !n = do c <- MaybeT Txt.drawChar + case c of ':' -> return n + _ -> do guard ('0' <= c && c <= '9') + loop $! n * 10 + (fromEnum c - fromEnum '0') + + + +-- --- Michael S's `File` type and its binary encoding, etc. + + +data File = File + { fileName :: !Text + , fileContents :: !ByteString + } + deriving Show + +encodeFile :: File -> Builder +encodeFile (File name contents) = + tellLength (S.length bytesname) <> + fromByteString bytesname <> + tellLength (S.length contents) <> + fromByteString contents + where + tellLength i = fromByteString $ TEE.encodeUtf8 (T.pack (shows i ":")) + bytesname = TEE.encodeUtf8 name + +encodeFiles :: [File] -> Builder +encodeFiles = mconcat . map encodeFile + +input :: [File] +input = + [ File "utf8.txt" $ TEE.encodeUtf8 "This file is in UTF-8" + , File "utf16.txt" $ TEE.encodeUtf16LE "This file is in UTF-16" + , File "binary.dat" "we'll pretend to be binary" + ] + + +--- + +-- This desperate scheme actually has some efficacy, if used before `utf8` in a zoom +-- but not much + +chunk :: Monad m => Int -> Lens' (Producer ByteString m r) (Producer ByteString m r) +chunk n = lens (chunkyN n) (\_ b -> b) where + + chunkyN :: Monad m => Int -> Producer ByteString m r -> Producer ByteString m r + chunkyN n = prod_loop where + + prod_loop p = do mbs <- lift $ next p + case mbs of Left r -> return r + Right (bs, p') -> do bs_loop bs + prod_loop p' + bs_loop bs = unless (S.null bs) $ do yield fore + unless (S.null aft) (bs_loop aft) + where (fore, aft) = S.splitAt n bs diff --git a/pipes-text.cabal b/pipes-text.cabal index 134d992..06e7ff5 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal @@ -52,4 +52,3 @@ library if !flag(noio) exposed-modules: Pipes.Text.IO build-depends: text >=0.11.3 && < 1.2 -