From 0ac0c414be4f9f20893112ed8ffa4d9cb6646061 Mon Sep 17 00:00:00 2001 From: michaelt Date: Mon, 17 Feb 2014 21:11:28 -0500 Subject: documentation overhaul continued --- Pipes/Text.hs | 75 +++++++++++------------------- Pipes/Text/Encoding.hs | 122 ++++++++++++++++++++++++++++++++++++++++++------- Pipes/Text/IO.hs | 104 ++++++++++++++++++++++++++++++----------- 3 files changed, 210 insertions(+), 91 deletions(-) diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 9bdacf9..9641256 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs @@ -1,45 +1,31 @@ {-# LANGUAGE RankNTypes, TypeFamilies, BangPatterns, Trustworthy #-} -{-| This module provides @pipes@ utilities for \"text streams\", which are - streams of 'Text' chunks. The individual chunks are uniformly @strict@, but - a 'Producer' can be converted to and from lazy 'Text's, though this is generally - unwise. Where pipes IO replaces lazy IO, 'Producer Text m r' replaces lazy 'Text'. - An 'IO.Handle' can be associated with a 'Producer' or 'Consumer' according as it is read or written to. - - To stream to or from 'IO.Handle's, one can use 'fromHandle' or 'toHandle'. For - example, the following program copies a document from one file to another: - -> import Pipes -> import qualified Pipes.Text as Text -> import qualified Pipes.Text.IO as Text -> import System.IO -> -> main = -> withFile "inFile.txt" ReadMode $ \hIn -> -> withFile "outFile.txt" WriteMode $ \hOut -> -> runEffect $ Text.fromHandle hIn >-> Text.toHandle hOut - -To stream from files, the following is perhaps more Prelude-like (note that it uses Pipes.Safe): - -> import Pipes -> import qualified Pipes.Text as Text -> import qualified Pipes.Text.IO as Text -> import Pipes.Safe -> -> main = runSafeT $ runEffect $ Text.readFile "inFile.txt" >-> Text.writeFile "outFile.txt" - - You can stream to and from 'stdin' and 'stdout' using the predefined 'stdin' - and 'stdout' pipes, as with the following \"echo\" program: - -> main = runEffect $ Text.stdin >-> Text.stdout - - You can also translate pure lazy 'TL.Text's to and from pipes: - -> main = runEffect $ Text.fromLazy (TL.pack "Hello, world!\n") >-> Text.stdout - - In addition, this module provides many functions equivalent to lazy - 'Text' functions so that you can transform or fold text streams. For - example, to stream only the first three lines of 'stdin' to 'stdout' you +{-| This package provides @pipes@ utilities for \"text streams\", which are + streams of 'Text' chunks. The individual chunks are uniformly @strict@, and you + will generally want @Data.Text@ in scope. But the type @Producer Text m r@ is + in many ways the pipes equivalent of lazy @Text@ . + + This module provides many functions equivalent in one way or another to + the 'pure' functions in + . + They transform, divide, group and fold text streams. The functions + in this module are \'pure\' in the sense that they are uniformly monad-independent. + Simple IO operations are defined in + @Pipes.Text.IO@ -- as lazy IO @Text@ operations are in @Data.Text.Lazy.IO@ Interoperation + with @ByteString@ is provided in @Pipes.Text.Encoding@, which parallels @Data.Text.Lazy.Encoding@. + + The Text type exported by @Data.Text.Lazy@ is similar to '[Text]' + where the individual chunks are kept to a reasonable size; the user is not + aware of the divisions between the connected (strict) 'Text' chunks. + Similarly, functions in this module are designed to operate on streams that + are insensitive to text boundaries. This means that they may freely split + text into smaller texts, /discard empty texts/. However, the objective is that they should + /never concatenate texts/ in order to provide strict upper bounds on memory usage. + + One difference from @Data.Text.Lazy@ is that many of the operations are 'lensified'; + this has a number of advantages where it is possible, in particular it facilitate + their use with pipes-style 'Parser's of Text. + For example, to stream only the first three lines of 'stdin' to 'stdout' you might write: > import Pipes @@ -51,13 +37,7 @@ To stream from files, the following is perhaps more Prelude-like (note that it u > takeLines n = Text.unlines . Parse.takeFree n . Text.lines The above program will never bring more than one chunk of text (~ 32 KB) into - memory, no matter how long the lines are. - - Note that functions in this library are designed to operate on streams that - are insensitive to text boundaries. This means that they may freely split - text into smaller texts, /discard empty texts/. However, apart from the - special case of 'concatMap', they will /never concatenate texts/ in order - to provide strict upper bounds on memory usage -- with the single exception of 'concatMap'. + memory, no matter how long the lines are. -} module Pipes.Text ( @@ -97,7 +77,6 @@ module Pipes.Text ( , count -- * Primitive Character Parsers - -- $parse , nextChar , drawChar , unDrawChar diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs index e07c47e..a1a0113 100644 --- a/Pipes/Text/Encoding.hs +++ b/Pipes/Text/Encoding.hs @@ -1,16 +1,17 @@ {-# LANGUAGE RankNTypes, BangPatterns #-} --- | --- This module uses the stream decoding functions from the text-stream-decoding package --- to define decoding functions and lenses. +-- | This module uses the stream decoding functions from Michael Snoyman's new +-- +-- package to define decoding functions and lenses. module Pipes.Text.Encoding ( - -- * Lens type + -- * The Lens or Codec type -- $lenses Codec - -- * Standard lenses for viewing Text in ByteString + -- * Viewing the Text in a ByteString -- $codecs + , decode , utf8 , utf8Pure , utf16LE @@ -18,12 +19,20 @@ module Pipes.Text.Encoding , utf32LE , utf32BE -- * Non-lens decoding functions + -- $decoders , decodeUtf8 , decodeUtf8Pure , decodeUtf16LE , decodeUtf16BE , decodeUtf32LE , decodeUtf32BE + -- * Re-encoding functions + -- $encoders + , encodeUtf8 + , encodeUtf16LE + , encodeUtf16BE + , encodeUtf32LE + , encodeUtf32BE -- * Functions for latin and ascii text -- $ascii , encodeAscii @@ -33,6 +42,7 @@ module Pipes.Text.Encoding ) where +import Data.Functor.Constant (Constant(..)) import Data.Char (ord) import Data.ByteString as B import Data.ByteString (ByteString) @@ -49,16 +59,16 @@ import Pipes type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) {- $lenses - The 'Codec' type is just an aliased standard Prelude type. It just specializes - the @Lens'@ type synonymn used by the standard lens libraries, @lens@ and - @lens-families@ . You use them with - the @view@ or @(^.)@ and @zoom@ functions from those libraries. + The 'Codec' type is a simple specializion of + the @Lens'@ type synonymn used by the standard lens libraries, + and + . That type, - Each codec lens looks into a byte stream that is understood to contain text. - The stream of text it 'sees' in the stream of bytes begins at its head; it ends - by reverting to (returning) the original byte stream - beginning at the point of decoding failure. Where there is no decoding failure, - it returns an empty byte stream with its return value. +> type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) + + is just an alias for an ordinary Prelude type. Thus you use any codec with + the @view@ / @(^.)@ and @zoom@ functions from those libraries. + -} type Codec @@ -66,6 +76,17 @@ type Codec . Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) + +{- | 'decode' is just the ordinary @view@ or @(^.)@ of the lens libraries; + exported here for convience + +> decode utf8 p = decodeUtf8 p = view utf8 p = p ^. utf + +-} + +decode :: ((b -> Constant b b) -> (a -> Constant b a)) -> a -> b +decode codec a = getConstant (codec Constant a) + decodeStream :: Monad m => (B.ByteString -> DecodeResult) @@ -82,7 +103,20 @@ decodeStream = loop where p') {-# INLINABLE decodeStream#-} +{- $decoders + These are functions with the simple type: + +> decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) + + Thus in general + +> decodeUtf8 = view utf8 +> decodeUtf16LE = view utf16LE + and so forth, but these forms + may be more convenient (and give better type errors!) where lenses are + not desired. +-} decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) @@ -109,6 +143,34 @@ decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer decodeUtf32BE = decodeStream streamUtf32BE {-# INLINE decodeUtf32BE #-} + +{- $encoders + These are simply defined + +> encodeUtf8 = yield . TE.encodeUtf8 + + They are intended for use with 'for' + +> for Text.stdin encodeUtf8 :: Producer ByteString IO () + + which would have the effect of + +> Text.stdin >-> Pipes.Prelude.map (TE.encodeUtf8) + + using the encoding functions from Data.Text.Encoding +-} + +encodeUtf8 :: Monad m => Text -> Producer ByteString m () +encodeUtf8 = yield . TE.encodeUtf8 +encodeUtf16LE :: Monad m => Text -> Producer ByteString m () +encodeUtf16LE = yield . TE.encodeUtf16LE +encodeUtf16BE :: Monad m => Text -> Producer ByteString m () +encodeUtf16BE = yield . TE.encodeUtf16BE +encodeUtf32LE :: Monad m => Text -> Producer ByteString m () +encodeUtf32LE = yield . TE.encodeUtf32LE +encodeUtf32BE :: Monad m => Text -> Producer ByteString m () +encodeUtf32BE = yield . TE.encodeUtf32BE + mkCodec :: (forall r m . Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r )) -> (Text -> ByteString) @@ -118,11 +180,39 @@ mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) {- $codecs - The particular \'Codec\' lenses are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc. + Each codec/lens looks into a byte stream that is supposed to contain text. + The particular \'Codec\' lenses are named in accordance with the expected + encoding, 'utf8', 'utf16LE' etc. @view@ / @(^.)@ -- here also called 'decode' -- + turns a Codec into a function: > view utf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) +> decode utf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) > Bytes.stdin ^. utf8 :: Producer Text IO (Producer ByteString IO r) + Uses of a codec with @view@ / @(^.)@ / 'decode' can always be replaced by the specialized + decoding functions exported here, e.g. + +> decodeUtf8 :: Producer ByteString m r -> Producer Text m (Producer ByteString m r) +> decodeUtf8 Byte.stdin :: Producer Text IO (Producer ByteString IO r) + + The stream of text a @Codec@ \'sees\' in the stream of bytes begins at its head. + At any point of decoding failure, the stream of text ends and reverts to (returns) + the original byte stream. Thus if the first bytes are already + un-decodable, the whole ByteString producer will be returned, i.e. + +> view utf8 bytestream + + will just come to the same as + +> return bytestream + + Where there is no decoding failure, the return value of the text stream will be + an empty byte stream followed by its own return value. In all cases you must + deal with the fact that it is a ByteString producer that is returned, even if + it can be thrown away with @Control.Monad.void@ + +> void (Bytes.stdin ^. utf8) :: Producer Text IO () + @zoom@ converts a Text parser into a ByteString parser: > zoom utf8 drawChar :: Monad m => StateT (Producer ByteString m r) m (Maybe Char) @@ -165,7 +255,7 @@ utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE -} --- 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream, +-- | 'encodeAscii' reduces as much of your stream of 'Text' actually is ascii to a byte stream, -- returning the rest of the 'Text' at the first non-ascii 'Char' encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) diff --git a/Pipes/Text/IO.hs b/Pipes/Text/IO.hs index 92500c3..45a1467 100644 --- a/Pipes/Text/IO.hs +++ b/Pipes/Text/IO.hs @@ -1,34 +1,14 @@ {-#LANGUAGE RankNTypes#-} --- | The operations exported here are a convenience, like the similar operations in --- @Data.Text.IO@ , or rather, @Data.Text.Lazy.IO@, since @Producer Text m r@ is --- 'effectful text' and something like the pipes equivalent of lazy Text. --- --- * Like the functions in @Data.Text.IO@, they attempt to work with the system encoding. --- --- * Like the functions in @Data.Text.IO@, they are slower than ByteString operations. Where --- you know what encoding you are working with, use @Pipes.ByteString@ and @Pipes.Text.Encoding@ instead, --- e.g. @view utf8 Bytes.stdin@ instead of @Text.stdin@ --- --- * Like the functions in @Data.Text.IO@ , they use Text exceptions. --- --- Something like --- --- > view utf8 . Bytes.fromHandle :: Handle -> Producer Text IO (Producer ByteString m ()) --- --- yields a stream of Text, and follows --- standard pipes protocols by reverting to (i.e. returning) the underlying byte stream --- upon reaching any decoding error. (See especially the pipes-binary package.) --- --- By contrast, something like --- --- > Text.fromHandle :: Handle -> Producer Text IO () --- --- supplies a stream of text returning '()', which is convenient for many tasks, --- but violates the pipes @pipes-binary@ approach to decoding errors and --- throws an exception of the kind characteristic of the @text@ library instead. + module Pipes.Text.IO ( + -- * Text IO + -- $textio + + -- * Caveats + -- $caveats + -- * Producers fromHandle , stdin @@ -52,6 +32,76 @@ import qualified Pipes.Safe as Safe import Pipes.Safe (MonadSafe(..), Base(..)) import Prelude hiding (readFile, writeFile) +{- $textio + Where pipes IO replaces lazy IO, @Producer Text m r@ replaces lazy 'Text'. + This module exports some convenient functions for producing and consuming + pipes 'Text' in IO, with caveats described below. The main points are as in + + + An 'IO.Handle' can be associated with a 'Producer' or 'Consumer' according as it is read or written to. + + To stream to or from 'IO.Handle's, one can use 'fromHandle' or 'toHandle'. For + example, the following program copies a document from one file to another: + +> import Pipes +> import qualified Pipes.Text as Text +> import qualified Pipes.Text.IO as Text +> import System.IO +> +> main = +> withFile "inFile.txt" ReadMode $ \hIn -> +> withFile "outFile.txt" WriteMode $ \hOut -> +> runEffect $ Text.fromHandle hIn >-> Text.toHandle hOut + +To stream from files, the following is perhaps more Prelude-like (note that it uses Pipes.Safe): + +> import Pipes +> import qualified Pipes.Text as Text +> import qualified Pipes.Text.IO as Text +> import Pipes.Safe +> +> main = runSafeT $ runEffect $ Text.readFile "inFile.txt" >-> Text.writeFile "outFile.txt" + + You can stream to and from 'stdin' and 'stdout' using the predefined 'stdin' + and 'stdout' pipes, as with the following \"echo\" program: + +> main = runEffect $ Text.stdin >-> Text.stdout + +-} + + +{- $caveats + + The operations exported here are a convenience, like the similar operations in + @Data.Text.IO@ (or rather, @Data.Text.Lazy.IO@, since, again, @Producer Text m r@ is + 'effectful text' and something like the pipes equivalent of lazy Text.) + + * Like the functions in @Data.Text.IO@, they attempt to work with the system encoding. + + * Like the functions in @Data.Text.IO@, they are slower than ByteString operations. Where + you know what encoding you are working with, use @Pipes.ByteString@ and @Pipes.Text.Encoding@ instead, + e.g. @view utf8 Bytes.stdin@ instead of @Text.stdin@ + + * Like the functions in @Data.Text.IO@ , they use Text exceptions. + + Something like + +> view utf8 . Bytes.fromHandle :: Handle -> Producer Text IO (Producer ByteString m ()) + + yields a stream of Text, and follows + standard pipes protocols by reverting to (i.e. returning) the underlying byte stream + upon reaching any decoding error. (See especially the pipes-binary package.) + + By contrast, something like + +> Text.fromHandle :: Handle -> Producer Text IO () + + supplies a stream of text returning '()', which is convenient for many tasks, + but violates the pipes @pipes-binary@ approach to decoding errors and + throws an exception of the kind characteristic of the @text@ library instead. + + +-} {-| Convert a 'IO.Handle' into a text stream using a text size determined by the good sense of the text library. Note with the remarks -- cgit v1.2.3