X-Git-Url: https://git.immae.eu/?a=blobdiff_plain;ds=sidebyside;f=Pipes%2FText.hs;h=bbf200f9e2ec8e72b8439744a1c0211fd96529ca;hb=167f880504e05b19d3487c7ba701afa9633a2f41;hp=74576e8e92225985e9a2af2ef13bdf286fbdcad3;hpb=9e9bb0ce4c803486a724b10ad1bc3a76770b5a9f;p=github%2Ffretlink%2Ftext-pipes.git diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 74576e8..bbf200f 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs @@ -66,11 +66,9 @@ module Pipes.Text ( , stdin , fromHandle , readFile - , stdinLn -- * Consumers , stdout - , stdoutLn , toHandle , writeFile @@ -123,19 +121,33 @@ module Pipes.Text ( , group , word , line + + -- * Decoding Lenses , decodeUtf8 - , decode + , codec + + -- * Codecs + , utf8 + , utf16_le + , utf16_be + , utf32_le + , utf32_be + + -- * Other Decoding/Encoding Functions + , decodeIso8859_1 + , decodeAscii + , encodeIso8859_1 + , encodeAscii -- * FreeT Splitters , chunksOf , splitsWith - , split + , splits -- , groupsBy -- , groups , lines , words - -- * Transformations , intersperse , packChars @@ -152,9 +164,12 @@ module Pipes.Text ( , module Data.Profunctor , module Data.Word , module Pipes.Parse + , module Pipes.Group + , module Pipes.Text.Internal ) where import Control.Exception (throwIO, try) +import Control.Applicative ((<*)) import Control.Monad (liftM, unless, join) import Control.Monad.Trans.State.Strict (StateT(..), modify) import Data.Monoid ((<>)) @@ -169,6 +184,7 @@ import Data.Text.Lazy.Internal (foldrChunks, defaultChunkSize) import Data.ByteString.Unsafe (unsafeTake, unsafeDrop) import Data.ByteString (ByteString) import qualified Data.ByteString as B +import qualified Data.ByteString.Char8 as B8 import Data.Char (ord, isSpace) import Data.Functor.Constant (Constant(Constant, getConstant)) import Data.Functor.Identity (Identity) @@ -179,13 +195,13 @@ import Foreign.C.Error (Errno(Errno), ePIPE) import qualified GHC.IO.Exception as G import Pipes import qualified Pipes.ByteString as PB -import qualified Pipes.Text.Internal as PE -import Pipes.Text.Internal (Codec(..)) --- import Pipes.Text.Parse (nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) - +import qualified Pipes.Text.Internal as PI +import Pipes.Text.Internal import Pipes.Core (respond, Server') +import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) +import qualified Pipes.Group as PG import qualified Pipes.Parse as PP -import Pipes.Parse (Parser, concats, intercalates, FreeT(..)) +import Pipes.Parse (Parser) import qualified Pipes.Safe.Prelude as Safe import qualified Pipes.Safe as Safe import Pipes.Safe (MonadSafe(..), Base(..)) @@ -242,8 +258,8 @@ stdin = fromHandle IO.stdin fromHandle :: MonadIO m => IO.Handle -> Producer Text m () fromHandle h = go where go = do txt <- liftIO (T.hGetChunk h) - unless (T.null txt) $ do yield txt - go + unless (T.null txt) ( do yield txt + go ) {-# INLINABLE fromHandle#-} @@ -257,32 +273,13 @@ readFile :: MonadSafe m => FilePath -> Producer Text m () readFile file = Safe.withFile file IO.ReadMode fromHandle {-# INLINE readFile #-} -{-| Stream lines of text from stdin (for testing in ghci etc.) - ->>> let safely = runSafeT . runEffect ->>> safely $ for Text.stdinLn (lift . lift . print . T.length) -hello -5 -world -5 - --} -stdinLn :: MonadIO m => Producer' Text m () -stdinLn = go where - go = do - eof <- liftIO (IO.hIsEOF IO.stdin) - unless eof $ do - txt <- liftIO (T.hGetLine IO.stdin) - yield txt - go -{-# INLINABLE stdinLn #-} {-| Stream text to 'stdout' Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe. - Note: For best performance, use @(for source (liftIO . putStr))@ instead of - @(source >-> stdout)@ in suitable cases. + Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@ + instead of @(source >-> stdout)@ . -} stdout :: MonadIO m => Consumer' Text m () stdout = go @@ -299,20 +296,6 @@ stdout = go Right () -> go {-# INLINABLE stdout #-} -stdoutLn :: (MonadIO m) => Consumer' Text m () -stdoutLn = go - where - go = do - str <- await - x <- liftIO $ try (T.putStrLn str) - case x of - Left (G.IOError { G.ioe_type = G.ResourceVanished - , G.ioe_errno = Just ioe }) - | Errno ioe == ePIPE - -> return () - Left e -> liftIO (throwIO e) - Right () -> go -{-# INLINABLE stdoutLn #-} {-| Convert a text stream into a 'Handle' @@ -390,8 +373,8 @@ unpack = for cat (\t -> yield (T.unpack t)) p >-> unpack = for p (\txt -> yield (T.unpack txt)) #-} --- | @toCaseFold@, @toLower@, @toUpper@ and @stripStart@ are standard 'Text' utility, --- here acting on a 'Text' pipe, rather as they would on a lazy text +-- | @toCaseFold@, @toLower@, @toUpper@ and @stripStart@ are standard 'Text' utilities, +-- here acting as 'Text' pipes, rather as they would on a lazy text toCaseFold :: Monad m => Pipe Text Text m () toCaseFold = P.map T.toCaseFold {-# INLINEABLE toCaseFold #-} @@ -426,7 +409,8 @@ stripStart = do let text = T.stripStart chunk if T.null text then stripStart - else cat + else do yield text + cat {-# INLINEABLE stripStart #-} -- | @(take n)@ only allows @n@ individual characters to pass; @@ -702,27 +686,25 @@ isEndOfChars = do {-# INLINABLE isEndOfChars #-} - - - --- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded into a Pipe of Text --- returning a Pipe of ByteStrings that begins at the point of failure. +{- | An improper lens into a stream of 'ByteString' expected to be UTF-8 encoded; the associated + stream of Text ends by returning a stream of ByteStrings beginning at the point of failure. + -} decodeUtf8 :: Monad m => Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) decodeUtf8 k p0 = fmap (\p -> join (for p (yield . TE.encodeUtf8))) - (k (go B.empty PE.streamDecodeUtf8 p0)) where + (k (go B.empty PI.streamDecodeUtf8 p0)) where go !carry dec0 p = do x <- lift (next p) - case x of Left r -> if B.null carry - then return (return r) -- all bytestrinput was consumed - else return (do yield carry -- a potentially valid fragment remains - return r) + case x of Left r -> return (if B.null carry + then return r -- all bytestring input was consumed + else (do yield carry -- a potentially valid fragment remains + return r)) Right (chunk, p') -> case dec0 chunk of - PE.Some text carry2 dec -> do yield text + PI.Some text carry2 dec -> do yield text go carry2 dec p' - PE.Other text bs -> do yield text + PI.Other text bs -> do yield text return (do yield bs -- an invalid blob remains p') {-# INLINABLE decodeUtf8 #-} @@ -753,21 +735,6 @@ splitAt n0 k p0 = fmap join (k (go n0 p0)) return (yield suffix >> p') {-# INLINABLE splitAt #-} --- | Split a text stream into 'FreeT'-delimited text streams of fixed size -chunksOf - :: (Monad m, Integral n) - => n -> Lens' (Producer Text m r) - (FreeT (Producer Text m) m r) -chunksOf n k p0 = fmap concats (k (FreeT (go p0))) - where - go p = do - x <- next p - return $ case x of - Left r -> PP.Pure r - Right (txt, p') -> PP.Free $ do - p'' <- (yield txt >> p') ^. splitAt n - return $ PP.FreeT (go p'') -{-# INLINABLE chunksOf #-} {-| Split a text stream in two, where the first text stream is the longest consecutive group of text that satisfy the predicate @@ -882,7 +849,7 @@ packChars :: Monad m => Iso' (Producer Char m x) (Producer Text m x) packChars = Data.Profunctor.dimap to (fmap from) where -- to :: Monad m => Producer Char m x -> Producer Text m x - to p = PP.folds step id done (p^.PP.chunksOf defaultChunkSize) + to p = PG.folds step id done (p^.PG.chunksOf defaultChunkSize) step diffAs c = diffAs . (c:) @@ -892,6 +859,24 @@ packChars = Data.Profunctor.dimap to (fmap from) from p = for p (each . T.unpack) {-# INLINABLE packChars #-} + +-- | Split a text stream into 'FreeT'-delimited text streams of fixed size +chunksOf + :: (Monad m, Integral n) + => n -> Lens' (Producer Text m r) + (FreeT (Producer Text m) m r) +chunksOf n k p0 = fmap concats (k (FreeT (go p0))) + where + go p = do + x <- next p + return $ case x of + Left r -> Pure r + Right (txt, p') -> Free $ do + p'' <- (yield txt >> p') ^. splitAt n + return $ FreeT (go p'') +{-# INLINABLE chunksOf #-} + + {-| Split a text stream into sub-streams delimited by characters that satisfy the predicate -} @@ -899,80 +884,113 @@ splitsWith :: (Monad m) => (Char -> Bool) -> Producer Text m r - -> PP.FreeT (Producer Text m) m r -splitsWith predicate p0 = PP.FreeT (go0 p0) + -> FreeT (Producer Text m) m r +splitsWith predicate p0 = FreeT (go0 p0) where go0 p = do x <- next p case x of - Left r -> return (PP.Pure r) + Left r -> return (Pure r) Right (txt, p') -> if (T.null txt) then go0 p' - else return $ PP.Free $ do + else return $ Free $ do p'' <- (yield txt >> p') ^. span (not . predicate) - return $ PP.FreeT (go1 p'') + return $ FreeT (go1 p'') go1 p = do x <- nextChar p return $ case x of - Left r -> PP.Pure r - Right (_, p') -> PP.Free $ do + Left r -> Pure r + Right (_, p') -> Free $ do p'' <- p' ^. span (not . predicate) - return $ PP.FreeT (go1 p'') + return $ FreeT (go1 p'') {-# INLINABLE splitsWith #-} -- | Split a text stream using the given 'Char' as the delimiter -split :: (Monad m) +splits :: (Monad m) => Char - -> Producer Text m r - -> FreeT (Producer Text m) m r -split c = splitsWith (c ==) -{-# INLINABLE split #-} + -> Lens' (Producer Text m r) + (FreeT (Producer Text m) m r) +splits c k p = + fmap (PG.intercalates (yield (T.singleton c))) (k (splitsWith (c ==) p)) +{-# INLINABLE splits #-} + +{-| Isomorphism between a stream of 'Text' and groups of equivalent 'Char's , using the + given equivalence relation +-} +groupsBy + :: Monad m + => (Char -> Char -> Bool) + -> Lens' (Producer Text m x) (FreeT (Producer Text m) m x) +groupsBy equals k p0 = fmap concats (k (FreeT (go p0))) where + go p = do x <- next p + case x of Left r -> return (Pure r) + Right (bs, p') -> case T.uncons bs of + Nothing -> go p' + Just (c, _) -> do return $ Free $ do + p'' <- (yield bs >> p')^.span (equals c) + return $ FreeT (go p'') +{-# INLINABLE groupsBy #-} + + +-- | Like 'groupsBy', where the equality predicate is ('==') +groups + :: Monad m + => Lens' (Producer Text m x) (FreeT (Producer Text m) m x) +groups = groupsBy (==) +{-# INLINABLE groups #-} + {-| Split a text stream into 'FreeT'-delimited lines -} lines - :: (Monad m) => Producer Text m r -> FreeT (Producer Text m) m r -lines p0 = PP.FreeT (go0 p0) + :: (Monad m) => Iso' (Producer Text m r) (FreeT (Producer Text m) m r) +lines = Data.Profunctor.dimap _lines (fmap _unlines) where - go0 p = do - x <- next p - case x of - Left r -> return (PP.Pure r) - Right (txt, p') -> - if (T.null txt) - then go0 p' - else return $ PP.Free $ go1 (yield txt >> p') - go1 p = do - p' <- p ^. break ('\n' ==) - return $ PP.FreeT $ do - x <- nextChar p' - case x of - Left r -> return $ PP.Pure r - Right (_, p'') -> go0 p'' -{-# INLINABLE lines #-} + _lines p0 = FreeT (go0 p0) + where + go0 p = do + x <- next p + case x of + Left r -> return (Pure r) + Right (txt, p') -> + if (T.null txt) + then go0 p' + else return $ Free $ go1 (yield txt >> p') + go1 p = do + p' <- p ^. break ('\n' ==) + return $ FreeT $ do + x <- nextChar p' + case x of + Left r -> return $ Pure r + Right (_, p'') -> go0 p'' + -- _unlines + -- :: Monad m + -- => FreeT (Producer Text m) m x -> Producer Text m x + _unlines = concats . PG.maps (<* yield (T.singleton '\n')) + +{-# INLINABLE lines #-} -- | Split a text stream into 'FreeT'-delimited words words - :: (Monad m) => Producer Text m r -> FreeT (Producer Text m) m r -words = go + :: (Monad m) => Iso' (Producer Text m r) (FreeT (Producer Text m) m r) +words = Data.Profunctor.dimap go (fmap _unwords) where - go p = PP.FreeT $ do + go p = FreeT $ do x <- next (p >-> dropWhile isSpace) return $ case x of - Left r -> PP.Pure r - Right (bs, p') -> PP.Free $ do + Left r -> Pure r + Right (bs, p') -> Free $ do p'' <- (yield bs >> p') ^. break isSpace return (go p'') + _unwords = PG.intercalates (yield $ T.singleton ' ') + {-# INLINABLE words #-} - - - {-| 'intercalate' concatenates the 'FreeT'-delimited text streams after interspersing a text stream in between them -} @@ -984,17 +1002,17 @@ intercalate intercalate p0 = go0 where go0 f = do - x <- lift (PP.runFreeT f) + x <- lift (runFreeT f) case x of - PP.Pure r -> return r - PP.Free p -> do + Pure r -> return r + Free p -> do f' <- p go1 f' go1 f = do - x <- lift (PP.runFreeT f) + x <- lift (runFreeT f) case x of - PP.Pure r -> return r - PP.Free p -> do + Pure r -> return r + Free p -> do p0 f' <- p go1 f' @@ -1007,10 +1025,10 @@ unlines unlines = go where go f = do - x <- lift (PP.runFreeT f) + x <- lift (runFreeT f) case x of - PP.Pure r -> return r - PP.Free p -> do + Pure r -> return r + Free p -> do f' <- p yield $ T.singleton '\n' go f' @@ -1020,7 +1038,7 @@ unlines = go -} unwords :: (Monad m) => FreeT (Producer Text m) m r -> Producer Text m r -unwords = intercalate (yield $ T.pack " ") +unwords = intercalate (yield $ T.singleton ' ') {-# INLINABLE unwords #-} {- $parse @@ -1029,50 +1047,112 @@ unwords = intercalate (yield $ T.pack " ") -} {- $reexports - @Pipes.Text.Parse@ re-exports 'nextChar', 'drawChar', 'unDrawChar', 'peekChar', and 'isEndOfChars'. @Data.Text@ re-exports the 'Text' type. - @Pipes.Parse@ re-exports 'input', 'concat', and 'FreeT' (the type). + @Pipes.Parse@ re-exports 'input', 'concat', 'FreeT' (the type) and the 'Parse' synonym. -} - - -decode :: Monad m => PE.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) --- decode codec = go B.empty where --- go extra p0 = --- do x <- lift (next p0) --- case x of Right (chunk, p) -> --- do let (text, stuff) = codecDecode codec (B.append extra chunk) --- yield text --- case stuff of Right extra' -> go extra' p --- Left (exc,bs) -> do yield text --- return (do yield bs --- p) --- Left r -> return (do yield extra --- return r) - -decode d p0 = case d of - PE.Other txt bad -> do yield txt - return (do yield bad - p0) - PE.Some txt extra dec -> do yield txt - x <- lift (next p0) - case x of Left r -> return (do yield extra - return r) - Right (chunk,p1) -> decode (dec chunk) p1 - --- go !carry dec0 p = do --- x <- lift (next p) --- case x of Left r -> if B.null carry --- then return (return r) -- all bytestrinput was consumed --- else return (do yield carry -- a potentially valid fragment remains --- return r) --- --- Right (chunk, p') -> case dec0 chunk of --- PE.Some text carry2 dec -> do yield text --- go carry2 dec p' --- PE.Other text bs -> do yield text --- return (do yield bs -- an invalid blob remains --- p') --- {-# INLINABLE decodeUtf8 #-} +{- | Use a 'Codec' as a pipes-style 'Lens' into a byte stream; the available 'Codec' s are + 'utf8', 'utf16_le', 'utf16_be', 'utf32_le', 'utf32_be' . The 'Codec' concept and the + individual 'Codec' definitions follow the enumerator and conduit libraries. + + Utf8 is handled differently in this library -- without the use of 'unsafePerformIO' &co + to catch 'Text' exceptions; but the same 'mypipe ^. codec utf8' interface can be used. + 'mypipe ^. decodeUtf8' should be the same, but has a somewhat more direct and thus perhaps + better implementation. + + -} +codec :: Monad m => Codec -> Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) +codec (Codec _ enc dec) k p0 = fmap (\p -> join (for p (yield . fst . enc))) + (k (decoder (dec B.empty) p0) ) where + decoder :: Monad m => PI.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) + decoder !d p0 = case d of + PI.Other txt bad -> do yield txt + return (do yield bad + p0) + PI.Some txt extra dec -> do yield txt + x <- lift (next p0) + case x of Left r -> return (do yield extra + return r) + Right (chunk,p1) -> decoder (dec chunk) p1 + +{- | ascii and latin encodings only represent a small fragment of 'Text'; thus we cannot + use the pipes 'Lens' style to work with them. Rather we simply define functions + each way. + + 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream, + returning the rest of the 'Text' at the first non-ascii 'Char' +-} +encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) +encodeAscii = go where + go p = do echunk <- lift (next p) + case echunk of + Left r -> return (return r) + Right (chunk, p') -> + if T.null chunk + then go p' + else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk + in do yield (B8.pack (T.unpack safe)) + if T.null unsafe + then go p' + else return $ do yield unsafe + p' +{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream, + returning the rest of the 'Text' upon hitting any non-latin 'Char' + -} +encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) +encodeIso8859_1 = go where + go p = do etxt <- lift (next p) + case etxt of + Left r -> return (return r) + Right (txt, p') -> + if T.null txt + then go p' + else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt + in do yield (B8.pack (T.unpack safe)) + if T.null unsafe + then go p' + else return $ do yield unsafe + p' + +{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the + unused 'ByteString' upon hitting an un-ascii byte. + -} +decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeAscii = go where + go p = do echunk <- lift (next p) + case echunk of + Left r -> return (return r) + Right (chunk, p') -> + if B.null chunk + then go p' + else let (safe, unsafe) = B.span (<= 0x7F) chunk + in do yield (T.pack (B8.unpack safe)) + if B.null unsafe + then go p' + else return $ do yield unsafe + p' + +{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the + unused 'ByteString' upon hitting the rare un-latinizable byte. + -} +decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) +decodeIso8859_1 = go where + go p = do echunk <- lift (next p) + case echunk of + Left r -> return (return r) + Right (chunk, p') -> + if B.null chunk + then go p' + else let (safe, unsafe) = B.span (<= 0xFF) chunk + in do yield (T.pack (B8.unpack safe)) + if B.null unsafe + then go p' + else return $ do yield unsafe + p' + + + + + \ No newline at end of file