decoding pipes added

[github/fretlink/text-pipes.git] / Pipes / Text.hs
diff --git a/Pipes/Text.hs b/Pipes/Text.hs

index d811ab5fd704ba59a6f290d20981cdf85be39735..04c509fa62d1a1ac2787e43be63985179f52b0c1 100644 (file)
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -2,9 +2,10 @@
  
  {-| This module provides @pipes@ utilities for \"text streams\", which are
      streams of 'Text' chunks.  The individual chunks are uniformly @strict@, but 
  
  {-| This module provides @pipes@ utilities for \"text streams\", which are
      streams of 'Text' chunks.  The individual chunks are uniformly @strict@, but 
-    can interact lazy 'Text's  and 'IO.Handle's.
+    a 'Producer' can be converted to and from lazy 'Text's; an 'IO.Handle' can
+    be associated with a 'Producer' or 'Consumer' according as it is read or written to.
  
  
-    To stream to or from 'IO.Handle's, use 'fromHandle' or 'toHandle'.  For
+    To stream to or from 'IO.Handle's, one can use 'fromHandle' or 'toHandle'.  For
      example, the following program copies a document from one file to another:
  
  > import Pipes
      example, the following program copies a document from one file to another:
  
  > import Pipes
@@ -80,8 +81,15 @@ module Pipes.Text  (
      filter,
      scan,
      encodeUtf8,
      filter,
      scan,
      encodeUtf8,
+#if MIN_VERSION_text(0,11,4)
+    pipeDecodeUtf8,
+    pipeDecodeUtf8With,
+#endif
      pack,
      unpack,
      pack,
      unpack,
+    toCaseFold,
+    toLower,
+    toUpper,
      stripStart,
  
      -- * Folds
      stripStart,
  
      -- * Folds
@@ -98,8 +106,6 @@ module Pipes.Text  (
      minimum,
      find,
      index,
      minimum,
      find,
      index,
---    elemIndex,
---    findIndex,
      count,
  
      -- * Splitters
      count,
  
      -- * Splitters
@@ -115,6 +121,7 @@ module Pipes.Text  (
      words,
  #if MIN_VERSION_text(0,11,4)
      decodeUtf8,
      words,
  #if MIN_VERSION_text(0,11,4)
      decodeUtf8,
+    decodeUtf8With,
  #endif
      -- * Transformations
      intersperse,
  #endif
      -- * Transformations
      intersperse,
@@ -140,10 +147,11 @@ module Pipes.Text  (
  
  import Control.Exception (throwIO, try)
  import Control.Monad (liftM, unless)
  
  import Control.Exception (throwIO, try)
  import Control.Monad (liftM, unless)
-import Control.Monad.Trans.State.Strict (StateT)
+import Control.Monad.Trans.State.Strict (StateT(..))
  import qualified Data.Text as T
  import qualified Data.Text.IO as T
  import qualified Data.Text.Encoding as TE
  import qualified Data.Text as T
  import qualified Data.Text.IO as T
  import qualified Data.Text.Encoding as TE
+import qualified Data.Text.Encoding.Error as TE
  import Data.Text (Text)
  import qualified Data.Text.Lazy as TL
  import qualified Data.Text.Lazy.IO as TL
  import Data.Text (Text)
  import qualified Data.Text.Lazy as TL
  import qualified Data.Text.Lazy.IO as TL
@@ -169,6 +177,7 @@ import Pipes.Safe (MonadSafe(..), Base(..))
  import qualified Pipes.Prelude as P
  import qualified System.IO as IO
  import Data.Char (isSpace)
  import qualified Pipes.Prelude as P
  import qualified System.IO as IO
  import Data.Char (isSpace)
+import Data.Word (Word8)
  import Prelude hiding (
      all,
      any,
  import Prelude hiding (
      all,
      any,
@@ -313,7 +322,8 @@ concatMap f = P.map (T.concatMap f)
  
  
  -- | Transform a Pipe of 'Text' into a Pipe of 'ByteString's using UTF-8
  
  
  -- | Transform a Pipe of 'Text' into a Pipe of 'ByteString's using UTF-8
--- encoding
+-- encoding; @encodeUtf8 = Pipes.Prelude.map TE.encodeUtf8@ so more complex
+-- encoding pipes can easily be constructed with the functions in @Data.Text.Encoding@
  encodeUtf8 :: Monad m => Pipe Text ByteString m r
  encodeUtf8 = P.map TE.encodeUtf8
  {-# INLINEABLE encodeUtf8 #-}
  encodeUtf8 :: Monad m => Pipe Text ByteString m r
  encodeUtf8 = P.map TE.encodeUtf8
  {-# INLINEABLE encodeUtf8 #-}
@@ -539,19 +549,7 @@ index
  index n p = head (p >-> drop n)
  {-# INLINABLE index #-}
  
  index n p = head (p >-> drop n)
  {-# INLINABLE index #-}
  
--- | Find the index of an element that matches the given 'Char'
--- elemIndex
---     :: (Monad m, Num n) => Char -> Producer Text m () -> m (Maybe n)
--- elemIndex w8 = findIndex (w8 ==)
--- {-# INLINABLE elemIndex #-}
-
--- | Store the first index of an element that satisfies the predicate
--- findIndex
---     :: (Monad m, Num n)
---     => (Char -> Bool) -> Producer Text m () -> m (Maybe n)
--- findIndex predicate p = P.head (p >-> findIndices predicate)
--- {-# INLINABLE findIndex #-}
--- 
+
  -- | Store a tally of how many segments match the given 'Text'
  count :: (Monad m, Num n) => Text -> Producer Text m () -> m n
  count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
  -- | Store a tally of how many segments match the given 'Text'
  count :: (Monad m, Num n) => Text -> Producer Text m () -> m n
  count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
@@ -578,6 +576,52 @@ decodeUtf8 = go TE.streamDecodeUtf8
                            yield l
                            p'
  {-# INLINEABLE decodeUtf8 #-}
                            yield l
                            p'
  {-# INLINEABLE decodeUtf8 #-}
+
+-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
+-- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@
+-- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\"
+decodeUtf8With 
+  :: Monad m  
+  => TE.OnDecodeError 
+  -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr)
+  where go dec p = do
+            x <- lift (next p)
+            case x of
+                Left r -> return (return r)
+                Right (chunk, p') -> do
+                    let TE.Some text l dec' = dec chunk
+                    if B.null l
+                      then do
+                          yield text
+                          go dec' p'
+                      else return $ do
+                          yield l
+                          p'
+{-# INLINEABLE decodeUtf8With #-}
+
+-- | A simple pipe from 'ByteString' to 'Text'; a decoding error will arise
+-- with any chunk that contains a sequence of bytes that is unreadable. Otherwise
+-- only few bytes will only be moved from one chunk to the next before decoding.
+pipeDecodeUtf8 :: Monad m => Pipe ByteString Text m r
+pipeDecodeUtf8 = go TE.streamDecodeUtf8
+  where go dec = do chunk <- await
+                    case dec chunk of 
+                      TE.Some text l dec' -> do yield text
+                                                go dec'
+{-# INLINEABLE pipeDecodeUtf8 #-}
+
+-- | A simple pipe from 'ByteString' to 'Text' using a replacement function.
+pipeDecodeUtf8With 
+  :: Monad m  
+  => TE.OnDecodeError 
+  -> Pipe ByteString Text m r 
+pipeDecodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr)
+  where go dec = do chunk <- await
+                    case dec chunk of 
+                      TE.Some text l dec' -> do yield text
+                                                go dec'
+{-# INLINEABLE pipeDecodeUtf8With #-}
  #endif
  
  -- | Splits a 'Producer' after the given number of characters
  #endif
  
  -- | Splits a 'Producer' after the given number of characters
@@ -752,18 +796,22 @@ words
      :: (Monad m) => Producer Text m r -> FreeT (Producer Text m) m r
  words p0 = removeEmpty (splitWith isSpace p0)
    where
      :: (Monad m) => Producer Text m r -> FreeT (Producer Text m) m r
  words p0 = removeEmpty (splitWith isSpace p0)
    where
-    removeEmpty f = PP.FreeT $ do
-        x <- PP.runFreeT f
-        case x of
-            PP.Pure r -> return (PP.Pure r)
-            PP.Free p -> do
-                y <- next p
-                case y of
-                    Left   f'      -> PP.runFreeT (removeEmpty f')
-                    Right (bs, p') -> return $ PP.Free $ do
-                        yield bs
-                        f' <- p'
-                        return (removeEmpty f')
+  removeEmpty f = PP.FreeT $ do
+    x <- PP.runFreeT f
+    case x of 
+        PP.Pure r -> return (PP.Pure r)
+        PP.Free p -> loop p
+  loop p = do 
+    y <- next p
+    case y of
+        Left   f'       -> PP.runFreeT (removeEmpty f')
+        Right (txt, p') -> 
+          if T.null txt 
+             then loop p'
+             else return $ PP.Free $ do
+                            yield txt
+                            f' <- p'
+                            return (removeEmpty f')
  {-# INLINABLE words #-}
  
  -- | Intersperse a 'Char' in between the characters of the text stream
  {-# INLINABLE words #-}
  
  -- | Intersperse a 'Char' in between the characters of the text stream