use new text-stream-decoding

author michaelt <what_is_it_to_do_anything@yahoo.com>

Sat, 15 Feb 2014 15:01:48 +0000 (10:01 -0500)

committer michaelt <what_is_it_to_do_anything@yahoo.com>

Sat, 15 Feb 2014 15:01:48 +0000 (10:01 -0500)
author michaelt <what_is_it_to_do_anything@yahoo.com>
Sat, 15 Feb 2014 15:01:48 +0000 (10:01 -0500)
committer michaelt <what_is_it_to_do_anything@yahoo.com>
Sat, 15 Feb 2014 15:01:48 +0000 (10:01 -0500)
diff --git a/Pipes/Text.hs b/Pipes/Text.hs

index 4b2d2b04261f9dfe8d565b5b04acfc7be95c67c6..8221c01ff160336f29009ee67ccdb904d9bc8bd3 100644 (file)
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -10,7 +10,8 @@
      example, the following program copies a document from one file to another:
  
  > import Pipes
      example, the following program copies a document from one file to another:
  
  > import Pipes
-> import qualified Data.Text.Pipes as Text
+> import qualified Pipes.Text as Text
+> import qualified Pipes.Text.IO as Text
  > import System.IO
  >
  > main =
  > import System.IO
  >
  > main =
@@ -21,7 +22,8 @@
  To stream from files, the following is perhaps more Prelude-like (note that it uses Pipes.Safe):
  
  > import Pipes
  To stream from files, the following is perhaps more Prelude-like (note that it uses Pipes.Safe):
  
  > import Pipes
-> import qualified Data.Text.Pipes as Text
+> import qualified Pipes.Text as Text
+> import qualified Pipes.Text.IO as Text
  > import Pipes.Safe
  >
  > main = runSafeT $ runEffect $ Text.readFile "inFile.txt" >-> Text.writeFile "outFile.txt"
  > import Pipes.Safe
  >
  > main = runSafeT $ runEffect $ Text.readFile "inFile.txt" >-> Text.writeFile "outFile.txt"
@@ -61,14 +63,14 @@ To stream from files, the following is perhaps more Prelude-like (note that it u
  module Pipes.Text  (
      -- * Producers
        fromLazy
  module Pipes.Text  (
      -- * Producers
        fromLazy
-    , stdin
-    , fromHandle
-    , readFile
+    -- , stdin
+    -- , fromHandle
+    -- , readFile
  
      -- * Consumers
  
      -- * Consumers
-    , stdout
-    , toHandle
-    , writeFile
+    -- , stdout
+    -- , toHandle
+    -- , writeFile
  
      -- * Pipes
      , map
  
      -- * Pipes
      , map
@@ -79,7 +81,7 @@ module Pipes.Text  (
      , dropWhile
      , filter
      , scan
      , dropWhile
      , filter
      , scan
-    , encodeUtf8
+--    , encodeUtf8
      , pack
      , unpack
      , toCaseFold
      , pack
      , unpack
      , toCaseFold
@@ -120,22 +122,22 @@ module Pipes.Text  (
      , word
      , line
      
      , word
      , line
      
-    -- * Decoding Lenses 
-    , decodeUtf8
-    , codec
-    
-    -- * Codecs
-    , utf8
-    , utf16_le
-    , utf16_be
-    , utf32_le
-    , utf32_be
-    
-    -- * Other Decoding/Encoding Functions
-    , decodeIso8859_1
-    , decodeAscii
-    , encodeIso8859_1
-    , encodeAscii
+    -- -- * Decoding Lenses 
+    -- , decodeUtf8
+    -- , codec
+    -- 
+    -- -- * Codecs
+    -- , utf8
+    -- , utf16_le
+    -- , utf16_be
+    -- , utf32_le
+    -- , utf32_be
+    -- 
+    -- -- * Other Decoding/Encoding Functions
+    -- , decodeIso8859_1
+    -- , decodeAscii
+    -- , encodeIso8859_1
+    -- , encodeAscii
  
      -- * FreeT Splitters
      , chunksOf
  
      -- * FreeT Splitters
      , chunksOf
@@ -157,11 +159,9 @@ module Pipes.Text  (
  
     -- * Re-exports
      -- $reexports
  
     -- * Re-exports
      -- $reexports
-    , Decoding(..)
-    , streamDecodeUtf8
-    , decodeSomeUtf8
-    , Codec(..)
-    , TextException(..)
+    -- , DecodeResult(..)
+    -- , Codec
+    -- , TextException(..)
      , module Data.ByteString
      , module Data.Text
      , module Data.Profunctor
      , module Data.ByteString
      , module Data.Text
      , module Data.Profunctor
@@ -170,7 +170,6 @@ module Pipes.Text  (
      , module Pipes.Group
      ) where
  
      , module Pipes.Group
      ) where
  
-import Control.Exception (throwIO, try)
  import Control.Applicative ((<*)) 
  import Control.Monad (liftM, unless, join)
  import Control.Monad.Trans.State.Strict (StateT(..), modify)
  import Control.Applicative ((<*)) 
  import Control.Monad (liftM, unless, join)
  import Control.Monad.Trans.State.Strict (StateT(..), modify)
@@ -193,24 +192,20 @@ import Data.Functor.Identity (Identity)
  import Data.Profunctor (Profunctor)
  import qualified Data.Profunctor
  import qualified Data.List as List
  import Data.Profunctor (Profunctor)
  import qualified Data.Profunctor
  import qualified Data.List as List
-import Foreign.C.Error (Errno(Errno), ePIPE)
-import qualified GHC.IO.Exception as G
  import Pipes
  import qualified Pipes.ByteString as PB
  import Pipes
  import qualified Pipes.ByteString as PB
-import qualified Pipes.Text.Internal as PI
-import Pipes.Text.Internal 
+-- import Pipes.Text.Decoding
  import Pipes.Core (respond, Server')
  import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..))
  import qualified Pipes.Group as PG
  import qualified Pipes.Parse as PP
  import Pipes.Parse (Parser)
  import Pipes.Core (respond, Server')
  import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..))
  import qualified Pipes.Group as PG
  import qualified Pipes.Parse as PP
  import Pipes.Parse (Parser)
-import qualified Pipes.Safe.Prelude as Safe
-import qualified Pipes.Safe as Safe
-import Pipes.Safe (MonadSafe(..), Base(..))
+
  import qualified Pipes.Prelude as P
  import qualified System.IO as IO
  import Data.Char (isSpace)
  import Data.Word (Word8)
  import qualified Pipes.Prelude as P
  import qualified System.IO as IO
  import Data.Char (isSpace)
  import Data.Word (Word8)
+import Data.Text.StreamDecoding
  
  import Prelude hiding (
      all,
  
  import Prelude hiding (
      all,
@@ -246,78 +241,6 @@ fromLazy :: (Monad m) => TL.Text -> Producer' Text m ()
  fromLazy  = foldrChunks (\e a -> yield e >> a) (return ()) 
  {-# INLINE fromLazy #-}
  
  fromLazy  = foldrChunks (\e a -> yield e >> a) (return ()) 
  {-# INLINE fromLazy #-}
  
--- | Stream text from 'stdin'
-stdin :: MonadIO m => Producer Text m ()
-stdin = fromHandle IO.stdin
-{-# INLINE stdin #-}
-
-{-| Convert a 'IO.Handle' into a text stream using a text size 
-    determined by the good sense of the text library; note that this
-    is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@
-    but uses the system encoding and has other `Data.Text.IO` features
--}
-
-fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()
-fromHandle h =  go where
-      go = do txt <- liftIO (T.hGetChunk h)
-              unless (T.null txt) ( do yield txt
-                                       go )
-{-# INLINABLE fromHandle#-}
-
-
-{-| Stream text from a file in the simple fashion of @Data.Text.IO@ 
-
->>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
-MAIN = PUTSTRLN "HELLO WORLD"
--}
-
-readFile :: MonadSafe m => FilePath -> Producer Text m ()
-readFile file = Safe.withFile file IO.ReadMode fromHandle
-{-# INLINE readFile #-}
-
-
-{-| Stream text to 'stdout'
-
-    Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe.
-
-    Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@ 
-    instead of @(source >-> stdout)@ .
--}
-stdout :: MonadIO m => Consumer' Text m ()
-stdout = go
-  where
-    go = do
-        txt <- await
-        x  <- liftIO $ try (T.putStr txt)
-        case x of
-            Left (G.IOError { G.ioe_type  = G.ResourceVanished
-                            , G.ioe_errno = Just ioe })
-                 | Errno ioe == ePIPE
-                     -> return ()
-            Left  e  -> liftIO (throwIO e)
-            Right () -> go
-{-# INLINABLE stdout #-}
-
-
-{-| Convert a text stream into a 'Handle'
-
-    Note: again, for best performance, where possible use 
-    @(for source (liftIO . hPutStr handle))@ instead of @(source >-> toHandle handle)@.
--}
-toHandle :: MonadIO m => IO.Handle -> Consumer' Text m r
-toHandle h = for cat (liftIO . T.hPutStr h)
-{-# INLINABLE toHandle #-}
-
-{-# RULES "p >-> toHandle h" forall p h .
-        p >-> toHandle h = for p (\txt -> liftIO (T.hPutStr h txt))
-  #-}
-
-
--- | Stream text into a file. Uses @pipes-safe@.
-writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
-writeFile file = Safe.withFile file IO.WriteMode toHandle
-{-# INLINE writeFile #-}
-
  
  type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
  
  
  type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
  
@@ -690,28 +613,6 @@ isEndOfChars = do
  {-# INLINABLE isEndOfChars #-}
  
  
  {-# INLINABLE isEndOfChars #-}
  
  
-{- | An improper lens into a stream of 'ByteString' expected to be UTF-8 encoded; the associated
-   stream of Text ends by returning a stream of ByteStrings beginning at the point of failure. 
-   -}
-
-decodeUtf8 :: Monad m => Lens' (Producer ByteString m r) 
-                               (Producer Text m (Producer ByteString m r))
-decodeUtf8 k p0 = fmap (\p -> join  (for p (yield . TE.encodeUtf8))) 
-                       (k (go B.empty PI.streamDecodeUtf8 p0)) where
-  go !carry dec0 p = do 
-     x <- lift (next p) 
-     case x of Left r -> return (if B.null carry 
-                                    then return r -- all bytestring input was consumed
-                                    else (do yield carry -- a potentially valid fragment remains
-                                             return r))
-                                           
-               Right (chunk, p') -> case dec0 chunk of 
-                   PI.Some text carry2 dec -> do yield text
-                                                 go carry2 dec p'
-                   PI.Other text bs -> do yield text 
-                                          return (do yield bs -- an invalid blob remains
-                                                     p')
-{-# INLINABLE decodeUtf8 #-}
  
  
  -- | Splits a 'Producer' after the given number of characters
  
  
  -- | Splits a 'Producer' after the given number of characters
@@ -1057,106 +958,4 @@ unwords = intercalate (yield $ T.singleton ' ')
      @Pipes.Parse@ re-exports 'input', 'concat', 'FreeT' (the type) and the 'Parse' synonym. 
  -}
  
      @Pipes.Parse@ re-exports 'input', 'concat', 'FreeT' (the type) and the 'Parse' synonym. 
  -}
  
-{- | Use a 'Codec' as a pipes-style 'Lens' into a byte stream; the available 'Codec' s are
-     'utf8', 'utf16_le', 'utf16_be', 'utf32_le', 'utf32_be' . The 'Codec' concept and the 
-     individual 'Codec' definitions follow the enumerator and conduit libraries. 
-     
-     Utf8 is handled differently in this library -- without the use of 'unsafePerformIO' &co 
-     to catch 'Text' exceptions; but the same 'mypipe ^. codec utf8' interface can be used.
-     'mypipe ^. decodeUtf8' should be the same, but has a somewhat more direct and thus perhaps
-     better implementation.  
-
-     -}
-codec :: Monad m => Codec -> Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r))
-codec (Codec _ enc dec) k p0 = fmap (\p -> join (for p (yield . fst . enc))) 
-                                     (k (decoder (dec B.empty) p0) ) where 
-  decoder :: Monad m => PI.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
-  decoder !d p0 = case d of 
-      PI.Other txt bad      -> do yield txt
-                                  return (do yield bad
-                                             p0)
-      PI.Some txt extra dec -> do yield txt
-                                  x <- lift (next p0)
-                                  case x of Left r -> return (do yield extra
-                                                                 return r)
-                                            Right (chunk,p1) -> decoder (dec chunk) p1
-
-{- | ascii and latin encodings only represent a small fragment of 'Text'; thus we cannot
-     use the pipes 'Lens' style to work with them. Rather we simply define functions 
-     each way. 
-
-     'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream,
-     returning the rest of the 'Text' at the first non-ascii 'Char'
--}
-encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
-encodeAscii = go where
-  go p = do echunk <- lift (next p)
-            case echunk of 
-              Left r -> return (return r)
-              Right (chunk, p') -> 
-                 if T.null chunk 
-                   then go p'
-                   else let (safe, unsafe)  = T.span (\c -> ord c <= 0x7F) chunk
-                        in do yield (B8.pack (T.unpack safe))
-                              if T.null unsafe
-                                then go p'
-                                else return $ do yield unsafe 
-                                                 p'
-{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
-     returning the rest of the 'Text' upon hitting any non-latin 'Char'
-   -}
-encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
-encodeIso8859_1 = go where
-  go p = do etxt <- lift (next p)
-            case etxt of 
-              Left r -> return (return r)
-              Right (txt, p') -> 
-                 if T.null txt 
-                   then go p'
-                   else let (safe, unsafe)  = T.span (\c -> ord c <= 0xFF) txt
-                        in do yield (B8.pack (T.unpack safe))
-                              if T.null unsafe
-                                then go p'
-                                else return $ do yield unsafe 
-                                                 p'
-
-{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
-     unused 'ByteString' upon hitting an un-ascii byte.
-   -}
-decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
-decodeAscii = go where
-  go p = do echunk <- lift (next p)
-            case echunk of 
-              Left r -> return (return r)
-              Right (chunk, p') -> 
-                 if B.null chunk 
-                   then go p'
-                   else let (safe, unsafe)  = B.span (<= 0x7F) chunk
-                        in do yield (T.pack (B8.unpack safe))
-                              if B.null unsafe
-                                then go p'
-                                else return $ do yield unsafe 
-                                                 p'
-
-{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
-     unused 'ByteString' upon hitting the rare un-latinizable byte.
-     -}
-decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
-decodeIso8859_1 = go where
-  go p = do echunk <- lift (next p)
-            case echunk of 
-              Left r -> return (return r)
-              Right (chunk, p') -> 
-                 if B.null chunk 
-                   then go p'
-                   else let (safe, unsafe)  = B.span (<= 0xFF) chunk
-                        in do yield (T.pack (B8.unpack safe))
-                              if B.null unsafe
-                                then go p'
-                                else return $ do yield unsafe 
-                                                 p'
-
-
-
-
-                                            
+
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs

new file mode 100644 (file)

index 0000000..2bb5807
--- /dev/null
+++ b/Pipes/Text/Encoding.hs
@@ -0,0 +1,205 @@
+
+{-# LANGUAGE RankNTypes, BangPatterns #-}
+-- |
+-- Copyright: 2014 Michael Thompson
+--
+-- This module uses the stream decoding functions from the text-stream-decoding package
+-- to define pipes decoding functions and lenses.
+
+module Pipes.Text.Encoding
+    ( DecodeResult (..)
+    , Codec
+    , decodeUtf8
+    , decodeUtf8Pure
+    , decodeUtf16LE
+    , decodeUtf16BE
+    , decodeUtf32LE
+    , decodeUtf32BE
+    , utf8
+    , utf8Pure
+    , utf16LE
+    , utf16BE
+    , utf32LE
+    , utf32BE
+    , encodeAscii
+    , decodeAscii
+    , encodeIso8859_1
+    , decodeIso8859_1
+    ) 
+    where
+
+import Data.Char (ord)
+import Data.ByteString as B 
+import Data.ByteString (ByteString)
+import Data.ByteString.Internal as B 
+import Data.ByteString.Char8 as B8
+import Data.Text (Text)
+import qualified Data.Text as T 
+import qualified Data.Text.Encoding as TE 
+import Data.Text.StreamDecoding
+import GHC.Word (Word8, Word32)
+import Data.Word (Word8, Word16)
+import Control.Monad
+import Pipes
+import Pipes.Core
+
+
+
+{- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text.
+    They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc.
+    The stream of text they 'see' in a bytestream ends by returning the original byte stream 
+    beginning at the point of failure, or the empty bytestream with its return value.
+   -}
+type Codec  = forall f m r . (Functor f , Monad m ) => 
+     (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r)))
+     -> Producer ByteString m r -> f (Producer ByteString m r )
+
+decodeStream :: Monad m 
+       => (B.ByteString -> DecodeResult) 
+       -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeStream = loop where
+  loop dec0 p = 
+    do x <- lift (next p) 
+       case x of Left r -> return (return r)
+                 Right (chunk, p') -> case dec0 chunk of 
+                    DecodeResultSuccess text dec -> do yield text
+                                                       loop dec p'
+                    DecodeResultFailure text bs -> do yield text 
+                                                      return (do yield bs 
+                                                                 p')
+{-# INLINABLE decodeStream#-}
+
+decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf8 = decodeStream streamUtf8
+{-# INLINE decodeUtf8 #-}
+
+decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf8Pure = decodeStream streamUtf8Pure
+{-# INLINE decodeUtf8Pure #-}
+
+decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf16LE = decodeStream streamUtf16LE
+{-# INLINE decodeUtf16LE #-}
+
+decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf16BE = decodeStream streamUtf16BE
+{-# INLINE decodeUtf16BE #-}
+
+decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf32LE = decodeStream streamUtf32LE
+{-# INLINE decodeUtf32LE #-}
+
+decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeUtf32BE = decodeStream streamUtf32BE
+{-# INLINE decodeUtf32BE #-}
+
+mkCodec :: (forall r m . Monad m => 
+           Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
+        -> (Text -> ByteString)
+        -> Codec
+mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc)))  (k (dec p0))
+
+
+{- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated
+   text stream ends by returning the original byte stream beginning at the point of failure,
+   or an empty byte stream for a well-encoded text.
+   -}
+
+utf8 :: Codec
+utf8 = mkCodec decodeUtf8 TE.encodeUtf8
+
+utf8Pure :: Codec
+utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
+
+utf16LE :: Codec
+utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
+
+utf16BE :: Codec
+utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
+
+utf32LE :: Codec
+utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
+
+utf32BE :: Codec
+utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
+
+
+{- | ascii and latin encodings only use a small number of the characters 'Text'
+     recognizes; thus we cannot use the pipes 'Lens' style to work with them. 
+     Rather we simply define functions each way. 
+
+     'encodeAscii' : Encode as much of your stream of 'Text' as is actually ascii into a byte stream,
+     returning the rest of the 'Text' stream, beginning at the first non-ascii 'Char'
+-}
+
+encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
+encodeAscii = go where
+  go p = do e <- lift (next p)
+            case e of 
+              Left r -> return (return r)
+              Right (chunk, p') -> 
+                 if T.null chunk 
+                   then go p'
+                   else let (safe, unsafe)  = T.span (\c -> ord c <= 0x7F) chunk
+                        in do yield (B8.pack (T.unpack safe))
+                              if T.null unsafe
+                                then go p'
+                                else return $ do yield unsafe 
+                                                 p'
+                                                 
+{- | Encode as much of your stream of 'Text' as is actually iso8859-1 (latin1) into a byte stream,
+     returning the rest of the 'Text' stream upon hitting any non-latin1 'Char'
+   -}
+encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
+encodeIso8859_1 = go where
+  go p = do e <- lift (next p)
+            case e of 
+              Left r -> return (return r)
+              Right (txt, p') -> 
+                 if T.null txt 
+                   then go p'
+                   else let (safe, unsafe)  = T.span (\c -> ord c <= 0xFF) txt
+                        in do yield (B8.pack (T.unpack safe))
+                              if T.null unsafe
+                                then go p'
+                                else return $ do yield unsafe 
+                                                 p'
+
+{- | Decode a byte stream into a corresponding stream of ascii chars, returning the
+     rest of the byte stream, beginning at the first non-ascii byte.
+   -}
+decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeAscii = go where
+  go p = do e <- lift (next p)
+            case e of 
+              Left r -> return (return r)
+              Right (chunk, p') -> 
+                 if B.null chunk 
+                   then go p'
+                   else let (safe, unsafe) = B.span (<= 0x7F) chunk
+                        in do yield (T.pack (B8.unpack safe))
+                              if B.null unsafe
+                                then go p'
+                                else return (do yield unsafe 
+                                                p')
+
+{- | Decode a byte stream into a corresponding stream of iso8859-1 (latin1) chars.
+     Every byte passes the @(<= 0xFF)@ test used below, so the returned byte stream
+     is not expected to contain any undecoded remainder.
+     -}
+decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
+decodeIso8859_1 = go where
+  go p = do e <- lift (next p)
+            case e of 
+              Left r -> return (return r)
+              Right (chunk, p') -> 
+                 if B.null chunk 
+                    then go p'
+                    else do let (safe, unsafe) = B.span (<= 0xFF) chunk
+                            yield (T.pack (B8.unpack safe))
+                            if B.null unsafe 
+                               then go p'
+                               else return (do yield unsafe 
+                                               p')
+
+
+
diff --git a/Pipes/Text/IO.hs b/Pipes/Text/IO.hs

new file mode 100644 (file)

index 0000000..3c9ac98
--- /dev/null
+++ b/Pipes/Text/IO.hs
@@ -0,0 +1,96 @@
+{-#LANGUAGE RankNTypes#-}
+
+module Pipes.Text.IO 
+   ( stdin
+   , stdout
+   , fromHandle
+   , toHandle
+   , readFile
+   , writeFile
+   ) where
+
+import qualified System.IO as IO
+import Control.Exception (throwIO, try)
+import Foreign.C.Error (Errno(Errno), ePIPE)
+import qualified GHC.IO.Exception as G
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import Pipes
+import qualified Pipes.Safe.Prelude as Safe
+import qualified Pipes.Safe as Safe
+import Pipes.Safe (MonadSafe(..), Base(..))
+import Prelude hiding (readFile, writeFile)
+
+-- | Stream text from 'stdin'
+stdin :: MonadIO m => Producer Text m ()
+stdin = fromHandle IO.stdin
+{-# INLINE stdin #-}
+
+{-| Convert a 'IO.Handle' into a text stream using a text size 
+    determined by the good sense of the text library; note that this
+    is distinctly slower than @decodeUtf8 (Pipes.ByteString.fromHandle h)@
+    but uses the system encoding and has other `Data.Text.IO` features
+-}
+
+fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()
+fromHandle h =  go where
+      go = do txt <- liftIO (T.hGetChunk h)
+              if T.null txt then return ()
+                            else do yield txt
+                                    go 
+{-# INLINABLE fromHandle#-}
+
+
+{-| Stream text from a file in the simple fashion of @Data.Text.IO@ 
+
+>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
+MAIN = PUTSTRLN "HELLO WORLD"
+-}
+
+readFile :: MonadSafe m => FilePath -> Producer Text m ()
+readFile file = Safe.withFile file IO.ReadMode fromHandle
+{-# INLINE readFile #-}
+
+
+{-| Stream text to 'stdout'
+
+    Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe.
+
+    Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@ 
+    instead of @(source >-> stdout)@ .
+-}
+stdout :: MonadIO m => Consumer' Text m ()
+stdout = go
+  where
+    go = do
+        txt <- await
+        x  <- liftIO $ try (T.putStr txt)
+        case x of
+            Left (G.IOError { G.ioe_type  = G.ResourceVanished
+                            , G.ioe_errno = Just ioe })
+                 | Errno ioe == ePIPE
+                     -> return ()
+            Left  e  -> liftIO (throwIO e)
+            Right () -> go
+{-# INLINABLE stdout #-}
+
+
+{-| Write a text stream to a 'Handle'
+
+    Note: again, for best performance, where possible use 
+    @(for source (liftIO . hPutStr handle))@ instead of @(source >-> toHandle handle)@.
+-}
+toHandle :: MonadIO m => IO.Handle -> Consumer' Text m r
+toHandle h = for cat (liftIO . T.hPutStr h)
+{-# INLINABLE toHandle #-}
+
+{-# RULES "p >-> toHandle h" forall p h .
+        p >-> toHandle h = for p (\txt -> liftIO (T.hPutStr h txt))
+  #-}
+
+
+-- | Stream text into a file. Uses @pipes-safe@.
+writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
+writeFile file = Safe.withFile file IO.WriteMode toHandle
+{-# INLINE writeFile #-}
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs

deleted file mode 100644 (file)

index 582ef14..0000000
--- a/Pipes/Text/Internal.hs
+++ /dev/null
@@ -1,7 +0,0 @@
-module Pipes.Text.Internal
-    (module Pipes.Text.Internal.Codec
-    , module Pipes.Text.Internal.Decoding
-    ) where
-
-import Pipes.Text.Internal.Codec
-import Pipes.Text.Internal.Decoding
-\ No newline at end of file
diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs

deleted file mode 100644 (file)

index 075a152..0000000
--- a/Pipes/Text/Internal/Codec.hs
+++ /dev/null
@@ -1,216 +0,0 @@
-
-{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
--- |
--- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
--- License: MIT
---  This Parts of this code were taken from enumerator and conduits, and adapted for pipes
-
--- This module follows the model of the enumerator and conduits libraries, and defines
--- 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and 
--- iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so
--- special functions appear in @Pipes.Text@
-
-
-module Pipes.Text.Internal.Codec
-    ( Codec(..)
-    , TextException(..)
-    , utf8
-    , utf16_le
-    , utf16_be
-    , utf32_le
-    , utf32_be
-    ) where
-
-import Data.Bits ((.&.))
-import Data.Char (ord)
-import Data.ByteString as B 
-import Data.ByteString (ByteString)
-import Data.ByteString.Internal as B 
-import Data.ByteString.Char8 as B8
-import Data.Text (Text)
-import qualified Data.Text as T 
-import qualified Data.Text.Encoding as TE 
-import Data.Text.Encoding.Error ()
-import GHC.Word (Word8, Word32)
-import qualified Data.Text.Array as A
-import Data.Word (Word8, Word16)
-import System.IO.Unsafe (unsafePerformIO)
-import qualified Control.Exception as Exc
-import Data.Bits ((.&.), (.|.), shiftL)
-import Data.Typeable
-import Control.Arrow (first)
-import Data.Maybe (catMaybes)
-import Pipes.Text.Internal.Decoding
-import Pipes
--- | A specific character encoding.
-
-data Codec = Codec
-  { codecName :: Text
-  , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
-  , codecDecode :: ByteString -> Decoding 
-  }
-
-instance Show Codec where
-    showsPrec d c = showParen (d > 10) $ 
-                    showString "Codec " . shows (codecName c)
-
-data TextException = DecodeException Codec Word8
-                   | EncodeException Codec Char
-                   | LengthExceeded Int
-                   | TextException Exc.SomeException
-    deriving (Show, Typeable)
-instance Exc.Exception TextException
-
-
-toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
-           -> (ByteString -> Decoding)
-toDecoding op = loop B.empty where
-  loop !extra bs0 = case op (B.append extra bs0) of
-                      (txt, Right bs) -> Some txt bs (loop bs)
-                      (txt, Left (_,bs)) -> Other txt bs
--- To do: toDecoding should be inlined in each of the 'Codec' definitions
--- or else Codec changed to the conduit/enumerator definition.  We have
--- altered it to use 'streamDecodeUtf8'
-
-splitSlowly :: (ByteString -> Text)
-            -> ByteString 
-            -> (Text, Either (TextException, ByteString) ByteString)
-splitSlowly dec bytes = valid where
-    valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
-    splits 0 = [(B.empty, bytes)]
-    splits n = B.splitAt n bytes : splits (n - 1)
-    decFirst (a, b) = case tryEvaluate (dec a) of
-        Left _ -> Nothing
-        Right text -> let trouble = case tryEvaluate (dec b) of
-                            Left exc -> Left (TextException exc, b)
-                            Right _  -> Right B.empty 
-                      in Just (text, trouble) -- this case shouldn't occur, 
-                                      -- since splitSlowly is only called
-                                      -- when parsing failed somewhere
-
-utf8 :: Codec
-utf8 = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-8"
-    enc text = (TE.encodeUtf8 text, Nothing)
-    dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
-
---     -- Whether the given byte is a continuation byte.
---     isContinuation byte = byte .&. 0xC0 == 0x80
--- 
---     -- The number of continuation bytes needed by the given
---     -- non-continuation byte. Returns -1 for an illegal UTF-8
---     -- non-continuation byte and the whole split quickly must fail so
---     -- as the input is passed to TE.decodeUtf8, which will issue a
---     -- suitable error.
---     required x0
---         | x0 .&. 0x80 == 0x00 = 0
---         | x0 .&. 0xE0 == 0xC0 = 1
---         | x0 .&. 0xF0 == 0xE0 = 2
---         | x0 .&. 0xF8 == 0xF0 = 3
---         | otherwise           = -1
--- 
---     splitQuickly bytes
---         | B.null l || req == -1 = Nothing
---         | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
---         | otherwise = Just (TE.decodeUtf8 l', r')
---       where
---         (l, r) = B.spanEnd isContinuation bytes
---         req = required (B.last l)
---         l' = B.init l
---         r' = B.cons (B.last l) r
-
-
-utf16_le :: Codec
-utf16_le = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-16-LE"
-    enc text = (TE.encodeUtf16LE text, Nothing)
-    dec bytes = case splitQuickly bytes of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf16LE bytes
-
-    splitQuickly bytes = maybeDecode (loop 0) where
-        maxN = B.length bytes
-
-        loop n |  n      == maxN = decodeAll
-               | (n + 1) == maxN = decodeTo n
-        loop n = let
-            req = utf16Required
-                (B.index bytes n)
-                (B.index bytes (n + 1))
-            decodeMore = loop $! n + req
-            in if n + req > maxN
-                then decodeTo n
-                else decodeMore
-
-        decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
-        decodeAll = (TE.decodeUtf16LE bytes, B.empty)
-
-utf16_be :: Codec
-utf16_be = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-16-BE"
-    enc text = (TE.encodeUtf16BE text, Nothing)
-    dec bytes = case splitQuickly bytes of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf16BE bytes
-
-    splitQuickly bytes = maybeDecode (loop 0) where
-        maxN = B.length bytes
-
-        loop n |  n      == maxN = decodeAll
-               | (n + 1) == maxN = decodeTo n
-        loop n = let
-            req = utf16Required
-                (B.index bytes (n + 1))
-                (B.index bytes n)
-            decodeMore = loop $! n + req
-            in if n + req > maxN
-                then decodeTo n
-                else decodeMore
-
-        decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
-        decodeAll = (TE.decodeUtf16BE bytes, B.empty)
-
-utf16Required :: Word8 -> Word8 -> Int
-utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
-    x :: Word16
-    x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
-
-
-utf32_le :: Codec
-utf32_le = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-32-LE"
-    enc text = (TE.encodeUtf32LE text, Nothing)
-    dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf32LE bs
-
-
-utf32_be :: Codec
-utf32_be = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-32-BE"
-    enc text = (TE.encodeUtf32BE text, Nothing)
-    dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf32BE bs
-
-utf32SplitBytes :: (ByteString -> Text)
-                -> ByteString
-                -> Maybe (Text, ByteString)
-utf32SplitBytes dec bytes = split where
-    split = maybeDecode (dec toDecode, extra)
-    len = B.length bytes
-    lenExtra = mod len 4
-
-    lenToDecode = len - lenExtra
-    (toDecode, extra) = if lenExtra == 0
-        then (bytes, B.empty)
-        else B.splitAt lenToDecode bytes
-
-
-tryEvaluate :: a -> Either Exc.SomeException a
-tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
-
-maybeDecode :: (a, b) -> Maybe (a, b)
-maybeDecode (a, b) = case tryEvaluate a of
-    Left _ -> Nothing
-    Right _ -> Just (a, b)
diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs

deleted file mode 100644 (file)

index b5d928a..0000000
--- a/Pipes/Text/Internal/Decoding.hs
+++ /dev/null
@@ -1,154 +0,0 @@
-{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
-{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
-{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
-
--- This module lifts assorted materials from Brian O'Sullivan's text package 
--- especially @Data.Text.Encoding@ in order to define a pipes-appropriate
--- 'streamDecodeUtf8'
-
-module Pipes.Text.Internal.Decoding 
-    ( Decoding(..)
-    , streamDecodeUtf8
-    , decodeSomeUtf8
-    ) where
-import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
-import Control.Monad.ST (ST, runST)
-import Data.Bits ((.&.))
-import Data.ByteString as B 
-import Data.ByteString (ByteString)
-import Data.ByteString.Internal as B 
-import Data.ByteString.Char8 as B8
-import Data.Text (Text)
-import qualified Data.Text as T 
-import qualified Data.Text.Encoding as TE 
-import Data.Text.Encoding.Error ()
-import Data.Text.Internal (Text, textP)
-import Foreign.C.Types (CSize)
-import Foreign.ForeignPtr (withForeignPtr)
-import Foreign.Marshal.Utils (with)
-import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
-import Foreign.Storable (Storable, peek, poke)
-import GHC.Base  (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
-import GHC.Word (Word8, Word32)
-import qualified Data.Text.Array as A
-import Data.Word (Word8, Word16)
-import System.IO.Unsafe (unsafePerformIO)
-import qualified Control.Exception as Exc
-import Data.Bits ((.&.), (.|.), shiftL)
-import Data.Typeable
-import Control.Arrow (first)
-import Data.Maybe (catMaybes)
-#include "pipes_text_cbits.h"
-
-
-
---  A stream oriented decoding result. Distinct from the similar type in Data.Text.Encoding
-
-data Decoding = Some Text ByteString (ByteString -> Decoding) 
-               -- Text, continuation and any undecoded fragment.
-              | Other Text ByteString  
-              --  Text followed by an undecodable ByteString
-              
-instance Show Decoding where
-    showsPrec d (Some t bs _) = showParen (d > prec) $
-                                showString "Some " . showsPrec prec' t .
-                                showChar ' ' . showsPrec prec' bs .
-                                showString " _"
-      where prec = 10; prec' = prec + 1
-    showsPrec d (Other t bs)  = showParen (d > prec) $
-                                showString "Other " . showsPrec prec' t .
-                                showChar ' ' . showsPrec prec' bs .
-                                showString " _"
-      where prec = 10; prec' = prec + 1
-
-newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
-newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
-
---  Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'. 
-streamDecodeUtf8 :: ByteString -> Decoding
-streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 
-  where
-  decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
-  decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = 
-                    runST $ do marray <- A.new (len+1) 
-                               unsafeIOToST (decodeChunkToBuffer marray)
-     where
-     decodeChunkToBuffer :: A.MArray s -> IO Decoding
-     decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
-       with (0::CSize) $ \destOffPtr ->
-       with codepoint0 $ \codepointPtr ->
-       with state0     $ \statePtr ->
-       with nullPtr    $ \curPtrPtr ->
-         do let end = ptr `plusPtr` (off + len)
-                curPtr = ptr `plusPtr` off
-            poke curPtrPtr curPtr
-            c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
-            state <- peek statePtr
-            lastPtr <- peek curPtrPtr
-            codepoint <- peek codepointPtr
-            n <- peek destOffPtr
-            chunkText <- mkText dest n
-            let left      = lastPtr `minusPtr` curPtr
-                remaining = B.drop left bs
-                accum = if T.null chunkText then B.append old remaining  else remaining 
-            return $! case state of 
-              UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
-              _ ->           Some  chunkText accum (decodeChunkUtf8 accum codepoint state)
-     {-# INLINE decodeChunkToBuffer #-}
-  {-# INLINE decodeChunkUtf8 #-}
-{-# INLINE streamDecodeUtf8 #-}
-
---  Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble
-decodeSomeUtf8 :: ByteString -> (Text, ByteString)
-decodeSomeUtf8 bs@(PS fp off len) = runST $ do 
-  dest <- A.new (len+1) 
-  unsafeIOToST $ 
-     withForeignPtr fp $ \ptr ->
-     with (0::CSize)        $ \destOffPtr ->
-     with (0::CodePoint)    $ \codepointPtr ->
-     with (0::DecoderState) $ \statePtr ->
-     with nullPtr           $ \curPtrPtr ->
-       do let end = ptr `plusPtr` (off + len)
-              curPtr = ptr `plusPtr` off
-          poke curPtrPtr curPtr
-          c_decode_utf8_with_state (A.maBA dest) destOffPtr 
-                                   curPtrPtr end codepointPtr statePtr
-          state <- peek statePtr
-          lastPtr <- peek curPtrPtr
-          codepoint <- peek codepointPtr
-          n <- peek destOffPtr
-          chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
-                                         return $! textP arr 0 (fromIntegral n)
-          let left      = lastPtr `minusPtr` curPtr
-              remaining = B.drop left bs
-          return $! (chunkText, remaining)
-{-# INLINE decodeSomeUtf8 #-}
-
-mkText :: A.MArray s -> CSize -> IO Text
-mkText dest n =  unsafeSTToIO $ do arr <- A.unsafeFreeze dest
-                                   return $! textP arr 0 (fromIntegral n)
-{-# INLINE mkText #-}
-
-ord :: Char -> Int
-ord (C# c#) = I# (ord# c#)
-{-# INLINE ord #-}
-
-unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
-unsafeWrite marr i c
-    | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
-                       return 1
-    | otherwise   = do A.unsafeWrite marr i lo
-                       A.unsafeWrite marr (i+1) hi
-                       return 2
-    where n = ord c
-          m = n - 0x10000
-          lo = fromIntegral $ (m `shiftR` 10) + 0xD800
-          hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
-          shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
-          {-# INLINE shiftR #-}
-{-# INLINE unsafeWrite #-}
-
-foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
-    :: MutableByteArray# s -> Ptr CSize
-    -> Ptr (Ptr Word8) -> Ptr Word8
-    -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
-\ No newline at end of file
diff --git a/cbits/cbits.c b/cbits/cbits.c

deleted file mode 100644 (file)

index c11645b..0000000
--- a/cbits/cbits.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
- *
- * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
- *
- * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
- */
-
-#include <string.h>
-#include <stdint.h>
-#include <stdio.h>
-#include "pipes_text_cbits.h"
-
-
-
-#define UTF8_ACCEPT 0
-#define UTF8_REJECT 12
-
-static const uint8_t utf8d[] = {
-  /*
-   * The first part of the table maps bytes to character classes that
-   * to reduce the size of the transition table and create bitmasks.
-   */
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
-   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
-
-  /*
-   * The second part is a transition table that maps a combination of
-   * a state of the automaton and a character class to a state.
-   */
-   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
-  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
-  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
-  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
-  12,36,12,12,12,12,12,12,12,12,12,12,
-};
-
-static inline uint32_t
-decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
-  uint32_t type = utf8d[byte];
-
-  *codep = (*state != UTF8_ACCEPT) ?
-    (byte & 0x3fu) | (*codep << 6) :
-    (0xff >> type) & (byte);
-
-  return *state = utf8d[256 + *state + type];
-}
-
-/*
- * A best-effort decoder. Runs until it hits either end of input or
- * the start of an invalid byte sequence.
- *
- * At exit, we update *destoff with the next offset to write to, *src
- * with the next source location past the last one successfully
- * decoded, and return the next source location to read from.
- *
- * Moreover, we expose the internal decoder state (state0 and
- * codepoint0), allowing one to restart the decoder after it
- * terminates (say, due to a partial codepoint).
- *
- * In particular, there are a few possible outcomes,
- *
- *   1) We decoded the buffer entirely:
- *      In this case we return srcend
- *      state0 == UTF8_ACCEPT
- *
- *   2) We met an invalid encoding
- *      In this case we return the address of the first invalid byte
- *      state0 == UTF8_REJECT
- *
- *   3) We reached the end of the buffer while decoding a codepoint
- *      In this case we return a pointer to the first byte of the partial codepoint
- *      state0 != UTF8_ACCEPT, UTF8_REJECT
- *
- */
-
- #if defined(__GNUC__) || defined(__clang__)
- static inline uint8_t const *
- _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
-                        const uint8_t const **src, const uint8_t const *srcend,
-                        uint32_t *codepoint0, uint32_t *state0)
-   __attribute((always_inline));
- #endif
-
-static inline uint8_t const *
-_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
-                        const uint8_t const **src, const uint8_t const *srcend,
-                        uint32_t *codepoint0, uint32_t *state0)
-{
- uint16_t *d = dest + *destoff;
- const uint8_t *s = *src, *last = *src;
- uint32_t state = *state0;
- uint32_t codepoint = *codepoint0;
-
- while (s < srcend) {
-#if defined(__i386__) || defined(__x86_64__)
-   /*
-    * This code will only work on a little-endian system that
-    * supports unaligned loads.
-    *
-    * It gives a substantial speed win on data that is purely or
-    * partly ASCII (e.g. HTML), at only a slight cost on purely
-    * non-ASCII text.
-    */
-
-   if (state == UTF8_ACCEPT) {
-     while (s < srcend - 4) {
-       codepoint = *((uint32_t *) s);
-       if ((codepoint & 0x80808080) != 0)
-         break;
-       s += 4;
-
-       /*
-        * Tried 32-bit stores here, but the extra bit-twiddling
-        * slowed the code down.
-        */
-
-       *d++ = (uint16_t) (codepoint & 0xff);
-       *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
-       *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
-       *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
-     }
-     last = s;
-   }
-#endif
-
-   if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
-     if (state != UTF8_REJECT)
-       continue;
-     break;
-   }
-
-   if (codepoint <= 0xffff)
-     *d++ = (uint16_t) codepoint;
-   else {
-     *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
-     *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
-   }
-   last = s;
- }
-
- *destoff = d - dest;
- *codepoint0 = codepoint;
- *state0 = state;
- *src = last;
-
- return s;
-}
-
-uint8_t const *
-_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
-                          const uint8_t const **src,
-                          const uint8_t const *srcend,
-                          uint32_t *codepoint0, uint32_t *state0)
-{
- uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
-                                               codepoint0, state0);
- if (*state0 == UTF8_REJECT)
-   ret -=1;
- return ret;
-}
-
diff --git a/include/pipes_text_cbits.h b/include/pipes_text_cbits.h

deleted file mode 100644 (file)

index b9ab670..0000000
--- a/include/pipes_text_cbits.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/*
- * Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>.
- */
-
-#ifndef _pipes_text_cbits_h
-#define _pipes_text_cbits_h
-
-#define UTF8_ACCEPT 0
-#define UTF8_REJECT 12
-
-#endif
diff --git a/pipes-text.cabal b/pipes-text.cabal

index a1d57bb3c914752e211c565d6db84ea62fba1394..017d41c3ddb719047a6fa039dd99d22bf4e86866 100644 (file)
--- a/pipes-text.cabal
+++ b/pipes-text.cabal
@@ -1,5 +1,5 @@
  name:                pipes-text
  name:                pipes-text
-version:             0.0.0.6
+version:             0.0.0.7
  synopsis:            Text pipes.
  description:         Many of the pipes and other operations defined here mirror those in
                       the `pipes-bytestring` library. Folds like `length` and grouping 
  synopsis:            Text pipes.
  description:         Many of the pipes and other operations defined here mirror those in
                       the `pipes-bytestring` library. Folds like `length` and grouping 
@@ -36,23 +36,28 @@ source-repository head
      type: git
      location: https://github.com/michaelt/text-pipes
  
      type: git
      location: https://github.com/michaelt/text-pipes
  
+flag noio
+  default: False
+  Description: Use a version of text earlier than 0.11.3
  
  library
  
  library
-  c-sources:    cbits/cbits.c
-  include-dirs: include
-  exposed-modules:     Pipes.Text, Pipes.Text.Internal
-  other-modules:       Pipes.Text.Internal.Decoding, Pipes.Text.Internal.Codec
-  other-extensions:    RankNTypes
-  build-depends:       base         >= 4       && < 5  ,
-                       bytestring >=0.10       && < 0.11,
-                       text >=0.11.3           && < 1.2,
-                       profunctors  >= 3.1.1   && < 4.1 ,
-                       pipes >=4.0             && < 4.2,
-                       pipes-group  >= 1.0.0   && < 1.1 ,
-                       pipes-parse >=2.0       && < 3.1,
+  exposed-modules:     Pipes.Text, Pipes.Text.Encoding
+  build-depends:       base >= 4                  && < 5  ,
+                       bytestring >= 0.9                  ,
+                       text >=0.11.3              && < 1.2,
+                       text-stream-decode >= 0.1  && < 0.2,  
+                       profunctors  >= 3.1.1      && < 4.1,
+                       pipes >=4.0                && < 4.2,
+                       pipes-group  >= 1.0.0      && < 1.1,
+                       pipes-parse >=2.0          && < 3.1,
                         pipes-safe, 
                         pipes-safe, 
-                       pipes-bytestring >= 1.0 && < 2.1,
-                       transformers >= 0.2.0.0 && < 0.4
-  -- hs-source-dirs:      
+                       pipes-bytestring >= 1.0    && < 2.1,
+                       transformers >= 0.2.0.0    && < 0.4
+  other-extensions:    RankNTypes
    default-language:    Haskell2010
    default-language:    Haskell2010
-  ghc-options: -O2 
+  ghc-options: -O2
+
+  if !flag(noio)
+    exposed-modules:   Pipes.Text.IO
+    build-depends:     text >=0.11.3              && < 1.2
+
author	michaelt <what_is_it_to_do_anything@yahoo.com>
	Sat, 15 Feb 2014 15:01:48 +0000 (10:01 -0500)
committer	michaelt <what_is_it_to_do_anything@yahoo.com>
	Sat, 15 Feb 2014 15:01:48 +0000 (10:01 -0500)
Pipes/Text.hs		patch \| blob \| blame \| history
Pipes/Text/Encoding.hs	[new file with mode: 0644]	patch \| blob
Pipes/Text/IO.hs	[new file with mode: 0644]	patch \| blob
Pipes/Text/Internal.hs	[deleted file]	patch \| blob \| blame \| history
Pipes/Text/Internal/Codec.hs	[deleted file]	patch \| blob \| blame \| history
Pipes/Text/Internal/Decoding.hs	[deleted file]	patch \| blob \| blame \| history
cbits/cbits.c	[deleted file]	patch \| blob \| blame \| history
include/pipes_text_cbits.h	[deleted file]	patch \| blob \| blame \| history
pipes-text.cabal		patch \| blob \| blame \| history