4 files changed, 255 insertions, 228 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs
index 71b1316..9ed0d78 100644
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -181,7 +181,7 @@ import qualified GHC.IO.Exception as G
 import Pipes
 import qualified Pipes.ByteString as PB
 import qualified Pipes.Text.Internal as PE
-import Pipes.Text.Internal (Codec(..))
+import Pipes.Text.Codec (Codec(..))
 import Pipes.Core (respond, Server')
 import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..))
 import qualified Pipes.Group as PG
@@ -243,8 +243,8 @@ stdin = fromHandle IO.stdin
 fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()
 fromHandle h =  go where
      go = do txt <- liftIO (T.hGetChunk h)
-              unless (T.null txt) $ do yield txt
+              unless (T.null txt) ( do yield txt
-                                       go
+                                       go )
 {-# INLINABLE fromHandle#-}
@@ -258,7 +258,9 @@ readFile :: MonadSafe m => FilePath -> Producer Text m ()
 readFile file = Safe.withFile file IO.ReadMode fromHandle
 {-# INLINE readFile #-}
-{-| Stream lines of text from stdin (for testing in ghci etc.) 
+{-| Crudely stream lines of input from stdin in the style of Pipes.Prelude. 
+    This is for testing in ghci etc.; obviously it will be unsound if used to recieve
+    the contents of immense files with few newlines.
 >>> let safely = runSafeT . runEffect
 >>> safely $ for Text.stdinLn (lift . lift . print . T.length)
@@ -282,8 +284,8 @@ stdinLn = go where
    Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe.
-    Note: For best performance, use @(for source (liftIO . putStr))@ instead of
+    Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@ 
-    @(source >-> stdout)@ in suitable cases.
+    instead of @(source >-> stdout)@ .
 -}
 stdout :: MonadIO m => Consumer' Text m ()
 stdout = go
@@ -704,11 +706,8 @@ isEndOfChars = do
 {-# INLINABLE isEndOfChars #-}
+-- | An improper lens into a stream of 'ByteString' expected to be UTF-8 encoded; the associated
+-- stream of Text ends by returning a stream of ByteStrings beginning at the point of failure.
-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded into a Pipe of Text
-- returning a Pipe of ByteStrings that begins at the point of failure.
 decodeUtf8 :: Monad m => Lens' (Producer ByteString m r) 
                               (Producer Text m (Producer ByteString m r))
@@ -716,10 +715,10 @@ decodeUtf8 k p0 = fmap (\p -> join  (for p (yield . TE.encodeUtf8)))
                       (k (go B.empty PE.streamDecodeUtf8 p0)) where
  go !carry dec0 p = do 
     x <- lift (next p) 
-     case x of Left r -> if B.null carry 
+     case x of Left r -> return (if B.null carry 
-                           then return (return r)      -- all bytestrinput was consumed
+                                    then return r -- all bytestring input was consumed
-                           else return (do yield carry -- a potentially valid fragment remains
+                                    else (do yield carry -- a potentially valid fragment remains
-                                           return r)
+                                             return r))
                                           
               Right (chunk, p') -> case dec0 chunk of 
                   PE.Some text carry2 dec -> do yield text
diff --git a/Pipes/Text/Codec.hs b/Pipes/Text/Codec.hs
new file mode 100644
index 0000000..e4357b9
--- /dev/null
+++ b/Pipes/Text/Codec.hs
@@ -0,0 +1,240 @@
+{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
+-- |
+-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
+-- License: MIT
+--
+-- Handle streams of text.
+--
+-- Parts of this code were taken from enumerator and conduits, and adapted for pipes.
+module Pipes.Text.Codec
+    ( Decoding(..)
+    , streamDecodeUtf8
+    , decodeSomeUtf8
+    , Codec(..)
+    , TextException(..)
+    , utf8
+    ) where
+import Data.Bits ((.&.))
+import Data.Char (ord)
+import Data.ByteString as B 
+import Data.ByteString (ByteString)
+import Data.ByteString.Internal as B 
+import Data.ByteString.Char8 as B8
+import Data.Text (Text)
+import qualified Data.Text as T 
+import qualified Data.Text.Encoding as TE 
+import Data.Text.Encoding.Error ()
+import GHC.Word (Word8, Word32)
+import qualified Data.Text.Array as A
+import Data.Word (Word8, Word16)
+import System.IO.Unsafe (unsafePerformIO)
+import qualified Control.Exception as Exc
+import Data.Bits ((.&.), (.|.), shiftL)
+import Data.Typeable
+import Control.Arrow (first)
+import Data.Maybe (catMaybes)
+import Pipes.Text.Internal
+-- | A specific character encoding.
+--
+-- Since 0.3.0
+data Codec = Codec
+  { codecName :: Text
+  , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
+  , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
+  }
+instance Show Codec where
+    showsPrec d c = showParen (d > 10) $ 
+                    showString "Codec " . shows (codecName c)
+data TextException = DecodeException Codec Word8
+                   | EncodeException Codec Char
+                   | LengthExceeded Int
+                   | TextException Exc.SomeException
+    deriving (Show, Typeable)
+instance Exc.Exception TextException
+toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
+           -> (ByteString -> Decoding)
+toDecoding op = loop B.empty where
+  loop extra bs0 = case op (B.append extra bs0) of
+                    (txt, Right bs) -> Some txt bs (loop bs)
+                    (txt, Left (_,bs)) -> Other txt bs
+splitSlowly :: (ByteString -> Text)
+            -> ByteString 
+            -> (Text, Either (TextException, ByteString) ByteString)
+splitSlowly dec bytes = valid where
+    valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
+    splits 0 = [(B.empty, bytes)]
+    splits n = B.splitAt n bytes : splits (n - 1)
+    decFirst (a, b) = case tryEvaluate (dec a) of
+        Left _ -> Nothing
+        Right text -> let trouble = case tryEvaluate (dec b) of
+                            Left exc -> Left (TextException exc, b)
+                            Right _  -> Right B.empty 
+                      in Just (text, trouble) -- this case shouldn't occur, 
+                                      -- since splitSlowly is only called
+                                      -- when parsing failed somewhere
+utf8 :: Codec
+utf8 = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-8"
+    enc text = (TE.encodeUtf8 text, Nothing)
+    dec bytes = case decodeSomeUtf8 bytes of 
+      (t,b) -> (t, Right b)
+--     -- Whether the given byte is a continuation byte.
+--     isContinuation byte = byte .&. 0xC0 == 0x80
+-- 
+--     -- The number of continuation bytes needed by the given
+--     -- non-continuation byte. Returns -1 for an illegal UTF-8
+--     -- non-continuation byte and the whole split quickly must fail so
+--     -- as the input is passed to TE.decodeUtf8, which will issue a
+--     -- suitable error.
+--     required x0
+--         | x0 .&. 0x80 == 0x00 = 0
+--         | x0 .&. 0xE0 == 0xC0 = 1
+--         | x0 .&. 0xF0 == 0xE0 = 2
+--         | x0 .&. 0xF8 == 0xF0 = 3
+--         | otherwise           = -1
+-- 
+--     splitQuickly bytes
+--         | B.null l || req == -1 = Nothing
+--         | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
+--         | otherwise = Just (TE.decodeUtf8 l', r')
+--       where
+--         (l, r) = B.spanEnd isContinuation bytes
+--         req = required (B.last l)
+--         l' = B.init l
+--         r' = B.cons (B.last l) r
+utf16_le :: Codec
+utf16_le = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-16-LE"
+    enc text = (TE.encodeUtf16LE text, Nothing)
+    dec bytes = case splitQuickly bytes of
+        Just (text, extra) -> (text, Right extra)
+        Nothing -> splitSlowly TE.decodeUtf16LE bytes
+    splitQuickly bytes = maybeDecode (loop 0) where
+        maxN = B.length bytes
+        loop n |  n      == maxN = decodeAll
+               | (n + 1) == maxN = decodeTo n
+        loop n = let
+            req = utf16Required
+                (B.index bytes n)
+                (B.index bytes (n + 1))
+            decodeMore = loop $! n + req
+            in if n + req > maxN
+                then decodeTo n
+                else decodeMore
+        decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
+        decodeAll = (TE.decodeUtf16LE bytes, B.empty)
+utf16_be :: Codec
+utf16_be = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-16-BE"
+    enc text = (TE.encodeUtf16BE text, Nothing)
+    dec bytes = case splitQuickly bytes of
+        Just (text, extra) -> (text, Right extra)
+        Nothing -> splitSlowly TE.decodeUtf16BE bytes
+    splitQuickly bytes = maybeDecode (loop 0) where
+        maxN = B.length bytes
+        loop n |  n      == maxN = decodeAll
+               | (n + 1) == maxN = decodeTo n
+        loop n = let
+            req = utf16Required
+                (B.index bytes (n + 1))
+                (B.index bytes n)
+            decodeMore = loop $! n + req
+            in if n + req > maxN
+                then decodeTo n
+                else decodeMore
+        decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
+        decodeAll = (TE.decodeUtf16BE bytes, B.empty)
+utf16Required :: Word8 -> Word8 -> Int
+utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
+    x :: Word16
+    x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
+utf32_le :: Codec
+utf32_le = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-32-LE"
+    enc text = (TE.encodeUtf32LE text, Nothing)
+    dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
+        Just (text, extra) -> (text, Right extra)
+        Nothing -> splitSlowly TE.decodeUtf32LE bs
+utf32_be :: Codec
+utf32_be = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-32-BE"
+    enc text = (TE.encodeUtf32BE text, Nothing)
+    dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
+        Just (text, extra) -> (text, Right extra)
+        Nothing -> splitSlowly TE.decodeUtf32BE bs
+utf32SplitBytes :: (ByteString -> Text)
+                -> ByteString
+                -> Maybe (Text, ByteString)
+utf32SplitBytes dec bytes = split where
+    split = maybeDecode (dec toDecode, extra)
+    len = B.length bytes
+    lenExtra = mod len 4
+    lenToDecode = len - lenExtra
+    (toDecode, extra) = if lenExtra == 0
+        then (bytes, B.empty)
+        else B.splitAt lenToDecode bytes
+ascii :: Codec
+ascii = Codec name enc (toDecoding dec) where
+    name = T.pack "ASCII"
+    enc text = (bytes, extra) where
+        (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
+        bytes = B8.pack (T.unpack safe)
+        extra = if T.null unsafe
+            then Nothing
+            else Just (EncodeException ascii (T.head unsafe), unsafe)
+    dec bytes = (text, extra) where
+        (safe, unsafe) = B.span (<= 0x7F) bytes
+        text = T.pack (B8.unpack safe)
+        extra = if B.null unsafe
+            then Right B.empty
+            else Left (DecodeException ascii (B.head unsafe), unsafe)
+iso8859_1 :: Codec
+iso8859_1 = Codec name enc (toDecoding dec) where
+    name = T.pack "ISO-8859-1"
+    enc text = (bytes, extra) where
+        (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
+        bytes = B8.pack (T.unpack safe)
+        extra = if T.null unsafe
+            then Nothing
+            else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
+    dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
+tryEvaluate :: a -> Either Exc.SomeException a
+tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
+maybeDecode :: (a, b) -> Maybe (a, b)
+maybeDecode (a, b) = case tryEvaluate a of
+    Left _ -> Nothing
+    Right _ -> Just (a, b)
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
index 76c2f4f..bcee278 100644
--- a/Pipes/Text/Internal.hs
+++ b/Pipes/Text/Internal.hs
@@ -9,9 +9,6 @@ module Pipes.Text.Internal
    ( Decoding(..)
    , streamDecodeUtf8
    , decodeSomeUtf8
-    , Codec(..)
-    , TextException(..)
-    , utf8
    ) where
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 import Control.Monad.ST (ST, runST)
@@ -43,215 +40,6 @@ import Data.Maybe (catMaybes)
 #include "pipes_text_cbits.h"
-- | A specific character encoding.
--
-- Since 0.3.0
-data Codec = Codec
-  { codecName :: Text
-  , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
-  , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
-  }
-instance Show Codec where
-    showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c)
-- Since 0.3.0
-data TextException = DecodeException Codec Word8
-                   | EncodeException Codec Char
-                   | LengthExceeded Int
-                   | TextException Exc.SomeException
-    deriving (Show, Typeable)
-instance Exc.Exception TextException
-toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
-           -> (ByteString -> Decoding)
-toDecoding op = loop B.empty where
-  loop extra bs0 = case op (B.append extra bs0) of
-                    (txt, Right bs) -> Some txt bs (loop bs)
-                    (txt, Left (_,bs)) -> Other txt bs
-splitSlowly :: (ByteString -> Text)
-            -> ByteString 
-            -> (Text, Either (TextException, ByteString) ByteString)
-splitSlowly dec bytes = valid where
-    valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
-    splits 0 = [(B.empty, bytes)]
-    splits n = B.splitAt n bytes : splits (n - 1)
-    decFirst (a, b) = case tryEvaluate (dec a) of
-        Left _ -> Nothing
-        Right text -> let trouble = case tryEvaluate (dec b) of
-                            Left exc -> Left (TextException exc, b)
-                            Right _  -> Right B.empty 
-                      in Just (text, trouble)
-                                      -- this case shouldn't occur, 
-                                      -- since splitSlowly is only called
-                                      -- when parsing failed somewhere
-utf8 :: Codec
-utf8 = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-8"
-    enc text = (TE.encodeUtf8 text, Nothing)
-    dec bytes = case decodeSomeUtf8 bytes of 
-      (t,b) -> (t, Right b)
--     -- Whether the given byte is a continuation byte.
--     isContinuation byte = byte .&. 0xC0 == 0x80
-- 
--     -- The number of continuation bytes needed by the given
--     -- non-continuation byte. Returns -1 for an illegal UTF-8
--     -- non-continuation byte and the whole split quickly must fail so
--     -- as the input is passed to TE.decodeUtf8, which will issue a
--     -- suitable error.
--     required x0
--         | x0 .&. 0x80 == 0x00 = 0
--         | x0 .&. 0xE0 == 0xC0 = 1
--         | x0 .&. 0xF0 == 0xE0 = 2
--         | x0 .&. 0xF8 == 0xF0 = 3
--         | otherwise           = -1
-- 
--     splitQuickly bytes
--         | B.null l || req == -1 = Nothing
--         | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
--         | otherwise = Just (TE.decodeUtf8 l', r')
--       where
--         (l, r) = B.spanEnd isContinuation bytes
--         req = required (B.last l)
--         l' = B.init l
--         r' = B.cons (B.last l) r
-- |
-- Since 0.3.0
-utf16_le :: Codec
-utf16_le = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-16-LE"
-    enc text = (TE.encodeUtf16LE text, Nothing)
-    dec bytes = case splitQuickly bytes of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf16LE bytes
-    splitQuickly bytes = maybeDecode (loop 0) where
-        maxN = B.length bytes
-        loop n |  n      == maxN = decodeAll
-               | (n + 1) == maxN = decodeTo n
-        loop n = let
-            req = utf16Required
-                (B.index bytes n)
-                (B.index bytes (n + 1))
-            decodeMore = loop $! n + req
-            in if n + req > maxN
-                then decodeTo n
-                else decodeMore
-        decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
-        decodeAll = (TE.decodeUtf16LE bytes, B.empty)
-- |
-- Since 0.3.0
-utf16_be :: Codec
-utf16_be = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-16-BE"
-    enc text = (TE.encodeUtf16BE text, Nothing)
-    dec bytes = case splitQuickly bytes of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf16BE bytes
-    splitQuickly bytes = maybeDecode (loop 0) where
-        maxN = B.length bytes
-        loop n |  n      == maxN = decodeAll
-               | (n + 1) == maxN = decodeTo n
-        loop n = let
-            req = utf16Required
-                (B.index bytes (n + 1))
-                (B.index bytes n)
-            decodeMore = loop $! n + req
-            in if n + req > maxN
-                then decodeTo n
-                else decodeMore
-        decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
-        decodeAll = (TE.decodeUtf16BE bytes, B.empty)
-utf16Required :: Word8 -> Word8 -> Int
-utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
-    x :: Word16
-    x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
-- |
-- Since 0.3.0
-utf32_le :: Codec
-utf32_le = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-32-LE"
-    enc text = (TE.encodeUtf32LE text, Nothing)
-    dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf32LE bs
-- |
-- Since 0.3.0
-utf32_be :: Codec
-utf32_be = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-32-BE"
-    enc text = (TE.encodeUtf32BE text, Nothing)
-    dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf32BE bs
-utf32SplitBytes :: (ByteString -> Text)
-                -> ByteString
-                -> Maybe (Text, ByteString)
-utf32SplitBytes dec bytes = split where
-    split = maybeDecode (dec toDecode, extra)
-    len = B.length bytes
-    lenExtra = mod len 4
-    lenToDecode = len - lenExtra
-    (toDecode, extra) = if lenExtra == 0
-        then (bytes, B.empty)
-        else B.splitAt lenToDecode bytes
-- |
-- Since 0.3.0
-ascii :: Codec
-ascii = Codec name enc (toDecoding dec) where
-    name = T.pack "ASCII"
-    enc text = (bytes, extra) where
-        (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
-        bytes = B8.pack (T.unpack safe)
-        extra = if T.null unsafe
-            then Nothing
-            else Just (EncodeException ascii (T.head unsafe), unsafe)
-    dec bytes = (text, extra) where
-        (safe, unsafe) = B.span (<= 0x7F) bytes
-        text = T.pack (B8.unpack safe)
-        extra = if B.null unsafe
-            then Right B.empty
-            else Left (DecodeException ascii (B.head unsafe), unsafe)
-- |
-- Since 0.3.0
-iso8859_1 :: Codec
-iso8859_1 = Codec name enc (toDecoding dec) where
-    name = T.pack "ISO-8859-1"
-    enc text = (bytes, extra) where
-        (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
-        bytes = B8.pack (T.unpack safe)
-        extra = if T.null unsafe
-            then Nothing
-            else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
-    dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
-tryEvaluate :: a -> Either Exc.SomeException a
-tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
-maybeDecode :: (a, b) -> Maybe (a, b)
-maybeDecode (a, b) = case tryEvaluate a of
-    Left _ -> Nothing
-    Right _ -> Just (a, b)
 -- | A stream oriented decoding result.
 data Decoding = Some Text ByteString (ByteString -> Decoding)
diff --git a/pipes-text.cabal b/pipes-text.cabal
index 33cbab4..44bc551 100644
--- a/pipes-text.cabal
+++ b/pipes-text.cabal
@@ -14,7 +14,7 @@ cabal-version:       >=1.10
 library
  c-sources:    cbits/cbits.c
  include-dirs: include
-  exposed-modules:     Pipes.Text, Pipes.Text.Internal
+  exposed-modules:     Pipes.Text, Pipes.Text.Internal, Pipes.Text.Codec
  -- other-modules:       
  other-extensions:    RankNTypes
  build-depends:       base         >= 4       && < 5  ,

diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 71b1316..9ed0d78 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs
@@ -181,7 +181,7 @@ import qualified GHC.IO.Exception as G
181	import Pipes	181	import Pipes
182	import qualified Pipes.ByteString as PB	182	import qualified Pipes.ByteString as PB
183	import qualified Pipes.Text.Internal as PE	183	import qualified Pipes.Text.Internal as PE
184	import Pipes.Text.Internal (Codec(..))	184	import Pipes.Text.Codec (Codec(..))
185	import Pipes.Core (respond, Server')	185	import Pipes.Core (respond, Server')
186	import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..))	186	import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..))
187	import qualified Pipes.Group as PG	187	import qualified Pipes.Group as PG
@@ -243,8 +243,8 @@ stdin = fromHandle IO.stdin
243	fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()	243	fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()
244	fromHandle h = go where	244	fromHandle h = go where
245	go = do txt <- liftIO (T.hGetChunk h)	245	go = do txt <- liftIO (T.hGetChunk h)
246	unless (T.null txt) $ do yield txt	246	unless (T.null txt) ( do yield txt
247	go	247	go )
248	{-# INLINABLE fromHandle#-}	248	{-# INLINABLE fromHandle#-}
249		249
250		250
@@ -258,7 +258,9 @@ readFile :: MonadSafe m => FilePath -> Producer Text m ()
258	readFile file = Safe.withFile file IO.ReadMode fromHandle	258	readFile file = Safe.withFile file IO.ReadMode fromHandle
259	{-# INLINE readFile #-}	259	{-# INLINE readFile #-}
260		260
261	{-\| Stream lines of text from stdin (for testing in ghci etc.)	261	{-\| Crudely stream lines of input from stdin in the style of Pipes.Prelude.
		262	This is for testing in ghci etc.; obviously it will be unsound if used to recieve
		263	the contents of immense files with few newlines.
262		264
263	>>> let safely = runSafeT . runEffect	265	>>> let safely = runSafeT . runEffect
264	>>> safely $ for Text.stdinLn (lift . lift . print . T.length)	266	>>> safely $ for Text.stdinLn (lift . lift . print . T.length)
@@ -282,8 +284,8 @@ stdinLn = go where
282		284
283	Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe.	285	Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe.
284		286
285	Note: For best performance, use @(for source (liftIO . putStr))@ instead of	287	Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@
286	@(source >-> stdout)@ in suitable cases.	288	instead of @(source >-> stdout)@ .
287	-}	289	-}
288	stdout :: MonadIO m => Consumer' Text m ()	290	stdout :: MonadIO m => Consumer' Text m ()
289	stdout = go	291	stdout = go
@@ -704,11 +706,8 @@ isEndOfChars = do
704	{-# INLINABLE isEndOfChars #-}	706	{-# INLINABLE isEndOfChars #-}
705		707
706		708
707		709	-- \| An improper lens into a stream of 'ByteString' expected to be UTF-8 encoded; the associated
708		710	-- stream of Text ends by returning a stream of ByteStrings beginning at the point of failure.
709
710	-- \| Transform a Pipe of 'ByteString's expected to be UTF-8 encoded into a Pipe of Text
711	-- returning a Pipe of ByteStrings that begins at the point of failure.
712		711
713	decodeUtf8 :: Monad m => Lens' (Producer ByteString m r)	712	decodeUtf8 :: Monad m => Lens' (Producer ByteString m r)
714	(Producer Text m (Producer ByteString m r))	713	(Producer Text m (Producer ByteString m r))
@@ -716,10 +715,10 @@ decodeUtf8 k p0 = fmap (\p -> join (for p (yield . TE.encodeUtf8)))
716	(k (go B.empty PE.streamDecodeUtf8 p0)) where	715	(k (go B.empty PE.streamDecodeUtf8 p0)) where
717	go !carry dec0 p = do	716	go !carry dec0 p = do
718	x <- lift (next p)	717	x <- lift (next p)
719	case x of Left r -> if B.null carry	718	case x of Left r -> return (if B.null carry
720	then return (return r) -- all bytestrinput was consumed	719	then return r -- all bytestring input was consumed
721	else return (do yield carry -- a potentially valid fragment remains	720	else (do yield carry -- a potentially valid fragment remains
722	return r)	721	return r))
723		722
724	Right (chunk, p') -> case dec0 chunk of	723	Right (chunk, p') -> case dec0 chunk of
725	PE.Some text carry2 dec -> do yield text	724	PE.Some text carry2 dec -> do yield text


diff --git a/Pipes/Text/Codec.hs b/Pipes/Text/Codec.hs new file mode 100644 index 0000000..e4357b9 --- /dev/null +++ b/Pipes/Text/Codec.hs
@@ -0,0 +1,240 @@
		1
		2	{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
		3	-- \|
		4	-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
		5	-- License: MIT
		6	--
		7	-- Handle streams of text.
		8	--
		9	-- Parts of this code were taken from enumerator and conduits, and adapted for pipes.
		10
		11	module Pipes.Text.Codec
		12	( Decoding(..)
		13	, streamDecodeUtf8
		14	, decodeSomeUtf8
		15	, Codec(..)
		16	, TextException(..)
		17	, utf8
		18	) where
		19
		20	import Data.Bits ((.&.))
		21	import Data.Char (ord)
		22	import Data.ByteString as B
		23	import Data.ByteString (ByteString)
		24	import Data.ByteString.Internal as B
		25	import Data.ByteString.Char8 as B8
		26	import Data.Text (Text)
		27	import qualified Data.Text as T
		28	import qualified Data.Text.Encoding as TE
		29	import Data.Text.Encoding.Error ()
		30	import GHC.Word (Word8, Word32)
		31	import qualified Data.Text.Array as A
		32	import Data.Word (Word8, Word16)
		33	import System.IO.Unsafe (unsafePerformIO)
		34	import qualified Control.Exception as Exc
		35	import Data.Bits ((.&.), (.\|.), shiftL)
		36	import Data.Typeable
		37	import Control.Arrow (first)
		38	import Data.Maybe (catMaybes)
		39	import Pipes.Text.Internal
		40
		41	-- \| A specific character encoding.
		42	--
		43	-- Since 0.3.0
		44	data Codec = Codec
		45	{ codecName :: Text
		46	, codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
		47	, codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
		48	}
		49
		50	instance Show Codec where
		51	showsPrec d c = showParen (d > 10) $
		52	showString "Codec " . shows (codecName c)
		53
		54	data TextException = DecodeException Codec Word8
		55	\| EncodeException Codec Char
		56	\| LengthExceeded Int
		57	\| TextException Exc.SomeException
		58	deriving (Show, Typeable)
		59	instance Exc.Exception TextException
		60
		61
		62	toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
		63	-> (ByteString -> Decoding)
		64	toDecoding op = loop B.empty where
		65	loop extra bs0 = case op (B.append extra bs0) of
		66	(txt, Right bs) -> Some txt bs (loop bs)
		67	(txt, Left (_,bs)) -> Other txt bs
		68
		69
		70	splitSlowly :: (ByteString -> Text)
		71	-> ByteString
		72	-> (Text, Either (TextException, ByteString) ByteString)
		73	splitSlowly dec bytes = valid where
		74	valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
		75	splits 0 = [(B.empty, bytes)]
		76	splits n = B.splitAt n bytes : splits (n - 1)
		77	decFirst (a, b) = case tryEvaluate (dec a) of
		78	Left _ -> Nothing
		79	Right text -> let trouble = case tryEvaluate (dec b) of
		80	Left exc -> Left (TextException exc, b)
		81	Right _ -> Right B.empty
		82	in Just (text, trouble) -- this case shouldn't occur,
		83	-- since splitSlowly is only called
		84	-- when parsing failed somewhere
		85
		86	utf8 :: Codec
		87	utf8 = Codec name enc (toDecoding dec) where
		88	name = T.pack "UTF-8"
		89	enc text = (TE.encodeUtf8 text, Nothing)
		90	dec bytes = case decodeSomeUtf8 bytes of
		91	(t,b) -> (t, Right b)
		92
		93	-- -- Whether the given byte is a continuation byte.
		94	-- isContinuation byte = byte .&. 0xC0 == 0x80
		95	--
		96	-- -- The number of continuation bytes needed by the given
		97	-- -- non-continuation byte. Returns -1 for an illegal UTF-8
		98	-- -- non-continuation byte and the whole split quickly must fail so
		99	-- -- as the input is passed to TE.decodeUtf8, which will issue a
		100	-- -- suitable error.
		101	-- required x0
		102	-- \| x0 .&. 0x80 == 0x00 = 0
		103	-- \| x0 .&. 0xE0 == 0xC0 = 1
		104	-- \| x0 .&. 0xF0 == 0xE0 = 2
		105	-- \| x0 .&. 0xF8 == 0xF0 = 3
		106	-- \| otherwise = -1
		107	--
		108	-- splitQuickly bytes
		109	-- \| B.null l \|\| req == -1 = Nothing
		110	-- \| req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
		111	-- \| otherwise = Just (TE.decodeUtf8 l', r')
		112	-- where
		113	-- (l, r) = B.spanEnd isContinuation bytes
		114	-- req = required (B.last l)
		115	-- l' = B.init l
		116	-- r' = B.cons (B.last l) r
		117
		118
		119	utf16_le :: Codec
		120	utf16_le = Codec name enc (toDecoding dec) where
		121	name = T.pack "UTF-16-LE"
		122	enc text = (TE.encodeUtf16LE text, Nothing)
		123	dec bytes = case splitQuickly bytes of
		124	Just (text, extra) -> (text, Right extra)
		125	Nothing -> splitSlowly TE.decodeUtf16LE bytes
		126
		127	splitQuickly bytes = maybeDecode (loop 0) where
		128	maxN = B.length bytes
		129
		130	loop n \| n == maxN = decodeAll
		131	\| (n + 1) == maxN = decodeTo n
		132	loop n = let
		133	req = utf16Required
		134	(B.index bytes n)
		135	(B.index bytes (n + 1))
		136	decodeMore = loop $! n + req
		137	in if n + req > maxN
		138	then decodeTo n
		139	else decodeMore
		140
		141	decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
		142	decodeAll = (TE.decodeUtf16LE bytes, B.empty)
		143
		144	utf16_be :: Codec
		145	utf16_be = Codec name enc (toDecoding dec) where
		146	name = T.pack "UTF-16-BE"
		147	enc text = (TE.encodeUtf16BE text, Nothing)
		148	dec bytes = case splitQuickly bytes of
		149	Just (text, extra) -> (text, Right extra)
		150	Nothing -> splitSlowly TE.decodeUtf16BE bytes
		151
		152	splitQuickly bytes = maybeDecode (loop 0) where
		153	maxN = B.length bytes
		154
		155	loop n \| n == maxN = decodeAll
		156	\| (n + 1) == maxN = decodeTo n
		157	loop n = let
		158	req = utf16Required
		159	(B.index bytes (n + 1))
		160	(B.index bytes n)
		161	decodeMore = loop $! n + req
		162	in if n + req > maxN
		163	then decodeTo n
		164	else decodeMore
		165
		166	decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
		167	decodeAll = (TE.decodeUtf16BE bytes, B.empty)
		168
		169	utf16Required :: Word8 -> Word8 -> Int
		170	utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
		171	x :: Word16
		172	x = (fromIntegral x1 `shiftL` 8) .\|. fromIntegral x0
		173
		174
		175	utf32_le :: Codec
		176	utf32_le = Codec name enc (toDecoding dec) where
		177	name = T.pack "UTF-32-LE"
		178	enc text = (TE.encodeUtf32LE text, Nothing)
		179	dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
		180	Just (text, extra) -> (text, Right extra)
		181	Nothing -> splitSlowly TE.decodeUtf32LE bs
		182
		183
		184	utf32_be :: Codec
		185	utf32_be = Codec name enc (toDecoding dec) where
		186	name = T.pack "UTF-32-BE"
		187	enc text = (TE.encodeUtf32BE text, Nothing)
		188	dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
		189	Just (text, extra) -> (text, Right extra)
		190	Nothing -> splitSlowly TE.decodeUtf32BE bs
		191
		192	utf32SplitBytes :: (ByteString -> Text)
		193	-> ByteString
		194	-> Maybe (Text, ByteString)
		195	utf32SplitBytes dec bytes = split where
		196	split = maybeDecode (dec toDecode, extra)
		197	len = B.length bytes
		198	lenExtra = mod len 4
		199
		200	lenToDecode = len - lenExtra
		201	(toDecode, extra) = if lenExtra == 0
		202	then (bytes, B.empty)
		203	else B.splitAt lenToDecode bytes
		204
		205	ascii :: Codec
		206	ascii = Codec name enc (toDecoding dec) where
		207	name = T.pack "ASCII"
		208	enc text = (bytes, extra) where
		209	(safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
		210	bytes = B8.pack (T.unpack safe)
		211	extra = if T.null unsafe
		212	then Nothing
		213	else Just (EncodeException ascii (T.head unsafe), unsafe)
		214
		215	dec bytes = (text, extra) where
		216	(safe, unsafe) = B.span (<= 0x7F) bytes
		217	text = T.pack (B8.unpack safe)
		218	extra = if B.null unsafe
		219	then Right B.empty
		220	else Left (DecodeException ascii (B.head unsafe), unsafe)
		221
		222	iso8859_1 :: Codec
		223	iso8859_1 = Codec name enc (toDecoding dec) where
		224	name = T.pack "ISO-8859-1"
		225	enc text = (bytes, extra) where
		226	(safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
		227	bytes = B8.pack (T.unpack safe)
		228	extra = if T.null unsafe
		229	then Nothing
		230	else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
		231
		232	dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
		233
		234	tryEvaluate :: a -> Either Exc.SomeException a
		235	tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
		236
		237	maybeDecode :: (a, b) -> Maybe (a, b)
		238	maybeDecode (a, b) = case tryEvaluate a of
		239	Left _ -> Nothing
		240	Right _ -> Just (a, b)


diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index 76c2f4f..bcee278 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs
@@ -9,9 +9,6 @@ module Pipes.Text.Internal
9	( Decoding(..)	9	( Decoding(..)
10	, streamDecodeUtf8	10	, streamDecodeUtf8
11	, decodeSomeUtf8	11	, decodeSomeUtf8
12	, Codec(..)
13	, TextException(..)
14	, utf8
15	) where	12	) where
16	import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)	13	import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
17	import Control.Monad.ST (ST, runST)	14	import Control.Monad.ST (ST, runST)
@@ -43,215 +40,6 @@ import Data.Maybe (catMaybes)
43	#include "pipes_text_cbits.h"	40	#include "pipes_text_cbits.h"
44		41
45		42
46	-- \| A specific character encoding.
47	--
48	-- Since 0.3.0
49	data Codec = Codec
50	{ codecName :: Text
51	, codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
52	, codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
53	}
54
55	instance Show Codec where
56	showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c)
57
58	-- Since 0.3.0
59	data TextException = DecodeException Codec Word8
60	\| EncodeException Codec Char
61	\| LengthExceeded Int
62	\| TextException Exc.SomeException
63	deriving (Show, Typeable)
64	instance Exc.Exception TextException
65
66	toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
67	-> (ByteString -> Decoding)
68	toDecoding op = loop B.empty where
69	loop extra bs0 = case op (B.append extra bs0) of
70	(txt, Right bs) -> Some txt bs (loop bs)
71	(txt, Left (_,bs)) -> Other txt bs
72
73
74	splitSlowly :: (ByteString -> Text)
75	-> ByteString
76	-> (Text, Either (TextException, ByteString) ByteString)
77	splitSlowly dec bytes = valid where
78	valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
79	splits 0 = [(B.empty, bytes)]
80	splits n = B.splitAt n bytes : splits (n - 1)
81	decFirst (a, b) = case tryEvaluate (dec a) of
82	Left _ -> Nothing
83	Right text -> let trouble = case tryEvaluate (dec b) of
84	Left exc -> Left (TextException exc, b)
85	Right _ -> Right B.empty
86	in Just (text, trouble)
87	-- this case shouldn't occur,
88	-- since splitSlowly is only called
89	-- when parsing failed somewhere
90
91	utf8 :: Codec
92	utf8 = Codec name enc (toDecoding dec) where
93	name = T.pack "UTF-8"
94	enc text = (TE.encodeUtf8 text, Nothing)
95	dec bytes = case decodeSomeUtf8 bytes of
96	(t,b) -> (t, Right b)
97
98	-- -- Whether the given byte is a continuation byte.
99	-- isContinuation byte = byte .&. 0xC0 == 0x80
100	--
101	-- -- The number of continuation bytes needed by the given
102	-- -- non-continuation byte. Returns -1 for an illegal UTF-8
103	-- -- non-continuation byte and the whole split quickly must fail so
104	-- -- as the input is passed to TE.decodeUtf8, which will issue a
105	-- -- suitable error.
106	-- required x0
107	-- \| x0 .&. 0x80 == 0x00 = 0
108	-- \| x0 .&. 0xE0 == 0xC0 = 1
109	-- \| x0 .&. 0xF0 == 0xE0 = 2
110	-- \| x0 .&. 0xF8 == 0xF0 = 3
111	-- \| otherwise = -1
112	--
113	-- splitQuickly bytes
114	-- \| B.null l \|\| req == -1 = Nothing
115	-- \| req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
116	-- \| otherwise = Just (TE.decodeUtf8 l', r')
117	-- where
118	-- (l, r) = B.spanEnd isContinuation bytes
119	-- req = required (B.last l)
120	-- l' = B.init l
121	-- r' = B.cons (B.last l) r
122
123	-- \|
124	-- Since 0.3.0
125	utf16_le :: Codec
126	utf16_le = Codec name enc (toDecoding dec) where
127	name = T.pack "UTF-16-LE"
128	enc text = (TE.encodeUtf16LE text, Nothing)
129	dec bytes = case splitQuickly bytes of
130	Just (text, extra) -> (text, Right extra)
131	Nothing -> splitSlowly TE.decodeUtf16LE bytes
132
133	splitQuickly bytes = maybeDecode (loop 0) where
134	maxN = B.length bytes
135
136	loop n \| n == maxN = decodeAll
137	\| (n + 1) == maxN = decodeTo n
138	loop n = let
139	req = utf16Required
140	(B.index bytes n)
141	(B.index bytes (n + 1))
142	decodeMore = loop $! n + req
143	in if n + req > maxN
144	then decodeTo n
145	else decodeMore
146
147	decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
148	decodeAll = (TE.decodeUtf16LE bytes, B.empty)
149
150	-- \|
151	-- Since 0.3.0
152	utf16_be :: Codec
153	utf16_be = Codec name enc (toDecoding dec) where
154	name = T.pack "UTF-16-BE"
155	enc text = (TE.encodeUtf16BE text, Nothing)
156	dec bytes = case splitQuickly bytes of
157	Just (text, extra) -> (text, Right extra)
158	Nothing -> splitSlowly TE.decodeUtf16BE bytes
159
160	splitQuickly bytes = maybeDecode (loop 0) where
161	maxN = B.length bytes
162
163	loop n \| n == maxN = decodeAll
164	\| (n + 1) == maxN = decodeTo n
165	loop n = let
166	req = utf16Required
167	(B.index bytes (n + 1))
168	(B.index bytes n)
169	decodeMore = loop $! n + req
170	in if n + req > maxN
171	then decodeTo n
172	else decodeMore
173
174	decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
175	decodeAll = (TE.decodeUtf16BE bytes, B.empty)
176
177	utf16Required :: Word8 -> Word8 -> Int
178	utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
179	x :: Word16
180	x = (fromIntegral x1 `shiftL` 8) .\|. fromIntegral x0
181
182	-- \|
183	-- Since 0.3.0
184	utf32_le :: Codec
185	utf32_le = Codec name enc (toDecoding dec) where
186	name = T.pack "UTF-32-LE"
187	enc text = (TE.encodeUtf32LE text, Nothing)
188	dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
189	Just (text, extra) -> (text, Right extra)
190	Nothing -> splitSlowly TE.decodeUtf32LE bs
191
192	-- \|
193	-- Since 0.3.0
194	utf32_be :: Codec
195	utf32_be = Codec name enc (toDecoding dec) where
196	name = T.pack "UTF-32-BE"
197	enc text = (TE.encodeUtf32BE text, Nothing)
198	dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
199	Just (text, extra) -> (text, Right extra)
200	Nothing -> splitSlowly TE.decodeUtf32BE bs
201
202	utf32SplitBytes :: (ByteString -> Text)
203	-> ByteString
204	-> Maybe (Text, ByteString)
205	utf32SplitBytes dec bytes = split where
206	split = maybeDecode (dec toDecode, extra)
207	len = B.length bytes
208	lenExtra = mod len 4
209
210	lenToDecode = len - lenExtra
211	(toDecode, extra) = if lenExtra == 0
212	then (bytes, B.empty)
213	else B.splitAt lenToDecode bytes
214
215	-- \|
216	-- Since 0.3.0
217	ascii :: Codec
218	ascii = Codec name enc (toDecoding dec) where
219	name = T.pack "ASCII"
220	enc text = (bytes, extra) where
221	(safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
222	bytes = B8.pack (T.unpack safe)
223	extra = if T.null unsafe
224	then Nothing
225	else Just (EncodeException ascii (T.head unsafe), unsafe)
226
227	dec bytes = (text, extra) where
228	(safe, unsafe) = B.span (<= 0x7F) bytes
229	text = T.pack (B8.unpack safe)
230	extra = if B.null unsafe
231	then Right B.empty
232	else Left (DecodeException ascii (B.head unsafe), unsafe)
233
234	-- \|
235	-- Since 0.3.0
236	iso8859_1 :: Codec
237	iso8859_1 = Codec name enc (toDecoding dec) where
238	name = T.pack "ISO-8859-1"
239	enc text = (bytes, extra) where
240	(safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
241	bytes = B8.pack (T.unpack safe)
242	extra = if T.null unsafe
243	then Nothing
244	else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
245
246	dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
247
248	tryEvaluate :: a -> Either Exc.SomeException a
249	tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
250
251	maybeDecode :: (a, b) -> Maybe (a, b)
252	maybeDecode (a, b) = case tryEvaluate a of
253	Left _ -> Nothing
254	Right _ -> Just (a, b)
255		43
256	-- \| A stream oriented decoding result.	44	-- \| A stream oriented decoding result.
257	data Decoding = Some Text ByteString (ByteString -> Decoding)	45	data Decoding = Some Text ByteString (ByteString -> Decoding)


diff --git a/pipes-text.cabal b/pipes-text.cabal index 33cbab4..44bc551 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal
@@ -14,7 +14,7 @@ cabal-version: >=1.10
14	library	14	library
15	c-sources: cbits/cbits.c	15	c-sources: cbits/cbits.c
16	include-dirs: include	16	include-dirs: include
17	exposed-modules: Pipes.Text, Pipes.Text.Internal	17	exposed-modules: Pipes.Text, Pipes.Text.Internal, Pipes.Text.Codec
18	-- other-modules:	18	-- other-modules:
19	other-extensions: RankNTypes	19	other-extensions: RankNTypes
20	build-depends: base >= 4 && < 5 ,	20	build-depends: base >= 4 && < 5 ,