3 files changed, 365 insertions, 2 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs
index 18ec8ec..0957a7d 100644
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -167,6 +167,7 @@ module Pipes.Text  (
    , module Data.Word
    , module Pipes.Parse
    , module Pipes.Group
+    , module Pipes.Text.Internal.Codec
    ) where
 import Control.Exception (throwIO, try)
@@ -196,8 +197,8 @@ import Foreign.C.Error (Errno(Errno), ePIPE)
 import qualified GHC.IO.Exception as G
 import Pipes
 import qualified Pipes.ByteString as PB
-import qualified Pipes.Text.Internal as PE
+import qualified Pipes.Text.Internal.Decoding as PE
-import Pipes.Text.Codec 
+import Pipes.Text.Internal.Codec 
 import Pipes.Core (respond, Server')
 import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..))
 import qualified Pipes.Group as PG
diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs
new file mode 100644
index 0000000..4b9367f
--- /dev/null
+++ b/Pipes/Text/Internal/Codec.hs
@@ -0,0 +1,215 @@
+{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
+-- |
+-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
+-- License: MIT
+--
+-- Parts of this code were taken from enumerator and conduits, and adapted for pipes.
+module Pipes.Text.Internal.Codec
+    ( Decoding(..)
+    , streamDecodeUtf8
+    , decodeSomeUtf8
+    , Codec(..)
+    , TextException(..)
+    , utf8
+    , utf16_le
+    , utf16_be
+    , utf32_le
+    , utf32_be
+    ) where
+import Data.Bits ((.&.))
+import Data.Char (ord)
+import Data.ByteString as B 
+import Data.ByteString (ByteString)
+import Data.ByteString.Internal as B 
+import Data.ByteString.Char8 as B8
+import Data.Text (Text)
+import qualified Data.Text as T 
+import qualified Data.Text.Encoding as TE 
+import Data.Text.Encoding.Error ()
+import GHC.Word (Word8, Word32)
+import qualified Data.Text.Array as A
+import Data.Word (Word8, Word16)
+import System.IO.Unsafe (unsafePerformIO)
+import qualified Control.Exception as Exc
+import Data.Bits ((.&.), (.|.), shiftL)
+import Data.Typeable
+import Control.Arrow (first)
+import Data.Maybe (catMaybes)
+import Pipes.Text.Internal.Decoding
+import Pipes
+-- | A specific character encoding.
+--
+-- Since 0.3.0
+data Codec = Codec
+  { codecName :: Text
+  , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
+  , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
+  }
+instance Show Codec where
+    showsPrec d c = showParen (d > 10) $ 
+                    showString "Codec " . shows (codecName c)
+data TextException = DecodeException Codec Word8
+                   | EncodeException Codec Char
+                   | LengthExceeded Int
+                   | TextException Exc.SomeException
+    deriving (Show, Typeable)
+instance Exc.Exception TextException
+toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
+           -> (ByteString -> Decoding)
+toDecoding op = loop B.empty where
+  loop !extra bs0 = case op (B.append extra bs0) of
+                      (txt, Right bs) -> Some txt bs (loop bs)
+                      (txt, Left (_,bs)) -> Other txt bs
+-- To do: toDecoding should be inlined in each of the 'Codec' definitions
+-- or else Codec changed to the conduit/enumerator definition.  We have
+-- altered it to use 'streamDecodeUtf8'
+splitSlowly :: (ByteString -> Text)
+            -> ByteString 
+            -> (Text, Either (TextException, ByteString) ByteString)
+splitSlowly dec bytes = valid where
+    valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
+    splits 0 = [(B.empty, bytes)]
+    splits n = B.splitAt n bytes : splits (n - 1)
+    decFirst (a, b) = case tryEvaluate (dec a) of
+        Left _ -> Nothing
+        Right text -> let trouble = case tryEvaluate (dec b) of
+                            Left exc -> Left (TextException exc, b)
+                            Right _  -> Right B.empty 
+                      in Just (text, trouble) -- this case shouldn't occur, 
+                                      -- since splitSlowly is only called
+                                      -- when parsing failed somewhere
+utf8 :: Codec
+utf8 = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-8"
+    enc text = (TE.encodeUtf8 text, Nothing)
+    dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
+--     -- Whether the given byte is a continuation byte.
+--     isContinuation byte = byte .&. 0xC0 == 0x80
+-- 
+--     -- The number of continuation bytes needed by the given
+--     -- non-continuation byte. Returns -1 for an illegal UTF-8
+--     -- non-continuation byte and the whole split quickly must fail so
+--     -- as the input is passed to TE.decodeUtf8, which will issue a
+--     -- suitable error.
+--     required x0
+--         | x0 .&. 0x80 == 0x00 = 0
+--         | x0 .&. 0xE0 == 0xC0 = 1
+--         | x0 .&. 0xF0 == 0xE0 = 2
+--         | x0 .&. 0xF8 == 0xF0 = 3
+--         | otherwise           = -1
+-- 
+--     splitQuickly bytes
+--         | B.null l || req == -1 = Nothing
+--         | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
+--         | otherwise = Just (TE.decodeUtf8 l', r')
+--       where
+--         (l, r) = B.spanEnd isContinuation bytes
+--         req = required (B.last l)
+--         l' = B.init l
+--         r' = B.cons (B.last l) r
+utf16_le :: Codec
+utf16_le = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-16-LE"
+    enc text = (TE.encodeUtf16LE text, Nothing)
+    dec bytes = case splitQuickly bytes of
+        Just (text, extra) -> (text, Right extra)
+        Nothing -> splitSlowly TE.decodeUtf16LE bytes
+    splitQuickly bytes = maybeDecode (loop 0) where
+        maxN = B.length bytes
+        loop n |  n      == maxN = decodeAll
+               | (n + 1) == maxN = decodeTo n
+        loop n = let
+            req = utf16Required
+                (B.index bytes n)
+                (B.index bytes (n + 1))
+            decodeMore = loop $! n + req
+            in if n + req > maxN
+                then decodeTo n
+                else decodeMore
+        decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
+        decodeAll = (TE.decodeUtf16LE bytes, B.empty)
+utf16_be :: Codec
+utf16_be = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-16-BE"
+    enc text = (TE.encodeUtf16BE text, Nothing)
+    dec bytes = case splitQuickly bytes of
+        Just (text, extra) -> (text, Right extra)
+        Nothing -> splitSlowly TE.decodeUtf16BE bytes
+    splitQuickly bytes = maybeDecode (loop 0) where
+        maxN = B.length bytes
+        loop n |  n      == maxN = decodeAll
+               | (n + 1) == maxN = decodeTo n
+        loop n = let
+            req = utf16Required
+                (B.index bytes (n + 1))
+                (B.index bytes n)
+            decodeMore = loop $! n + req
+            in if n + req > maxN
+                then decodeTo n
+                else decodeMore
+        decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
+        decodeAll = (TE.decodeUtf16BE bytes, B.empty)
+utf16Required :: Word8 -> Word8 -> Int
+utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
+    x :: Word16
+    x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
+utf32_le :: Codec
+utf32_le = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-32-LE"
+    enc text = (TE.encodeUtf32LE text, Nothing)
+    dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
+        Just (text, extra) -> (text, Right extra)
+        Nothing -> splitSlowly TE.decodeUtf32LE bs
+utf32_be :: Codec
+utf32_be = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-32-BE"
+    enc text = (TE.encodeUtf32BE text, Nothing)
+    dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
+        Just (text, extra) -> (text, Right extra)
+        Nothing -> splitSlowly TE.decodeUtf32BE bs
+utf32SplitBytes :: (ByteString -> Text)
+                -> ByteString
+                -> Maybe (Text, ByteString)
+utf32SplitBytes dec bytes = split where
+    split = maybeDecode (dec toDecode, extra)
+    len = B.length bytes
+    lenExtra = mod len 4
+    lenToDecode = len - lenExtra
+    (toDecode, extra) = if lenExtra == 0
+        then (bytes, B.empty)
+        else B.splitAt lenToDecode bytes
+tryEvaluate :: a -> Either Exc.SomeException a
+tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
+maybeDecode :: (a, b) -> Maybe (a, b)
+maybeDecode (a, b) = case tryEvaluate a of
+    Left _ -> Nothing
+    Right _ -> Just (a, b)
diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs
new file mode 100644
index 0000000..531104a
--- /dev/null
+++ b/Pipes/Text/Internal/Decoding.hs
@@ -0,0 +1,147 @@
+{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
+{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
+-- This module lifts assorted materials from Brian O'Sullivan's text package 
+-- especially Data.Text.Encoding in order to define a pipes-appropriate
+-- streamDecodeUtf8
+module Pipes.Text.Internal.Decoding 
+    ( Decoding(..)
+    , streamDecodeUtf8
+    , decodeSomeUtf8
+    ) where
+import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
+import Control.Monad.ST (ST, runST)
+import Data.Bits ((.&.))
+import Data.ByteString as B 
+import Data.ByteString (ByteString)
+import Data.ByteString.Internal as B 
+import Data.ByteString.Char8 as B8
+import Data.Text (Text)
+import qualified Data.Text as T 
+import qualified Data.Text.Encoding as TE 
+import Data.Text.Encoding.Error ()
+import Data.Text.Internal (Text, textP)
+import Foreign.C.Types (CSize)
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Marshal.Utils (with)
+import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
+import Foreign.Storable (Storable, peek, poke)
+import GHC.Base  (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
+import GHC.Word (Word8, Word32)
+import qualified Data.Text.Array as A
+import Data.Word (Word8, Word16)
+import System.IO.Unsafe (unsafePerformIO)
+import qualified Control.Exception as Exc
+import Data.Bits ((.&.), (.|.), shiftL)
+import Data.Typeable
+import Control.Arrow (first)
+import Data.Maybe (catMaybes)
+#include "pipes_text_cbits.h"
+-- | A stream oriented decoding result.
+data Decoding = Some Text ByteString (ByteString -> Decoding)
+              | Other Text ByteString
+instance Show Decoding where
+    showsPrec d (Some t bs _) = showParen (d > prec) $
+                                showString "Some " . showsPrec prec' t .
+                                showChar ' ' . showsPrec prec' bs .
+                                showString " _"
+      where prec = 10; prec' = prec + 1
+    showsPrec d (Other t bs)  = showParen (d > prec) $
+                                showString "Other " . showsPrec prec' t .
+                                showChar ' ' . showsPrec prec' bs .
+                                showString " _"
+      where prec = 10; prec' = prec + 1
+newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
+newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
+streamDecodeUtf8 :: ByteString -> Decoding
+streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 
+  where
+  decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
+  decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = 
+                    runST $ do marray <- A.new (len+1) 
+                               unsafeIOToST (decodeChunkToBuffer marray)
+     where
+     decodeChunkToBuffer :: A.MArray s -> IO Decoding
+     decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+       with (0::CSize) $ \destOffPtr ->
+       with codepoint0 $ \codepointPtr ->
+       with state0     $ \statePtr ->
+       with nullPtr    $ \curPtrPtr ->
+         do let end = ptr `plusPtr` (off + len)
+                curPtr = ptr `plusPtr` off
+            poke curPtrPtr curPtr
+            c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+            state <- peek statePtr
+            lastPtr <- peek curPtrPtr
+            codepoint <- peek codepointPtr
+            n <- peek destOffPtr
+            chunkText <- mkText dest n
+            let left      = lastPtr `minusPtr` curPtr
+                remaining = B.drop left bs
+                accum = if T.null chunkText then B.append old remaining  else remaining 
+            return $! case state of 
+              UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
+              _ ->           Some  chunkText accum (decodeChunkUtf8 accum codepoint state)
+     {-# INLINE decodeChunkToBuffer #-}
+  {-# INLINE decodeChunkUtf8 #-}
+{-# INLINE streamDecodeUtf8 #-}
+decodeSomeUtf8 :: ByteString -> (Text, ByteString)
+decodeSomeUtf8 bs@(PS fp off len) = runST $ do 
+  dest <- A.new (len+1) 
+  unsafeIOToST $ 
+     withForeignPtr fp $ \ptr ->
+     with (0::CSize)        $ \destOffPtr ->
+     with (0::CodePoint)    $ \codepointPtr ->
+     with (0::DecoderState) $ \statePtr ->
+     with nullPtr           $ \curPtrPtr ->
+       do let end = ptr `plusPtr` (off + len)
+              curPtr = ptr `plusPtr` off
+          poke curPtrPtr curPtr
+          c_decode_utf8_with_state (A.maBA dest) destOffPtr 
+                                   curPtrPtr end codepointPtr statePtr
+          state <- peek statePtr
+          lastPtr <- peek curPtrPtr
+          codepoint <- peek codepointPtr
+          n <- peek destOffPtr
+          chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+                                         return $! textP arr 0 (fromIntegral n)
+          let left      = lastPtr `minusPtr` curPtr
+              remaining = B.drop left bs
+          return $! (chunkText, remaining)
+{-# INLINE decodeSomeUtf8 #-}
+mkText :: A.MArray s -> CSize -> IO Text
+mkText dest n =  unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+                                   return $! textP arr 0 (fromIntegral n)
+{-# INLINE mkText #-}
+ord :: Char -> Int
+ord (C# c#) = I# (ord# c#)
+{-# INLINE ord #-}
+unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
+unsafeWrite marr i c
+    | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
+                       return 1
+    | otherwise   = do A.unsafeWrite marr i lo
+                       A.unsafeWrite marr (i+1) hi
+                       return 2
+    where n = ord c
+          m = n - 0x10000
+          lo = fromIntegral $ (m `shiftR` 10) + 0xD800
+          hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
+          shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
+          {-# INLINE shiftR #-}
+{-# INLINE unsafeWrite #-}
+foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
+    :: MutableByteArray# s -> Ptr CSize
+    -> Ptr (Ptr Word8) -> Ptr Word8
+    -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
+\ No newline at end of file

diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 18ec8ec..0957a7d 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs
@@ -167,6 +167,7 @@ module Pipes.Text (
167	, module Data.Word	167	, module Data.Word
168	, module Pipes.Parse	168	, module Pipes.Parse
169	, module Pipes.Group	169	, module Pipes.Group
		170	, module Pipes.Text.Internal.Codec
170	) where	171	) where
171		172
172	import Control.Exception (throwIO, try)	173	import Control.Exception (throwIO, try)
@@ -196,8 +197,8 @@ import Foreign.C.Error (Errno(Errno), ePIPE)
196	import qualified GHC.IO.Exception as G	197	import qualified GHC.IO.Exception as G
197	import Pipes	198	import Pipes
198	import qualified Pipes.ByteString as PB	199	import qualified Pipes.ByteString as PB
199	import qualified Pipes.Text.Internal as PE	200	import qualified Pipes.Text.Internal.Decoding as PE
200	import Pipes.Text.Codec	201	import Pipes.Text.Internal.Codec
201	import Pipes.Core (respond, Server')	202	import Pipes.Core (respond, Server')
202	import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..))	203	import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..))
203	import qualified Pipes.Group as PG	204	import qualified Pipes.Group as PG


diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs new file mode 100644 index 0000000..4b9367f --- /dev/null +++ b/Pipes/Text/Internal/Codec.hs
@@ -0,0 +1,215 @@
		1
		2	{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
		3	-- \|
		4	-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
		5	-- License: MIT
		6	--
		7	-- Parts of this code were taken from enumerator and conduits, and adapted for pipes.
		8
		9	module Pipes.Text.Internal.Codec
		10	( Decoding(..)
		11	, streamDecodeUtf8
		12	, decodeSomeUtf8
		13	, Codec(..)
		14	, TextException(..)
		15	, utf8
		16	, utf16_le
		17	, utf16_be
		18	, utf32_le
		19	, utf32_be
		20	) where
		21
		22	import Data.Bits ((.&.))
		23	import Data.Char (ord)
		24	import Data.ByteString as B
		25	import Data.ByteString (ByteString)
		26	import Data.ByteString.Internal as B
		27	import Data.ByteString.Char8 as B8
		28	import Data.Text (Text)
		29	import qualified Data.Text as T
		30	import qualified Data.Text.Encoding as TE
		31	import Data.Text.Encoding.Error ()
		32	import GHC.Word (Word8, Word32)
		33	import qualified Data.Text.Array as A
		34	import Data.Word (Word8, Word16)
		35	import System.IO.Unsafe (unsafePerformIO)
		36	import qualified Control.Exception as Exc
		37	import Data.Bits ((.&.), (.\|.), shiftL)
		38	import Data.Typeable
		39	import Control.Arrow (first)
		40	import Data.Maybe (catMaybes)
		41	import Pipes.Text.Internal.Decoding
		42	import Pipes
		43	-- \| A specific character encoding.
		44	--
		45	-- Since 0.3.0
		46	data Codec = Codec
		47	{ codecName :: Text
		48	, codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
		49	, codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
		50	}
		51
		52	instance Show Codec where
		53	showsPrec d c = showParen (d > 10) $
		54	showString "Codec " . shows (codecName c)
		55
		56	data TextException = DecodeException Codec Word8
		57	\| EncodeException Codec Char
		58	\| LengthExceeded Int
		59	\| TextException Exc.SomeException
		60	deriving (Show, Typeable)
		61	instance Exc.Exception TextException
		62
		63
		64	toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
		65	-> (ByteString -> Decoding)
		66	toDecoding op = loop B.empty where
		67	loop !extra bs0 = case op (B.append extra bs0) of
		68	(txt, Right bs) -> Some txt bs (loop bs)
		69	(txt, Left (_,bs)) -> Other txt bs
		70	-- To do: toDecoding should be inlined in each of the 'Codec' definitions
		71	-- or else Codec changed to the conduit/enumerator definition. We have
		72	-- altered it to use 'streamDecodeUtf8'
		73
		74	splitSlowly :: (ByteString -> Text)
		75	-> ByteString
		76	-> (Text, Either (TextException, ByteString) ByteString)
		77	splitSlowly dec bytes = valid where
		78	valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
		79	splits 0 = [(B.empty, bytes)]
		80	splits n = B.splitAt n bytes : splits (n - 1)
		81	decFirst (a, b) = case tryEvaluate (dec a) of
		82	Left _ -> Nothing
		83	Right text -> let trouble = case tryEvaluate (dec b) of
		84	Left exc -> Left (TextException exc, b)
		85	Right _ -> Right B.empty
		86	in Just (text, trouble) -- this case shouldn't occur,
		87	-- since splitSlowly is only called
		88	-- when parsing failed somewhere
		89
		90	utf8 :: Codec
		91	utf8 = Codec name enc (toDecoding dec) where
		92	name = T.pack "UTF-8"
		93	enc text = (TE.encodeUtf8 text, Nothing)
		94	dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
		95
		96	-- -- Whether the given byte is a continuation byte.
		97	-- isContinuation byte = byte .&. 0xC0 == 0x80
		98	--
		99	-- -- The number of continuation bytes needed by the given
		100	-- -- non-continuation byte. Returns -1 for an illegal UTF-8
		101	-- -- non-continuation byte and the whole split quickly must fail so
		102	-- -- as the input is passed to TE.decodeUtf8, which will issue a
		103	-- -- suitable error.
		104	-- required x0
		105	-- \| x0 .&. 0x80 == 0x00 = 0
		106	-- \| x0 .&. 0xE0 == 0xC0 = 1
		107	-- \| x0 .&. 0xF0 == 0xE0 = 2
		108	-- \| x0 .&. 0xF8 == 0xF0 = 3
		109	-- \| otherwise = -1
		110	--
		111	-- splitQuickly bytes
		112	-- \| B.null l \|\| req == -1 = Nothing
		113	-- \| req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
		114	-- \| otherwise = Just (TE.decodeUtf8 l', r')
		115	-- where
		116	-- (l, r) = B.spanEnd isContinuation bytes
		117	-- req = required (B.last l)
		118	-- l' = B.init l
		119	-- r' = B.cons (B.last l) r
		120
		121
		122	utf16_le :: Codec
		123	utf16_le = Codec name enc (toDecoding dec) where
		124	name = T.pack "UTF-16-LE"
		125	enc text = (TE.encodeUtf16LE text, Nothing)
		126	dec bytes = case splitQuickly bytes of
		127	Just (text, extra) -> (text, Right extra)
		128	Nothing -> splitSlowly TE.decodeUtf16LE bytes
		129
		130	splitQuickly bytes = maybeDecode (loop 0) where
		131	maxN = B.length bytes
		132
		133	loop n \| n == maxN = decodeAll
		134	\| (n + 1) == maxN = decodeTo n
		135	loop n = let
		136	req = utf16Required
		137	(B.index bytes n)
		138	(B.index bytes (n + 1))
		139	decodeMore = loop $! n + req
		140	in if n + req > maxN
		141	then decodeTo n
		142	else decodeMore
		143
		144	decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
		145	decodeAll = (TE.decodeUtf16LE bytes, B.empty)
		146
		147	utf16_be :: Codec
		148	utf16_be = Codec name enc (toDecoding dec) where
		149	name = T.pack "UTF-16-BE"
		150	enc text = (TE.encodeUtf16BE text, Nothing)
		151	dec bytes = case splitQuickly bytes of
		152	Just (text, extra) -> (text, Right extra)
		153	Nothing -> splitSlowly TE.decodeUtf16BE bytes
		154
		155	splitQuickly bytes = maybeDecode (loop 0) where
		156	maxN = B.length bytes
		157
		158	loop n \| n == maxN = decodeAll
		159	\| (n + 1) == maxN = decodeTo n
		160	loop n = let
		161	req = utf16Required
		162	(B.index bytes (n + 1))
		163	(B.index bytes n)
		164	decodeMore = loop $! n + req
		165	in if n + req > maxN
		166	then decodeTo n
		167	else decodeMore
		168
		169	decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
		170	decodeAll = (TE.decodeUtf16BE bytes, B.empty)
		171
		172	utf16Required :: Word8 -> Word8 -> Int
		173	utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
		174	x :: Word16
		175	x = (fromIntegral x1 `shiftL` 8) .\|. fromIntegral x0
		176
		177
		178	utf32_le :: Codec
		179	utf32_le = Codec name enc (toDecoding dec) where
		180	name = T.pack "UTF-32-LE"
		181	enc text = (TE.encodeUtf32LE text, Nothing)
		182	dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
		183	Just (text, extra) -> (text, Right extra)
		184	Nothing -> splitSlowly TE.decodeUtf32LE bs
		185
		186
		187	utf32_be :: Codec
		188	utf32_be = Codec name enc (toDecoding dec) where
		189	name = T.pack "UTF-32-BE"
		190	enc text = (TE.encodeUtf32BE text, Nothing)
		191	dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
		192	Just (text, extra) -> (text, Right extra)
		193	Nothing -> splitSlowly TE.decodeUtf32BE bs
		194
		195	utf32SplitBytes :: (ByteString -> Text)
		196	-> ByteString
		197	-> Maybe (Text, ByteString)
		198	utf32SplitBytes dec bytes = split where
		199	split = maybeDecode (dec toDecode, extra)
		200	len = B.length bytes
		201	lenExtra = mod len 4
		202
		203	lenToDecode = len - lenExtra
		204	(toDecode, extra) = if lenExtra == 0
		205	then (bytes, B.empty)
		206	else B.splitAt lenToDecode bytes
		207
		208
		209	tryEvaluate :: a -> Either Exc.SomeException a
		210	tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
		211
		212	maybeDecode :: (a, b) -> Maybe (a, b)
		213	maybeDecode (a, b) = case tryEvaluate a of
		214	Left _ -> Nothing
		215	Right _ -> Just (a, b)


diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs new file mode 100644 index 0000000..531104a --- /dev/null +++ b/Pipes/Text/Internal/Decoding.hs
@@ -0,0 +1,147 @@
		1	{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
		2	{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
		3	{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
		4
		5	-- This module lifts assorted materials from Brian O'Sullivan's text package
		6	-- especially Data.Text.Encoding in order to define a pipes-appropriate
		7	-- streamDecodeUtf8
		8	module Pipes.Text.Internal.Decoding
		9	( Decoding(..)
		10	, streamDecodeUtf8
		11	, decodeSomeUtf8
		12	) where
		13	import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
		14	import Control.Monad.ST (ST, runST)
		15	import Data.Bits ((.&.))
		16	import Data.ByteString as B
		17	import Data.ByteString (ByteString)
		18	import Data.ByteString.Internal as B
		19	import Data.ByteString.Char8 as B8
		20	import Data.Text (Text)
		21	import qualified Data.Text as T
		22	import qualified Data.Text.Encoding as TE
		23	import Data.Text.Encoding.Error ()
		24	import Data.Text.Internal (Text, textP)
		25	import Foreign.C.Types (CSize)
		26	import Foreign.ForeignPtr (withForeignPtr)
		27	import Foreign.Marshal.Utils (with)
		28	import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
		29	import Foreign.Storable (Storable, peek, poke)
		30	import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
		31	import GHC.Word (Word8, Word32)
		32	import qualified Data.Text.Array as A
		33	import Data.Word (Word8, Word16)
		34	import System.IO.Unsafe (unsafePerformIO)
		35	import qualified Control.Exception as Exc
		36	import Data.Bits ((.&.), (.\|.), shiftL)
		37	import Data.Typeable
		38	import Control.Arrow (first)
		39	import Data.Maybe (catMaybes)
		40	#include "pipes_text_cbits.h"
		41
		42
		43
		44	-- \| A stream oriented decoding result.
		45	data Decoding = Some Text ByteString (ByteString -> Decoding)
		46	\| Other Text ByteString
		47	instance Show Decoding where
		48	showsPrec d (Some t bs _) = showParen (d > prec) $
		49	showString "Some " . showsPrec prec' t .
		50	showChar ' ' . showsPrec prec' bs .
		51	showString " _"
		52	where prec = 10; prec' = prec + 1
		53	showsPrec d (Other t bs) = showParen (d > prec) $
		54	showString "Other " . showsPrec prec' t .
		55	showChar ' ' . showsPrec prec' bs .
		56	showString " _"
		57	where prec = 10; prec' = prec + 1
		58
		59	newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
		60	newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
		61
		62	streamDecodeUtf8 :: ByteString -> Decoding
		63	streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
		64	where
		65	decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
		66	decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
		67	runST $ do marray <- A.new (len+1)
		68	unsafeIOToST (decodeChunkToBuffer marray)
		69	where
		70	decodeChunkToBuffer :: A.MArray s -> IO Decoding
		71	decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
		72	with (0::CSize) $ \destOffPtr ->
		73	with codepoint0 $ \codepointPtr ->
		74	with state0 $ \statePtr ->
		75	with nullPtr $ \curPtrPtr ->
		76	do let end = ptr `plusPtr` (off + len)
		77	curPtr = ptr `plusPtr` off
		78	poke curPtrPtr curPtr
		79	c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
		80	state <- peek statePtr
		81	lastPtr <- peek curPtrPtr
		82	codepoint <- peek codepointPtr
		83	n <- peek destOffPtr
		84	chunkText <- mkText dest n
		85	let left = lastPtr `minusPtr` curPtr
		86	remaining = B.drop left bs
		87	accum = if T.null chunkText then B.append old remaining else remaining
		88	return $! case state of
		89	UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
		90	_ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
		91	{-# INLINE decodeChunkToBuffer #-}
		92	{-# INLINE decodeChunkUtf8 #-}
		93	{-# INLINE streamDecodeUtf8 #-}
		94
		95	decodeSomeUtf8 :: ByteString -> (Text, ByteString)
		96	decodeSomeUtf8 bs@(PS fp off len) = runST $ do
		97	dest <- A.new (len+1)
		98	unsafeIOToST $
		99	withForeignPtr fp $ \ptr ->
		100	with (0::CSize) $ \destOffPtr ->
		101	with (0::CodePoint) $ \codepointPtr ->
		102	with (0::DecoderState) $ \statePtr ->
		103	with nullPtr $ \curPtrPtr ->
		104	do let end = ptr `plusPtr` (off + len)
		105	curPtr = ptr `plusPtr` off
		106	poke curPtrPtr curPtr
		107	c_decode_utf8_with_state (A.maBA dest) destOffPtr
		108	curPtrPtr end codepointPtr statePtr
		109	state <- peek statePtr
		110	lastPtr <- peek curPtrPtr
		111	codepoint <- peek codepointPtr
		112	n <- peek destOffPtr
		113	chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
		114	return $! textP arr 0 (fromIntegral n)
		115	let left = lastPtr `minusPtr` curPtr
		116	remaining = B.drop left bs
		117	return $! (chunkText, remaining)
		118	{-# INLINE decodeSomeUtf8 #-}
		119
		120	mkText :: A.MArray s -> CSize -> IO Text
		121	mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
		122	return $! textP arr 0 (fromIntegral n)
		123	{-# INLINE mkText #-}
		124
		125	ord :: Char -> Int
		126	ord (C# c#) = I# (ord# c#)
		127	{-# INLINE ord #-}
		128
		129	unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
		130	unsafeWrite marr i c
		131	\| n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
		132	return 1
		133	\| otherwise = do A.unsafeWrite marr i lo
		134	A.unsafeWrite marr (i+1) hi
		135	return 2
		136	where n = ord c
		137	m = n - 0x10000
		138	lo = fromIntegral $ (m `shiftR` 10) + 0xD800
		139	hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
		140	shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
		141	{-# INLINE shiftR #-}
		142	{-# INLINE unsafeWrite #-}
		143
		144	foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
		145	:: MutableByteArray# s -> Ptr CSize
		146	-> Ptr (Ptr Word8) -> Ptr Word8
		147	-> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file