variant using text internals in place of text streamDecodeUtf8

author: michaelt <what_is_it_to_do_anything@yahoo.com> 2013-12-23 13:02:49 -0500
committer: michaelt <what_is_it_to_do_anything@yahoo.com> 2013-12-23 13:02:49 -0500
commit: 8c48280926efffc0ca52a5d9ca796d639d053379 (patch)
tree: 972ca8955b5581d634663424e973e56fa4487fe5 /Pipes
parent: 8853a440e37523bae8cb46827d0d2d356bad5c46 (diff)
download: text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.gz
text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.zst
text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.zip
2 files changed, 216 insertions, 77 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs
index a5859a3..6845dd3 100644
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -81,10 +81,6 @@ module Pipes.Text  (
    filter,
    scan,
    encodeUtf8,
-#if MIN_VERSION_text(0,11,4)
-    pipeDecodeUtf8,
-    pipeDecodeUtf8With,
-#endif
    pack,
    unpack,
    toCaseFold,
@@ -119,10 +115,8 @@ module Pipes.Text  (
    group,
    lines,
    words,
-#if MIN_VERSION_text(0,11,4)
    decodeUtf8,
    decodeUtf8With,
-#endif
    -- * Transformations
    intersperse,
    
@@ -167,6 +161,7 @@ import qualified GHC.IO.Exception as G
 import Pipes
 import qualified Pipes.ByteString as PB
 import qualified Pipes.ByteString.Parse as PBP
+import qualified Pipes.Text.Internal as PE
 import Pipes.Text.Parse (
    nextChar, drawChar, unDrawChar, peekChar, isEndOfChars )
 import Pipes.Core (respond, Server')
@@ -214,43 +209,60 @@ fromLazy  = foldrChunks (\e a -> yield e >> a) (return ())
 {-# INLINABLE fromLazy #-}
 -- | Stream text from 'stdin'
-stdin :: MonadIO m => Producer' Text m ()
+stdin :: MonadIO m => Producer' Text m (Producer ByteString m ())
 stdin = fromHandle IO.stdin
 {-# INLINABLE stdin #-}
 {-| Convert a 'IO.Handle' into a text stream using a text size 
    determined by the good sense of the text library. 
 -}
-fromHandle :: MonadIO m => IO.Handle -> Producer' Text m ()
+fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ())
-#if MIN_VERSION_text(0,11,4)
+-- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)`
-fromHandle h = go TE.streamDecodeUtf8 where
+-- if only so that mistakes can be concentrated in one place.
+-- This modifies something that was faster on an earlier iteration.
+-- Note also that the `text` replacement system is being ignored;
+-- with a replacement scheme one could have `Producer Text m ()`
+-- the relation to the replacement business needs to be thought out.
+-- The complicated type seems overmuch for the toy stdin above
+fromHandle h = go (PE.streamDecodeUtf8With Nothing) B.empty where  -- 'Nothing': stop at invalid bytes (yielding 'PE.Other') instead of throwing
  act = B.hGetSome h defaultChunkSize
-  go dec = do chunk <- liftIO act
+  go dec old = do chunk <- liftIO act
-              case dec chunk of 
+                  if B.null chunk 
-                TE.Some text _ dec' -> do yield text
+                    then if B.null old then return (return ())
-                                          unless (B.null chunk) (go dec')
+                                       else return (yield old >> return ())
+                    else case dec chunk of 
+                           PE.Some text bs dec' -> 
+                              if T.null text then go dec' (B.append old bs) 
+                                             else do yield text
+                                                     go dec' bs  -- keep the undecoded tail so it is returned at EOF or on failure
+                           PE.Other text bs ->
+                              if T.null text then return (do yield old
+                                                             yield bs
+                                                             PB.fromHandle h)
+                                             else do yield text
+                                                     return (do yield bs
+                                                                PB.fromHandle h)
 {-# INLINE fromHandle#-}
 -- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as
 -- the dedicated Text IO function 'hGetChunk' ;
 -- this way "runEffect $ PT.fromHandle hIn  >->  PT.toHandle hOut"
 -- runs the same as the conduit equivalent, only slightly slower 
 -- than "runEffect $ PB.fromHandle hIn  >->  PB.toHandle hOut"
-#else
+-- #else
-fromHandle h = go where
+-- fromHandle h = go where
-    go = do txt <- liftIO (T.hGetChunk h)
+--     go = do txt <- liftIO (T.hGetChunk h)
-            unless (T.null txt) $ do yield txt
+--             unless (T.null txt) $ do yield txt
-                                     go
+--                                      go
-{-# INLINABLE fromHandle#-}
+-- {-# INLINABLE fromHandle#-}
-#endif
+-- #endif
 {-| Stream text from a file using Pipes.Safe
 >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
 MAIN = PUTSTRLN "HELLO WORLD"
 -}
-readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m ()
+readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ())
 readFile file = Safe.withFile file IO.ReadMode fromHandle
 {-# INLINABLE readFile #-}
@@ -610,74 +622,44 @@ count :: (Monad m, Num n) => Text -> Producer Text m () -> m n
 count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
 {-# INLINABLE count #-}
-#if MIN_VERSION_text(0,11,4)
 -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
 -- into a Pipe of Text
 decodeUtf8
  :: Monad m
  => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
-decodeUtf8 = go TE.streamDecodeUtf8
+decodeUtf8 = decodeUtf8With Nothing
-  where go dec p = do
-            x <- lift (next p)
-            case x of
-                Left r -> return (return r)
-                Right (chunk, p') -> do
-                    let TE.Some text l dec' = dec chunk
-                    if B.null l
-                      then do
-                          yield text
-                          go dec' p'
-                      else return $ do
-                          yield l
-                          p'
 {-# INLINEABLE decodeUtf8 #-}
 -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
 -- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@
 -- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\"
-decodeUtf8With 
+decodeUtf8With
  :: Monad m  
-  => TE.OnDecodeError 
+  => Maybe TE.OnDecodeError 
  -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
-decodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr)
+decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where 
-  where go dec p = do
+  go dec old p = do
-            x <- lift (next p)
+    x <- lift (next p)
-            case x of
+    case x of
-                Left r -> return (return r)
+      Left r -> if B.null old then return (return r)
-                Right (chunk, p') -> do
+                              else return (do yield old 
-                    let TE.Some text l dec' = dec chunk
+                                              return r)
-                    if B.null l
+      Right (chunk, p') -> 
-                      then do
+        case dec chunk of 
-                          yield text
+          PE.Some text l dec' -> 
-                          go dec' p'
+            if T.null text then go dec' (B.append old l) p'
-                      else return $ do
+                           else do yield text
-                          yield l
+                                   go dec' l p'  -- keep the undecoded tail so it is returned at EOF or on failure
-                          p'
+          PE.Other text bs ->
+            if T.null text then return (do yield old 
+                                           yield bs
+                                           p')
+                           else do yield text
+                                   return (do yield bs
+                                              p')
 {-# INLINEABLE decodeUtf8With #-}
-- | A simple pipe from 'ByteString' to 'Text'; a decoding error will arise
-- with any chunk that contains a sequence of bytes that is unreadable. Otherwise
-- only few bytes will only be moved from one chunk to the next before decoding.
-pipeDecodeUtf8 :: Monad m => Pipe ByteString Text m r
-pipeDecodeUtf8 = go TE.streamDecodeUtf8
-  where go dec = do chunk <- await
-                    case dec chunk of 
-                      TE.Some text l dec' -> do yield text
-                                                go dec'
-{-# INLINEABLE pipeDecodeUtf8 #-}
-- | A simple pipe from 'ByteString' to 'Text' using a replacement function.
-pipeDecodeUtf8With 
-  :: Monad m  
-  => TE.OnDecodeError 
-  -> Pipe ByteString Text m r 
-pipeDecodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr)
-  where go dec = do chunk <- await
-                    case dec chunk of 
-                      TE.Some text l dec' -> do yield text
-                                                go dec'
-{-# INLINEABLE pipeDecodeUtf8With #-}
-#endif
 -- | Splits a 'Producer' after the given number of characters
 splitAt
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
new file mode 100644
index 0000000..05d9887
--- /dev/null
+++ b/Pipes/Text/Internal.hs
@@ -0,0 +1,157 @@
+{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
+    UnliftedFFITypes #-}
+-- This module lifts material from Bryan O'Sullivan's text package 
+-- especially Data.Text.Encoding in order to define a pipes-appropriate
+-- streamDecodeUtf8
+module Pipes.Text.Internal 
+    ( Decoding(..)
+    , streamDecodeUtf8With
+    , streamDecodeUtf8
+    ) where
+import Control.Exception (evaluate, try)
+#if __GLASGOW_HASKELL__ >= 702
+import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
+import Control.Monad.ST (ST, runST)
+#else
+import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
+#endif
+import Data.Bits ((.&.))
+import Data.ByteString as B
+import Data.ByteString.Internal as B
+import Data.Text ()
+import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
+import Data.Text.Internal (Text(..), safe, textP)
+import Data.Word (Word8, Word32)
+import Foreign.C.Types (CSize)
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Marshal.Utils (with)
+import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
+import Foreign.Storable (Storable, peek, poke)
+import GHC.Base hiding (ord)
+import GHC.Word
+import qualified Data.Text.Array as A
+import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
+import GHC.Word (Word8(..), Word16(..), Word32(..))
+import Data.Text.Unsafe (unsafeDupablePerformIO)
+#include "pipes_text_cbits.h"
+-- | A stream oriented decoding result.
+data Decoding = Some Text ByteString (ByteString -> Decoding)
+              | Other Text ByteString
+instance Show Decoding where
+    -- The continuation of 'Some' cannot be shown, so it is rendered as @_@.
+    showsPrec d (Some t bs _) = showParen (d > prec) $
+                                showString "Some " . showsPrec prec' t .
+                                showChar ' ' . showsPrec prec' bs .
+                                showString " _"
+      where prec = 10; prec' = prec + 1
+    showsPrec d (Other t bs)  = showParen (d > prec) $
+                                showString "Other " . showsPrec prec' t .
+                                showChar ' ' . showsPrec prec' bs  -- two fields only: no " _"
+      where prec = 10; prec' = prec + 1
+      
+newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
+newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
+-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
+-- encoded text that is known to be valid.
+--
+-- If the input contains any invalid UTF-8 data, an exception will be
+-- thrown (either by this function or a continuation) that cannot be
+-- caught in pure code.  For more control over the handling of invalid
+-- data, use 'streamDecodeUtf8With'.
+streamDecodeUtf8 :: ByteString -> Decoding
+streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode)
+-- | Decode a UTF-8 'ByteString' in a stream oriented way. 'Nothing' stops at the
+-- first invalid byte with 'Other'; @'Just' f@ calls @f@ on each invalid byte.
+streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding
+streamDecodeUtf8With mErr = case mErr of 
+    Nothing    -> decodeWith False strictDecode 
+    Just onErr -> decodeWith True onErr 
+ where
+  -- We create a slightly larger than necessary buffer to accommodate a
+  -- potential surrogate pair started in the last buffer
+ decodeWith replace onErr = decodeChunk 0 0
+  where
+  decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
+  decodeChunk codepoint0 state0 bs@(PS fp off len) =
+    runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
+   where
+    decodeChunkToBuffer :: A.MArray s -> IO Decoding
+    decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+      with (0::CSize) $ \destOffPtr ->
+      with codepoint0 $ \codepointPtr ->
+      with state0 $ \statePtr ->
+      with nullPtr $ \curPtrPtr ->
+        let end = ptr `plusPtr` (off + len)
+            loop curPtr = do
+              poke curPtrPtr curPtr
+              curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
+                         curPtrPtr end codepointPtr statePtr
+              state <- peek statePtr
+              case state of
+                UTF8_REJECT ->  
+                  -- We encountered an encoding error
+                 if replace 
+                 then do 
+                  x <- peek curPtr'
+                  case onErr desc (Just x) of
+                    Nothing -> loop $ curPtr' `plusPtr` 1
+                    Just c -> do
+                      destOff <- peek destOffPtr
+                      w <- unsafeSTToIO $
+                           unsafeWrite dest (fromIntegral destOff) (safe c)
+                      poke destOffPtr (destOff + fromIntegral w)
+                      poke statePtr 0
+                      loop $ curPtr' `plusPtr` 1
+                 else do 
+                  n <- peek destOffPtr 
+                  chunkText <- unsafeSTToIO $ do
+                      arr <- A.unsafeFreeze dest
+                      return $! textP arr 0 (fromIntegral n)
+                  lastPtr <- peek curPtrPtr
+                  let left = lastPtr `minusPtr` curPtr
+                  return $ Other chunkText (B.drop left bs)
+                _ -> do
+                  -- We encountered the end of the buffer while decoding
+                  n <- peek destOffPtr
+                  codepoint <- peek codepointPtr
+                  chunkText <- unsafeSTToIO $ do
+                      arr <- A.unsafeFreeze dest
+                      return $! textP arr 0 (fromIntegral n)
+                  lastPtr <- peek curPtrPtr
+                  let left = lastPtr `minusPtr` curPtr
+                  return $ Some chunkText (B.drop left bs)
+                           (decodeChunk codepoint state)
+        in loop (ptr `plusPtr` off)
+  desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
+ord :: Char -> Int
+ord (C# c#) = I# (ord# c#)
+{-# INLINE ord #-}
+unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
+unsafeWrite marr i c
+    | n < 0x10000 = do 
+        A.unsafeWrite marr i (fromIntegral n)
+        return 1
+    | otherwise = do
+        A.unsafeWrite marr i lo
+        A.unsafeWrite marr (i+1) hi
+        return 2
+    where n = ord c
+          m = n - 0x10000
+          lo = fromIntegral $ (m `shiftR` 10) + 0xD800
+          hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
+          shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
+{-# INLINE unsafeWrite #-}
+foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
+    :: MutableByteArray# s -> Ptr CSize
+    -> Ptr (Ptr Word8) -> Ptr Word8
+    -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
+\ No newline at end of file
author	michaelt <what_is_it_to_do_anything@yahoo.com>	2013-12-23 13:02:49 -0500
committer	michaelt <what_is_it_to_do_anything@yahoo.com>	2013-12-23 13:02:49 -0500
commit	8c48280926efffc0ca52a5d9ca796d639d053379 (patch)
tree	972ca8955b5581d634663424e973e56fa4487fe5 /Pipes
parent	8853a440e37523bae8cb46827d0d2d356bad5c46 (diff)
download	text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.gz text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.zst text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.zip

diff --git a/Pipes/Text.hs b/Pipes/Text.hs index a5859a3..6845dd3 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs
@@ -81,10 +81,6 @@ module Pipes.Text (
81	filter,	81	filter,
82	scan,	82	scan,
83	encodeUtf8,	83	encodeUtf8,
84	#if MIN_VERSION_text(0,11,4)
85	pipeDecodeUtf8,
86	pipeDecodeUtf8With,
87	#endif
88	pack,	84	pack,
89	unpack,	85	unpack,
90	toCaseFold,	86	toCaseFold,
@@ -119,10 +115,8 @@ module Pipes.Text (
119	group,	115	group,
120	lines,	116	lines,
121	words,	117	words,
122	#if MIN_VERSION_text(0,11,4)
123	decodeUtf8,	118	decodeUtf8,
124	decodeUtf8With,	119	decodeUtf8With,
125	#endif
126	-- * Transformations	120	-- * Transformations
127	intersperse,	121	intersperse,
128		122
@@ -167,6 +161,7 @@ import qualified GHC.IO.Exception as G
167	import Pipes	161	import Pipes
168	import qualified Pipes.ByteString as PB	162	import qualified Pipes.ByteString as PB
169	import qualified Pipes.ByteString.Parse as PBP	163	import qualified Pipes.ByteString.Parse as PBP
		164	import qualified Pipes.Text.Internal as PE
170	import Pipes.Text.Parse (	165	import Pipes.Text.Parse (
171	nextChar, drawChar, unDrawChar, peekChar, isEndOfChars )	166	nextChar, drawChar, unDrawChar, peekChar, isEndOfChars )
172	import Pipes.Core (respond, Server')	167	import Pipes.Core (respond, Server')
@@ -214,43 +209,60 @@ fromLazy = foldrChunks (\e a -> yield e >> a) (return ())
214	{-# INLINABLE fromLazy #-}	209	{-# INLINABLE fromLazy #-}
215		210
216	-- \| Stream text from 'stdin'	211	-- \| Stream text from 'stdin'
217	stdin :: MonadIO m => Producer' Text m ()	212	stdin :: MonadIO m => Producer' Text m (Producer ByteString m ())
218	stdin = fromHandle IO.stdin	213	stdin = fromHandle IO.stdin
219	{-# INLINABLE stdin #-}	214	{-# INLINABLE stdin #-}
220		215
221	{-\| Convert a 'IO.Handle' into a text stream using a text size	216	{-\| Convert a 'IO.Handle' into a text stream using a text size
222	determined by the good sense of the text library.	217	determined by the good sense of the text library.
223
224	-}	218	-}
225		219
226	fromHandle :: MonadIO m => IO.Handle -> Producer' Text m ()	220	fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ())
227	#if MIN_VERSION_text(0,11,4)	221	-- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)`
228	fromHandle h = go TE.streamDecodeUtf8 where	222	-- if only so that mistakes can be concentrated in one place.
		223	-- This modifies something that was faster on an earlier iteration.
		224	-- Note also that the `text` replacement system is being ignored;
		225	-- with a replacement scheme one could have `Producer Text m ()`
		226	-- the relation to the replacement business needs to be thought out.
		227	-- The complicated type seems overmuch for the toy stdin above
		228	fromHandle h = go PE.streamDecodeUtf8 B.empty where
229	act = B.hGetSome h defaultChunkSize	229	act = B.hGetSome h defaultChunkSize
230	go dec = do chunk <- liftIO act	230	go dec old = do chunk <- liftIO act
231	case dec chunk of	231	if B.null chunk
232	TE.Some text _ dec' -> do yield text	232	then if B.null old then return (return ())
233	unless (B.null chunk) (go dec')	233	else return (yield old >> return ())
		234	else case dec chunk of
		235	PE.Some text bs dec' ->
		236	if T.null text then go dec' (B.append old bs)
		237	else do yield text
		238	go dec' B.empty
		239	PE.Other text bs ->
		240	if T.null text then return (do yield old
		241	yield bs
		242	PB.fromHandle h)
		243	else do yield text
		244	return (do yield bs
		245	PB.fromHandle h)
234	{-# INLINE fromHandle#-}	246	{-# INLINE fromHandle#-}
235	-- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as	247	-- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as
236	-- the dedicated Text IO function 'hGetChunk' ;	248	-- the dedicated Text IO function 'hGetChunk' ;
237	-- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut"	249	-- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut"
238	-- runs the same as the conduit equivalent, only slightly slower	250	-- runs the same as the conduit equivalent, only slightly slower
239	-- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut"	251	-- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut"
240	#else	252	-- #else
241	fromHandle h = go where	253	-- fromHandle h = go where
242	go = do txt <- liftIO (T.hGetChunk h)	254	-- go = do txt <- liftIO (T.hGetChunk h)
243	unless (T.null txt) $ do yield txt	255	-- unless (T.null txt) $ do yield txt
244	go	256	-- go
245	{-# INLINABLE fromHandle#-}	257	-- {-# INLINABLE fromHandle#-}
246	#endif	258	-- #endif
247	{-\| Stream text from a file using Pipes.Safe	259	{-\| Stream text from a file using Pipes.Safe
248		260
249	>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout	261	>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
250	MAIN = PUTSTRLN "HELLO WORLD"	262	MAIN = PUTSTRLN "HELLO WORLD"
251	-}	263	-}
252		264
253	readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m ()	265	readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ())
254	readFile file = Safe.withFile file IO.ReadMode fromHandle	266	readFile file = Safe.withFile file IO.ReadMode fromHandle
255	{-# INLINABLE readFile #-}	267	{-# INLINABLE readFile #-}
256		268
@@ -610,74 +622,44 @@ count :: (Monad m, Num n) => Text -> Producer Text m () -> m n
610	count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))	622	count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
611	{-# INLINABLE count #-}	623	{-# INLINABLE count #-}
612		624
613	#if MIN_VERSION_text(0,11,4)
614	-- \| Transform a Pipe of 'ByteString's expected to be UTF-8 encoded	625	-- \| Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
615	-- into a Pipe of Text	626	-- into a Pipe of Text
616	decodeUtf8	627	decodeUtf8
617	:: Monad m	628	:: Monad m
618	=> Producer ByteString m r -> Producer Text m (Producer ByteString m r)	629	=> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
619	decodeUtf8 = go TE.streamDecodeUtf8	630	decodeUtf8 = decodeUtf8With Nothing
620	where go dec p = do
621	x <- lift (next p)
622	case x of
623	Left r -> return (return r)
624	Right (chunk, p') -> do
625	let TE.Some text l dec' = dec chunk
626	if B.null l
627	then do
628	yield text
629	go dec' p'
630	else return $ do
631	yield l
632	p'
633	{-# INLINEABLE decodeUtf8 #-}	631	{-# INLINEABLE decodeUtf8 #-}
634		632
635	-- \| Transform a Pipe of 'ByteString's expected to be UTF-8 encoded	633	-- \| Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
636	-- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@	634	-- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@
637	-- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\"	635	-- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\"
638	decodeUtf8With	636	decodeUtf8With
639	:: Monad m	637	:: Monad m
640	=> TE.OnDecodeError	638	=> Maybe TE.OnDecodeError
641	-> Producer ByteString m r -> Producer Text m (Producer ByteString m r)	639	-> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
642	decodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr)	640	decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where
643	where go dec p = do	641	go dec old p = do
644	x <- lift (next p)	642	x <- lift (next p)
645	case x of	643	case x of
646	Left r -> return (return r)	644	Left r -> if B.null old then return (return r)
647	Right (chunk, p') -> do	645	else return (do yield old
648	let TE.Some text l dec' = dec chunk	646	return r)
649	if B.null l	647	Right (chunk, p') ->
650	then do	648	case dec chunk of
651	yield text	649	PE.Some text l dec' ->
652	go dec' p'	650	if T.null text then go dec' (B.append old l) p'
653	else return $ do	651	else do yield text
654	yield l	652	go dec' B.empty p'
655	p'	653	PE.Other text bs ->
		654	if T.null text then return (do yield old
		655	yield bs
		656	p')
		657	else do yield text
		658	return (do yield bs
		659	p')
656	{-# INLINEABLE decodeUtf8With #-}	660	{-# INLINEABLE decodeUtf8With #-}
657		661
658	-- \| A simple pipe from 'ByteString' to 'Text'; a decoding error will arise	662
659	-- with any chunk that contains a sequence of bytes that is unreadable. Otherwise
660	-- only few bytes will only be moved from one chunk to the next before decoding.
661	pipeDecodeUtf8 :: Monad m => Pipe ByteString Text m r
662	pipeDecodeUtf8 = go TE.streamDecodeUtf8
663	where go dec = do chunk <- await
664	case dec chunk of
665	TE.Some text l dec' -> do yield text
666	go dec'
667	{-# INLINEABLE pipeDecodeUtf8 #-}
668
669	-- \| A simple pipe from 'ByteString' to 'Text' using a replacement function.
670	pipeDecodeUtf8With
671	:: Monad m
672	=> TE.OnDecodeError
673	-> Pipe ByteString Text m r
674	pipeDecodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr)
675	where go dec = do chunk <- await
676	case dec chunk of
677	TE.Some text l dec' -> do yield text
678	go dec'
679	{-# INLINEABLE pipeDecodeUtf8With #-}
680	#endif
681		663
682	-- \| Splits a 'Producer' after the given number of characters	664	-- \| Splits a 'Producer' after the given number of characters
683	splitAt	665	splitAt


diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs new file mode 100644 index 0000000..05d9887 --- /dev/null +++ b/Pipes/Text/Internal.hs
@@ -0,0 +1,157 @@
		1	{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
		2	UnliftedFFITypes #-}
		3	-- This module lifts material from Brian O'Sullivan's text package
		4	-- especially Data.Text.Encoding in order to define a pipes-appropriate
		5	-- streamDecodeUtf8
		6	module Pipes.Text.Internal
		7	( Decoding(..)
		8	, streamDecodeUtf8With
		9	, streamDecodeUtf8
		10	) where
		11
		12	import Control.Exception (evaluate, try)
		13	#if __GLASGOW_HASKELL__ >= 702
		14	import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
		15	import Control.Monad.ST (ST, runST)
		16	#else
		17	import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
		18	#endif
		19	import Data.Bits ((.&.))
		20	import Data.ByteString as B
		21	import Data.ByteString.Internal as B
		22	import Data.Text ()
		23	import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
		24	import Data.Text.Internal (Text(..), safe, textP)
		25	import Data.Word (Word8, Word32)
		26	import Foreign.C.Types (CSize)
		27	import Foreign.ForeignPtr (withForeignPtr)
		28	import Foreign.Marshal.Utils (with)
		29	import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
		30	import Foreign.Storable (Storable, peek, poke)
		31	import GHC.Base hiding (ord)
		32	import GHC.Word
		33	import qualified Data.Text.Array as A
		34	import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
		35	import GHC.Word (Word8(..), Word16(..), Word32(..))
		36
		37	import Data.Text.Unsafe (unsafeDupablePerformIO)
		38
		39	#include "pipes_text_cbits.h"
		40
		41	-- \| A stream oriented decoding result.
		42	data Decoding = Some Text ByteString (ByteString -> Decoding)
		43	\| Other Text ByteString
		44	instance Show Decoding where
		45	showsPrec d (Some t bs _) = showParen (d > prec) $
		46	showString "Some " . showsPrec prec' t .
		47	showChar ' ' . showsPrec prec' bs .
		48	showString " _"
		49	where prec = 10; prec' = prec + 1
		50	showsPrec d (Other t bs) = showParen (d > prec) $
		51	showString "Other " . showsPrec prec' t .
		52	showChar ' ' . showsPrec prec' bs .
		53	showString " _"
		54	where prec = 10; prec' = prec + 1
		55
		56	newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
		57	newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
		58
		59	-- \| Decode, in a stream oriented way, a 'ByteString' containing UTF-8
		60	-- encoded text that is known to be valid.
		61	--
		62	-- If the input contains any invalid UTF-8 data, an exception will be
		63	-- thrown (either by this function or a continuation) that cannot be
		64	-- caught in pure code. For more control over the handling of invalid
		65	-- data, use 'streamDecodeUtf8With'.
		66	streamDecodeUtf8 :: ByteString -> Decoding
		67	streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode)
		68
		69	-- \| Decode, in a stream oriented way, a 'ByteString' containing UTF-8
		70	-- encoded text.
		71	streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding
		72	streamDecodeUtf8With mErr = case mErr of
		73	Nothing -> decodeWith False strictDecode
		74	Just onErr -> decodeWith True onErr
		75	where
		76	-- We create a slightly larger than necessary buffer to accommodate a
		77	-- potential surrogate pair started in the last buffer
		78	decodeWith replace onErr = decodeChunk 0 0
		79	where
		80	decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
		81	decodeChunk codepoint0 state0 bs@(PS fp off len) =
		82	runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
		83	where
		84	decodeChunkToBuffer :: A.MArray s -> IO Decoding
		85	decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
		86	with (0::CSize) $ \destOffPtr ->
		87	with codepoint0 $ \codepointPtr ->
		88	with state0 $ \statePtr ->
		89	with nullPtr $ \curPtrPtr ->
		90	let end = ptr `plusPtr` (off + len)
		91	loop curPtr = do
		92	poke curPtrPtr curPtr
		93	curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
		94	curPtrPtr end codepointPtr statePtr
		95	state <- peek statePtr
		96	case state of
		97	UTF8_REJECT ->
		98	-- We encountered an encoding error
		99	if replace
		100	then do
		101	x <- peek curPtr'
		102	case onErr desc (Just x) of
		103	Nothing -> loop $ curPtr' `plusPtr` 1
		104	Just c -> do
		105	destOff <- peek destOffPtr
		106	w <- unsafeSTToIO $
		107	unsafeWrite dest (fromIntegral destOff) (safe c)
		108	poke destOffPtr (destOff + fromIntegral w)
		109	poke statePtr 0
		110	loop $ curPtr' `plusPtr` 1
		111	else do
		112	n <- peek destOffPtr
		113	chunkText <- unsafeSTToIO $ do
		114	arr <- A.unsafeFreeze dest
		115	return $! textP arr 0 (fromIntegral n)
		116	lastPtr <- peek curPtrPtr
		117	let left = lastPtr `minusPtr` curPtr
		118	return $ Other chunkText (B.drop left bs)
		119	_ -> do
		120	-- We encountered the end of the buffer while decoding
		121	n <- peek destOffPtr
		122	codepoint <- peek codepointPtr
		123	chunkText <- unsafeSTToIO $ do
		124	arr <- A.unsafeFreeze dest
		125	return $! textP arr 0 (fromIntegral n)
		126	lastPtr <- peek curPtrPtr
		127	let left = lastPtr `minusPtr` curPtr
		128	return $ Some chunkText (B.drop left bs)
		129	(decodeChunk codepoint state)
		130	in loop (ptr `plusPtr` off)
		131	desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
		132
		133	ord :: Char -> Int
		134	ord (C# c#) = I# (ord# c#)
		135	{-# INLINE ord #-}
		136
		137
		138	unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
		139	unsafeWrite marr i c
		140	\| n < 0x10000 = do
		141	A.unsafeWrite marr i (fromIntegral n)
		142	return 1
		143	\| otherwise = do
		144	A.unsafeWrite marr i lo
		145	A.unsafeWrite marr (i+1) hi
		146	return 2
		147	where n = ord c
		148	m = n - 0x10000
		149	lo = fromIntegral $ (m `shiftR` 10) + 0xD800
		150	hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
		151	shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
		152	{-# INLINE unsafeWrite #-}
		153
		154	foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
		155	:: MutableByteArray# s -> Ptr CSize
		156	-> Ptr (Ptr Word8) -> Ptr Word8
		157	-> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file