-{-# LANGUAGE RankNTypes, TypeFamilies #-}
+{-# LANGUAGE RankNTypes, TypeFamilies, BangPatterns #-}
{-| This module provides @pipes@ utilities for \"text streams\", which are
streams of 'Text' chunks. The individual chunks are uniformly @strict@, but
-- | Convert a lazy 'TL.Text' into a 'Producer' of strict 'Text's
fromLazy :: (Monad m) => TL.Text -> Producer' Text m ()
fromLazy = foldrChunks (\e a -> yield e >> a) (return ())
-{-# INLINABLE fromLazy #-}
+{-# INLINE fromLazy #-}
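-- A minimal usage sketch (not part of this patch): stream the chunks of a
-- lazy 'TL.Text' to standard output with 'fromLazy' and the 'stdout'
-- consumer defined below; 'chunksToStdout' is just an illustrative name.
chunksToStdout :: IO ()
chunksToStdout = runEffect $ fromLazy (TL.pack "hello, world\n") >-> stdout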
-- | Stream text from 'stdin'
-stdin :: MonadIO m => Producer Text m (Producer ByteString m ())
+stdin :: MonadIO m => Producer Text m ()
stdin = fromHandle IO.stdin
-{-# INLINABLE stdin #-}
+{-# INLINE stdin #-}
{-| Convert a 'IO.Handle' into a text stream using a text size
- determined by the good sense of the text library.
+    determined by the good sense of the text library; note that this
+    is distinctly slower than @decodeUtf8 (Pipes.ByteString.fromHandle h)@
+    but uses the system encoding and has the other "Data.Text.IO" features
-}
-fromHandle :: MonadIO m => IO.Handle -> Producer Text m (Producer ByteString m ())
-fromHandle h = decodeUtf8 (PB.fromHandle h)
-{-# INLINE fromHandle#-}
+fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()
+fromHandle h = go where
+ go = do txt <- liftIO (T.hGetChunk h)
+ unless (T.null txt) $ do yield txt
+ go
+{-# INLINABLE fromHandle #-}
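-- Comparison sketch (not part of this patch) of the two ways to read a
-- 'IO.Handle' mentioned above.  'readViaTextIO' and 'readViaBytes' are
-- illustrative names; the second assumes @Pipes.ByteString@ is imported
-- qualified as @PB@, as in the code this patch replaces.
readViaTextIO :: IO.Handle -> Producer Text IO ()
readViaTextIO h = fromHandle h                               -- system encoding, Data.Text.IO semantics

readViaBytes :: IO.Handle -> Producer Text IO ()
readViaBytes h = decodeUtf8 (PB.fromHandle h) >> return ()   -- raw UTF-8 decode; discards any undecodable leftovers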
-{-| Stream text from a file using Pipes.Safe
+
+{-| Stream text from a file in the simple fashion of @Data.Text.IO@
>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
MAIN = PUTSTRLN "HELLO WORLD"
-}
-readFile :: (MonadSafe m) => FilePath -> Producer Text m (Producer ByteString m ())
+readFile :: MonadSafe m => FilePath -> Producer Text m ()
readFile file = Safe.withFile file IO.ReadMode fromHandle
-{-# INLINABLE readFile #-}
+{-# INLINE readFile #-}
{-| Stream lines of text from stdin (for testing in ghci etc.)
txt <- liftIO (T.hGetLine IO.stdin)
yield txt
go
-
+{-# INLINABLE stdinLn #-}
{-| Stream text to 'stdout'
-- | Stream text into a file. Uses @pipes-safe@.
writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
writeFile file = Safe.withFile file IO.WriteMode toHandle
+{-# INLINE writeFile #-}
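-- Usage sketch (not part of this patch): copy a file by wiring 'readFile'
-- into 'writeFile' under 'Safe.runSafeT'.  'copyTextFile' is an illustrative
-- name, and the sketch assumes the clashing 'Prelude' names are hidden in
-- this module.
copyTextFile :: FilePath -> FilePath -> IO ()
copyTextFile src dst = Safe.runSafeT $ runEffect $ readFile src >-> writeFile dst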
-- | Apply a transformation to each 'Char' in the stream
map :: (Monad m) => (Char -> Char) -> Pipe Text Text m r
decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
decodeUtf8 = go B.empty PE.streamDecodeUtf8 where
- go carry dec0 p = do
+ go !carry dec0 p = do
x <- lift (next p)
case x of Left r -> if B.null carry
- then return (return r) -- all input was consumed
+                  then return (return r) -- all bytestring input was consumed
else return (do yield carry -- a potentially valid fragment remains
return r)
PE.Other text bs -> do yield text
return (do yield bs -- an invalid blob remains
p')
+{-# INLINABLE decodeUtf8 #-}
+
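-- Usage sketch (not part of this patch): the leftover 'Producer' returned by
-- 'decodeUtf8' can be inspected to detect invalid input; 'decodeOrFail' is a
-- hypothetical helper name.
decodeOrFail :: Monad m => Producer ByteString m r -> Producer Text m (Maybe r)
decodeOrFail p = do
    leftover <- decodeUtf8 p            -- leftover :: Producer ByteString m r
    x <- lift (next leftover)
    case x of
        Left r  -> return (Just r)      -- the byte stream was entirely valid UTF-8
        Right _ -> return Nothing       -- undecodable bytes remained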
-- | Splits a 'Producer' after the given number of characters
splitAt
module Pipes.Text.Internal
( Decoding(..)
, streamDecodeUtf8
+ , decodeSomeUtf8
) where
-
-import Control.Exception (evaluate, try)
-#if __GLASGOW_HASKELL__ >= 702
import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
import Control.Monad.ST (ST, runST)
-#else
-import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
-#endif
import Data.Bits ((.&.))
-import Data.ByteString as B
-import Data.ByteString.Internal as B
-import Data.Text ()
-import qualified Data.Text as T
-import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
-import Data.Text.Internal (Text(..), safe, textP)
-import Data.Word (Word8, Word32)
+import Data.ByteString as B
+import Data.ByteString.Internal as B
+import qualified Data.Text as T (null)
+import Data.Text.Encoding.Error ()
+import Data.Text.Internal (Text, textP)
import Foreign.C.Types (CSize)
import Foreign.ForeignPtr (withForeignPtr)
import Foreign.Marshal.Utils (with)
import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
import Foreign.Storable (Storable, peek, poke)
-import GHC.Base hiding (ord)
-import GHC.Word
+import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
+import GHC.Word (Word8, Word32)
import qualified Data.Text.Array as A
-import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
-import GHC.Word (Word8(..), Word16(..), Word32(..))
-
-import Data.Text.Unsafe (unsafeDupablePerformIO)
-
+
#include "pipes_text_cbits.h"
-- | A stream oriented decoding result.
showChar ' ' . showsPrec prec' bs .
showString " _"
where prec = 10; prec' = prec + 1
-
+
newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
streamDecodeUtf8 :: ByteString -> Decoding
-streamDecodeUtf8 = decodeChunk B.empty 0 0
+streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
+ where
+ decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
+ decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
+ runST $ do marray <- A.new (len+1)
+ unsafeIOToST (decodeChunkToBuffer marray)
+ where
+ decodeChunkToBuffer :: A.MArray s -> IO Decoding
+ decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+ with (0::CSize) $ \destOffPtr ->
+ with codepoint0 $ \codepointPtr ->
+ with state0 $ \statePtr ->
+ with nullPtr $ \curPtrPtr ->
+ do let end = ptr `plusPtr` (off + len)
+ curPtr = ptr `plusPtr` off
+ poke curPtrPtr curPtr
+ c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+ state <- peek statePtr
+ lastPtr <- peek curPtrPtr
+ codepoint <- peek codepointPtr
+ n <- peek destOffPtr
+ chunkText <- mkText dest n
+ let left = lastPtr `minusPtr` curPtr
+ remaining = B.drop left bs
+ accum = if T.null chunkText then B.append old remaining else remaining
+ return $! case state of
+ UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
+ _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
+ {-# INLINE decodeChunkToBuffer #-}
+ {-# INLINE decodeChunkUtf8 #-}
+{-# INLINE streamDecodeUtf8 #-}
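-- Sketch (not part of this patch) of driving 'streamDecodeUtf8' by hand over
-- a list of chunks, following the continuation carried by 'Some';
-- 'decodeChunks' is a hypothetical helper name.
decodeChunks :: [ByteString] -> [Text]
decodeChunks = go streamDecodeUtf8
  where
    go _ []     = []
    go k (b:bs) = case k b of
        Some t _ k' -> t : go k' bs     -- feed the next chunk to the continuation
        Other t _   -> [t]              -- encoding error: stop at the last good text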
-decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
-decodeChunk old codepoint0 state0 bs@(PS fp off len) =
- runST $ do marray <- A.new (len+1)
- unsafeIOToST (decodeChunkToBuffer marray)
- where
- decodeChunkToBuffer :: A.MArray s -> IO Decoding
- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
- with (0::CSize) $ \destOffPtr ->
- with codepoint0 $ \codepointPtr ->
- with state0 $ \statePtr ->
- with nullPtr $ \curPtrPtr ->
+decodeSomeUtf8 :: ByteString -> (Text, ByteString)
+decodeSomeUtf8 bs@(PS fp off len) = runST $ do
+ dest <- A.new (len+1)
+ unsafeIOToST $
+ withForeignPtr fp $ \ptr ->
+ with (0::CSize) $ \destOffPtr ->
+ with (0::CodePoint) $ \codepointPtr ->
+ with (0::DecoderState) $ \statePtr ->
+ with nullPtr $ \curPtrPtr ->
do let end = ptr `plusPtr` (off + len)
curPtr = ptr `plusPtr` off
poke curPtrPtr curPtr
- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
+ c_decode_utf8_with_state (A.maBA dest) destOffPtr
+ curPtrPtr end codepointPtr statePtr
state <- peek statePtr
lastPtr <- peek curPtrPtr
codepoint <- peek codepointPtr
n <- peek destOffPtr
- chunkText <- mkText dest n
+ chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
+ return $! textP arr 0 (fromIntegral n)
let left = lastPtr `minusPtr` curPtr
remaining = B.drop left bs
- accum = if T.null chunkText then B.append old remaining else remaining
- return $ case state of
- UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
- _ -> Some chunkText accum (decodeChunk accum codepoint state)
+ return $! (chunkText, remaining)
+{-# INLINE decodeSomeUtf8 #-}
+
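-- Sketch (not part of this patch): looping 'decodeSomeUtf8' over a list of
-- chunks, prepending the undecoded remainder to the next chunk before trying
-- again; 'decodeAll' is a hypothetical helper name.  An encoding error shows
-- up as a non-empty final remainder.
decodeAll :: [ByteString] -> ([Text], ByteString)
decodeAll = go B.empty
  where
    go carry []     = ([], carry)
    go carry (b:bs) = let (t, rest)      = decodeSomeUtf8 (B.append carry b)
                          (ts, leftover) = go rest bs
                      in  (t : ts, leftover)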
mkText :: A.MArray s -> CSize -> IO Text
mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
return $! textP arr 0 (fromIntegral n)
+{-# INLINE mkText #-}
ord :: Char -> Int
ord (C# c#) = I# (ord# c#)
lo = fromIntegral $ (m `shiftR` 10) + 0xD800
hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
+ {-# INLINE shiftR #-}
{-# INLINE unsafeWrite #-}
foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
--- /dev/null
+import qualified Data.Text.IO as T
+import qualified Data.Text as T
+import qualified Data.Text.Lazy.IO as TL
+import qualified Data.Text.Lazy as TL
+
+import Pipes
+import qualified Pipes.Text as TP
+import qualified Pipes.ByteString as BP
+import Pipes.Safe
+
+main = textaction
+big = "../../examples/txt/words2.txt"
+
+textaction = T.readFile big >>= T.putStrLn
+pipeaction = runSafeT $ runEffect $ for (TP.readFile big) (liftIO . T.putStrLn)
+
* state0 != UTF8_ACCEPT, UTF8_REJECT
*
*/
-const uint8_t *
-_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
- const uint8_t **const src,
- const uint8_t *const srcend,
- uint32_t *codepoint0, uint32_t *state0)
+
+#if defined(__GNUC__) || defined(__clang__)
+static inline uint8_t const *
+_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+                      const uint8_t const **src, const uint8_t const *srcend,
+                      uint32_t *codepoint0, uint32_t *state0)
+  __attribute__((always_inline));
+#endif
+
+static inline uint8_t const *
+_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+ const uint8_t const **src, const uint8_t const *srcend,
+ uint32_t *codepoint0, uint32_t *state0)
{
- uint16_t *d = dest + *destoff;
- const uint8_t *s = *src, *last = *src;
- uint32_t state = *state0;
- uint32_t codepoint = *codepoint0;
+ uint16_t *d = dest + *destoff;
+ const uint8_t *s = *src, *last = *src;
+ uint32_t state = *state0;
+ uint32_t codepoint = *codepoint0;
- while (s < srcend) {
+ while (s < srcend) {
#if defined(__i386__) || defined(__x86_64__)
- /*
- * This code will only work on a little-endian system that
- * supports unaligned loads.
- *
- * It gives a substantial speed win on data that is purely or
- * partly ASCII (e.g. HTML), at only a slight cost on purely
- * non-ASCII text.
- */
-
- if (state == UTF8_ACCEPT) {
- while (s < srcend - 4) {
+ /*
+ * This code will only work on a little-endian system that
+ * supports unaligned loads.
+ *
+ * It gives a substantial speed win on data that is purely or
+ * partly ASCII (e.g. HTML), at only a slight cost on purely
+ * non-ASCII text.
+ */
+
+ if (state == UTF8_ACCEPT) {
+ while (s < srcend - 4) {
codepoint = *((uint32_t *) s);
if ((codepoint & 0x80808080) != 0)
break;
*d++ = (uint16_t) ((codepoint >> 8) & 0xff);
*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
- }
- last = s;
- }
+ }
+ last = s;
+ }
#endif
- if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
- if (state != UTF8_REJECT)
+ if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
+ if (state != UTF8_REJECT)
continue;
- break;
- }
-
- if (codepoint <= 0xffff)
- *d++ = (uint16_t) codepoint;
- else {
- *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
- *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
- }
- last = s;
- }
-
- /* Invalid encoding, back up to the errant character */
- if (state == UTF8_REJECT)
- s -= 1;
-
- *destoff = d - dest;
- *codepoint0 = codepoint;
- *state0 = state;
- *src = last;
-
- return s;
+ break;
+ }
+
+ if (codepoint <= 0xffff)
+ *d++ = (uint16_t) codepoint;
+ else {
+ *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
+ *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
+ }
+ last = s;
+ }
+
+ *destoff = d - dest;
+ *codepoint0 = codepoint;
+ *state0 = state;
+ *src = last;
+
+ return s;
+}
+
+uint8_t const *
+_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
+ const uint8_t const **src,
+ const uint8_t const *srcend,
+ uint32_t *codepoint0, uint32_t *state0)
+{
+ uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
+ codepoint0, state0);
+ if (*state0 == UTF8_REJECT)
+ ret -=1;
+ return ret;
}
pipes-bytestring >= 1.0 && < 1.2,
transformers >= 0.3 && < 0.4,
text >=0.11 && < 0.12,
- bytestring >=0.10 && < 0.11
+ bytestring >=0.10 && < 0.11,
+ vector,
+ void
-- hs-source-dirs:
- default-language: Haskell2010
\ No newline at end of file
+ default-language: Haskell2010
+ ghc-options: -O2
main :: IO ()
main = defaultMain [tests]
-- >>> :main -a 10000
-
tests = testGroup "stream_decode" [
-- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed ,
- testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe]
+ testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe,
+ testProperty "t_utf8_dec_some" t_utf8_dec_some]
t_utf8_incr_valid = do
Positive n <- arbitrary
chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
+--
+t_utf8_dec_some = do
+ Positive m <- arbitrary
+ txt <- genUnicode
+ let bytesLength = mod 10 m :: Int
+ forAll (vector bytesLength) $
+ (roundtrip . appendBytes txt)
+ `eq`
+ appendBytes txt
+ where
+ roundtrip bs = case PE.decodeSomeUtf8 bs of
+ (txt,bys) -> E.encodeUtf8 txt <> bys
+ appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append