filter,
scan,
encodeUtf8,
-#if MIN_VERSION_text(0,11,4)
- pipeDecodeUtf8,
- pipeDecodeUtf8With,
-#endif
pack,
unpack,
toCaseFold,
group,
lines,
words,
-#if MIN_VERSION_text(0,11,4)
decodeUtf8,
decodeUtf8With,
-#endif
-- * Transformations
intersperse,
import Pipes
import qualified Pipes.ByteString as PB
import qualified Pipes.ByteString.Parse as PBP
+import qualified Pipes.Text.Internal as PE
import Pipes.Text.Parse (
nextChar, drawChar, unDrawChar, peekChar, isEndOfChars )
import Pipes.Core (respond, Server')
{-# INLINABLE fromLazy #-}
-- | Stream text from 'stdin'
-stdin :: MonadIO m => Producer' Text m ()
+stdin :: MonadIO m => Producer' Text m (Producer ByteString m ())
stdin = fromHandle IO.stdin
{-# INLINABLE stdin #-}
{-| Convert a 'IO.Handle' into a text stream using a text size
determined by the good sense of the text library.
-
-}
-fromHandle :: MonadIO m => IO.Handle -> Producer' Text m ()
-#if MIN_VERSION_text(0,11,4)
-fromHandle h = go TE.streamDecodeUtf8 where
+fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ())
+-- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)`
+-- if only so that mistakes can be concentrated in one place.
+-- This modifies something that was faster on an earlier iteration.
+-- Note also that the `text` replacement system is being ignored;
+-- with a replacement scheme one could have `Producer Text m ()`
+-- the relation to the replacement business needs to be thought out.
+-- The complicated type seems overmuch for the toy stdin above
+fromHandle h = go PE.streamDecodeUtf8 B.empty where
act = B.hGetSome h defaultChunkSize
- go dec = do chunk <- liftIO act
- case dec chunk of
- TE.Some text _ dec' -> do yield text
- unless (B.null chunk) (go dec')
+ go dec old = do chunk <- liftIO act
+ if B.null chunk
+ then if B.null old then return (return ())
+ else return (yield old >> return ())
+ else case dec chunk of
+ PE.Some text bs dec' ->
+ if T.null text then go dec' (B.append old bs)
+ else do yield text
+ go dec' B.empty
+ PE.Other text bs ->
+ if T.null text then return (do yield old
+ yield bs
+ PB.fromHandle h)
+ else do yield text
+ return (do yield bs
+ PB.fromHandle h)
{-# INLINE fromHandle#-}
-- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as
-- the dedicated Text IO function 'hGetChunk' ;
-- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut"
-- runs the same as the conduit equivalent, only slightly slower
-- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut"
-#else
-fromHandle h = go where
- go = do txt <- liftIO (T.hGetChunk h)
- unless (T.null txt) $ do yield txt
- go
-{-# INLINABLE fromHandle#-}
-#endif
+-- #else
+-- fromHandle h = go where
+-- go = do txt <- liftIO (T.hGetChunk h)
+-- unless (T.null txt) $ do yield txt
+-- go
+-- {-# INLINABLE fromHandle#-}
+-- #endif
{-| Stream text from a file using Pipes.Safe
>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
MAIN = PUTSTRLN "HELLO WORLD"
-}
-readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m ()
+readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ())
readFile file = Safe.withFile file IO.ReadMode fromHandle
{-# INLINABLE readFile #-}
count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
{-# INLINABLE count #-}
-#if MIN_VERSION_text(0,11,4)
-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
-- into a Pipe of Text
decodeUtf8
:: Monad m
=> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
-decodeUtf8 = go TE.streamDecodeUtf8
- where go dec p = do
- x <- lift (next p)
- case x of
- Left r -> return (return r)
- Right (chunk, p') -> do
- let TE.Some text l dec' = dec chunk
- if B.null l
- then do
- yield text
- go dec' p'
- else return $ do
- yield l
- p'
+decodeUtf8 = decodeUtf8With Nothing
{-# INLINEABLE decodeUtf8 #-}
-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
-- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@
-- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\"
-decodeUtf8With
+decodeUtf8With
:: Monad m
- => TE.OnDecodeError
+ => Maybe TE.OnDecodeError
-> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
-decodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr)
- where go dec p = do
- x <- lift (next p)
- case x of
- Left r -> return (return r)
- Right (chunk, p') -> do
- let TE.Some text l dec' = dec chunk
- if B.null l
- then do
- yield text
- go dec' p'
- else return $ do
- yield l
- p'
+decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where
+ go dec old p = do
+ x <- lift (next p)
+ case x of
+ Left r -> if B.null old then return (return r)
+ else return (do yield old
+ return r)
+ Right (chunk, p') ->
+ case dec chunk of
+ PE.Some text l dec' ->
+ if T.null text then go dec' (B.append old l) p'
+ else do yield text
+ go dec' B.empty p'
+ PE.Other text bs ->
+ if T.null text then return (do yield old
+ yield bs
+ p')
+ else do yield text
+ return (do yield bs
+ p')
{-# INLINEABLE decodeUtf8With #-}
--- | A simple pipe from 'ByteString' to 'Text'; a decoding error will arise
--- with any chunk that contains a sequence of bytes that is unreadable. Otherwise
--- only few bytes will only be moved from one chunk to the next before decoding.
-pipeDecodeUtf8 :: Monad m => Pipe ByteString Text m r
-pipeDecodeUtf8 = go TE.streamDecodeUtf8
- where go dec = do chunk <- await
- case dec chunk of
- TE.Some text l dec' -> do yield text
- go dec'
-{-# INLINEABLE pipeDecodeUtf8 #-}
-
--- | A simple pipe from 'ByteString' to 'Text' using a replacement function.
-pipeDecodeUtf8With
- :: Monad m
- => TE.OnDecodeError
- -> Pipe ByteString Text m r
-pipeDecodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr)
- where go dec = do chunk <- await
- case dec chunk of
- TE.Some text l dec' -> do yield text
- go dec'
-{-# INLINEABLE pipeDecodeUtf8With #-}
-#endif
+
-- | Splits a 'Producer' after the given number of characters
splitAt
--- /dev/null
+{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
+ UnliftedFFITypes #-}
+-- This module lifts material from Brian O'Sullivan's text package
+-- especially Data.Text.Encoding in order to define a pipes-appropriate
+-- streamDecodeUtf8
+module Pipes.Text.Internal
+ ( Decoding(..)
+ , streamDecodeUtf8With
+ , streamDecodeUtf8
+ ) where
+
+import Control.Exception (evaluate, try)
+#if __GLASGOW_HASKELL__ >= 702
+import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
+import Control.Monad.ST (ST, runST)
+#else
+import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
+#endif
+import Data.Bits ((.&.))
+import Data.ByteString as B
+import Data.ByteString.Internal as B
+import Data.Text ()
+import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
+import Data.Text.Internal (Text(..), safe, textP)
+import Data.Word (Word8, Word32)
+import Foreign.C.Types (CSize)
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Marshal.Utils (with)
+import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
+import Foreign.Storable (Storable, peek, poke)
+import GHC.Base hiding (ord)
+import GHC.Word
+import qualified Data.Text.Array as A
+import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
+import GHC.Word (Word8(..), Word16(..), Word32(..))
+
+import Data.Text.Unsafe (unsafeDupablePerformIO)
+
+#include "pipes_text_cbits.h"
+
+-- | A stream oriented decoding result.
+data Decoding = Some Text ByteString (ByteString -> Decoding)
+ | Other Text ByteString
+instance Show Decoding where
+ showsPrec d (Some t bs _) = showParen (d > prec) $
+ showString "Some " . showsPrec prec' t .
+ showChar ' ' . showsPrec prec' bs .
+ showString " _"
+ where prec = 10; prec' = prec + 1
+ showsPrec d (Other t bs) = showParen (d > prec) $
+ showString "Other " . showsPrec prec' t .
+ showChar ' ' . showsPrec prec' bs .
+ showString " _"
+ where prec = 10; prec' = prec + 1
+
+newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
+newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
+
+-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
+-- encoded text that is known to be valid.
+--
+-- If the input contains any invalid UTF-8 data, an exception will be
+-- thrown (either by this function or a continuation) that cannot be
+-- caught in pure code. For more control over the handling of invalid
+-- data, use 'streamDecodeUtf8With'.
+streamDecodeUtf8 :: ByteString -> Decoding
+streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode)
+
+-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
+-- encoded text.
+streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding
+streamDecodeUtf8With mErr = case mErr of
+ Nothing -> decodeWith False strictDecode
+ Just onErr -> decodeWith True onErr
+ where
+ -- We create a slightly larger than necessary buffer to accommodate a
+ -- potential surrogate pair started in the last buffer
+ decodeWith replace onErr = decodeChunk 0 0
+ where
+ decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
+ decodeChunk codepoint0 state0 bs@(PS fp off len) =
+ runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
+ where
+ decodeChunkToBuffer :: A.MArray s -> IO Decoding
+ decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+ with (0::CSize) $ \destOffPtr ->
+ with codepoint0 $ \codepointPtr ->
+ with state0 $ \statePtr ->
+ with nullPtr $ \curPtrPtr ->
+ let end = ptr `plusPtr` (off + len)
+ loop curPtr = do
+ poke curPtrPtr curPtr
+ curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
+ curPtrPtr end codepointPtr statePtr
+ state <- peek statePtr
+ case state of
+ UTF8_REJECT ->
+ -- We encountered an encoding error
+ if replace
+ then do
+ x <- peek curPtr'
+ case onErr desc (Just x) of
+ Nothing -> loop $ curPtr' `plusPtr` 1
+ Just c -> do
+ destOff <- peek destOffPtr
+ w <- unsafeSTToIO $
+ unsafeWrite dest (fromIntegral destOff) (safe c)
+ poke destOffPtr (destOff + fromIntegral w)
+ poke statePtr 0
+ loop $ curPtr' `plusPtr` 1
+ else do
+ n <- peek destOffPtr
+ chunkText <- unsafeSTToIO $ do
+ arr <- A.unsafeFreeze dest
+ return $! textP arr 0 (fromIntegral n)
+ lastPtr <- peek curPtrPtr
+ let left = lastPtr `minusPtr` curPtr
+ return $ Other chunkText (B.drop left bs)
+ _ -> do
+ -- We encountered the end of the buffer while decoding
+ n <- peek destOffPtr
+ codepoint <- peek codepointPtr
+ chunkText <- unsafeSTToIO $ do
+ arr <- A.unsafeFreeze dest
+ return $! textP arr 0 (fromIntegral n)
+ lastPtr <- peek curPtrPtr
+ let left = lastPtr `minusPtr` curPtr
+ return $ Some chunkText (B.drop left bs)
+ (decodeChunk codepoint state)
+ in loop (ptr `plusPtr` off)
+ desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
+
+ord :: Char -> Int
+ord (C# c#) = I# (ord# c#)
+{-# INLINE ord #-}
+
+
+unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
+unsafeWrite marr i c
+ | n < 0x10000 = do
+ A.unsafeWrite marr i (fromIntegral n)
+ return 1
+ | otherwise = do
+ A.unsafeWrite marr i lo
+ A.unsafeWrite marr (i+1) hi
+ return 2
+ where n = ord c
+ m = n - 0x10000
+ lo = fromIntegral $ (m `shiftR` 10) + 0xD800
+ hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
+ shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
+{-# INLINE unsafeWrite #-}
+
+foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
+ :: MutableByteArray# s -> Ptr CSize
+ -> Ptr (Ptr Word8) -> Ptr Word8
+ -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
\ No newline at end of file
--- /dev/null
+/*
+ * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
+ *
+ * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
+ *
+ * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+ */
+
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "pipes_text_cbits.h"
+
+
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+static const uint8_t utf8d[] = {
+ /*
+ * The first part of the table maps bytes to character classes that
+ * to reduce the size of the transition table and create bitmasks.
+ */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+ /*
+ * The second part is a transition table that maps a combination of
+ * a state of the automaton and a character class to a state.
+ */
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+ 12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+static inline uint32_t
+decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
+ uint32_t type = utf8d[byte];
+
+ *codep = (*state != UTF8_ACCEPT) ?
+ (byte & 0x3fu) | (*codep << 6) :
+ (0xff >> type) & (byte);
+
+ return *state = utf8d[256 + *state + type];
+}
+
+/*
+ * A best-effort decoder. Runs until it hits either end of input or
+ * the start of an invalid byte sequence.
+ *
+ * At exit, we update *destoff with the next offset to write to, *src
+ * with the next source location past the last one successfully
+ * decoded, and return the next source location to read from.
+ *
+ * Moreover, we expose the internal decoder state (state0 and
+ * codepoint0), allowing one to restart the decoder after it
+ * terminates (say, due to a partial codepoint).
+ *
+ * In particular, there are a few possible outcomes,
+ *
+ * 1) We decoded the buffer entirely:
+ * In this case we return srcend
+ * state0 == UTF8_ACCEPT
+ *
+ * 2) We met an invalid encoding
+ * In this case we return the address of the first invalid byte
+ * state0 == UTF8_REJECT
+ *
+ * 3) We reached the end of the buffer while decoding a codepoint
+ * In this case we return a pointer to the first byte of the partial codepoint
+ * state0 != UTF8_ACCEPT, UTF8_REJECT
+ *
+ */
+const uint8_t *
+_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
+ const uint8_t **const src,
+ const uint8_t *const srcend,
+ uint32_t *codepoint0, uint32_t *state0)
+{
+ uint16_t *d = dest + *destoff;
+ const uint8_t *s = *src, *last = *src;
+ uint32_t state = *state0;
+ uint32_t codepoint = *codepoint0;
+
+ while (s < srcend) {
+#if defined(__i386__) || defined(__x86_64__)
+ /*
+ * This code will only work on a little-endian system that
+ * supports unaligned loads.
+ *
+ * It gives a substantial speed win on data that is purely or
+ * partly ASCII (e.g. HTML), at only a slight cost on purely
+ * non-ASCII text.
+ */
+
+ if (state == UTF8_ACCEPT) {
+ while (s < srcend - 4) {
+ codepoint = *((uint32_t *) s);
+ if ((codepoint & 0x80808080) != 0)
+ break;
+ s += 4;
+
+ /*
+ * Tried 32-bit stores here, but the extra bit-twiddling
+ * slowed the code down.
+ */
+
+ *d++ = (uint16_t) (codepoint & 0xff);
+ *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
+ *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
+ *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
+ }
+ last = s;
+ }
+#endif
+
+ if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
+ if (state != UTF8_REJECT)
+ continue;
+ break;
+ }
+
+ if (codepoint <= 0xffff)
+ *d++ = (uint16_t) codepoint;
+ else {
+ *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
+ *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
+ }
+ last = s;
+ }
+
+ /* Invalid encoding, back up to the errant character */
+ if (state == UTF8_REJECT)
+ s -= 1;
+
+ *destoff = d - dest;
+ *codepoint0 = codepoint;
+ *state0 = state;
+ *src = last;
+
+ return s;
+}
+
--- /dev/null
+/*
+ * Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>.
+ */
+
+#ifndef _pipes_text_cbits_h
+#define _pipes_text_cbits_h
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+#endif
cabal-version: >=1.10
library
- exposed-modules: Pipes.Text, Pipes.Text.Parse
+ c-sources: cbits/cbits.c
+ include-dirs: include
+ exposed-modules: Pipes.Text, Pipes.Text.Parse, Pipes.Text.Internal
-- other-modules:
other-extensions: RankNTypes
build-depends: base >= 4 && < 5 ,
--- /dev/null
+import Utils
+
+import Test.QuickCheck
+import Test.QuickCheck.Monadic
+import Test.Framework (Test, testGroup, defaultMain)
+import Test.Framework.Providers.QuickCheck2 (testProperty)
+
+import Control.Exception (catch)
+import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord)
+import Data.Monoid (Monoid(..))
+import Data.String (fromString)
+import Data.Text.Encoding.Error
+import qualified Data.List as L
+
+import qualified Data.Bits as Bits (shiftL, shiftR)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Encoding as E
+import qualified Pipes.Text.Internal as PE
+
+main :: IO ()
+main = defaultMain [tests]
+-- >>> :main -a 10000
+
+tests = testGroup "stream_decode" [
+
+ testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
+ testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed]
+
+t_utf8_incr_valid = do
+ Positive n <- arbitrary
+ forAll genUnicode $ recode n `eq` id
+ where recode n = T.concat . feedChunksOf n PE.streamDecodeUtf8 . E.encodeUtf8
+ feedChunksOf :: Int -> (B.ByteString -> PE.Decoding) -> B.ByteString
+ -> [T.Text]
+ feedChunksOf n f bs
+ | B.null bs = []
+ | otherwise = let (a,b) = B.splitAt n bs
+ PE.Some t _ f' = f a
+ in case f a of
+ PE.Some t _ f' -> t : feedChunksOf n f' b
+ _ -> []
+
+t_utf8_incr_mixed = do
+ Positive n <- arbitrary
+ txt <- genUnicode
+ forAll (vector 9) $ (roundtrip . chunk (mod n 7 + 1) . appendBytes txt) `eq` appendBytes txt
+ where
+ roundtrip :: [B.ByteString] -> B.ByteString
+ roundtrip bss = go (PE.streamDecodeUtf8With Nothing) B.empty B.empty bss where
+ go dec acc old [] = acc <> old
+ go dec acc old (bs:bss) = case dec bs of
+ PE.Some t new dec' -> if T.null t then go dec' (acc <> E.encodeUtf8 t) (old <> new) bss
+ else go dec' (acc <> E.encodeUtf8 t) new bss
+ PE.Other t bs' -> if T.null t then acc <> old <> bs <> B.concat bss
+ else acc <> E.encodeUtf8 t <> bs' <> B.concat bss
+ chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
+ appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
--- /dev/null
+{-#LANGUAGE ScopedTypeVariables#-}
+module Utils where
+import Control.Exception (SomeException, bracket, bracket_, evaluate, try)
+import System.IO.Unsafe (unsafePerformIO)
+import Debug.Trace (trace)
+import Data.Bits ((.&.))
+import Data.Char (chr)
+import Data.String (IsString, fromString)
+import System.Random (Random (..), RandomGen)
+import Test.QuickCheck hiding ((.&.))
+import Test.QuickCheck.Monadic (assert, monadicIO, run)
+import qualified Data.ByteString as B
+import Pipes.Text.Internal
+
+
+
+
+
+-- Ensure that two potentially bottom values (in the sense of crashing
+-- for some inputs, not looping infinitely) either both crash, or both
+-- give comparable results for some input.
+(=^=) :: (Eq a, Show a) => a -> a -> Bool
+i =^= j = unsafePerformIO $ do
+ x <- try (evaluate i)
+ y <- try (evaluate j)
+ case (x,y) of
+ (Left (_ :: SomeException), Left (_ :: SomeException))
+ -> return True
+ (Right a, Right b) -> return (a == b)
+ e -> trace ("*** Divergence: " ++ show e) return False
+infix 4 =^=
+{-# NOINLINE (=^=) #-}
+
+-- Do two functions give the same answer?
+eq :: (Eq a, Show a) => (t -> a) -> (t -> a) -> t -> Bool
+eq a b s = a s =^= b s
+
+-- What about with the RHS packed?
+-- eqP :: (Eq a, Show a, Stringy s) =>
+-- (String -> a) -> (s -> a) -> String -> Word8 -> Bool
+-- eqP f g s w = eql "orig" (f s) (g t) &&
+-- eql "mini" (f s) (g mini) &&
+-- eql "head" (f sa) (g ta) &&
+-- eql "tail" (f sb) (g tb)
+-- where t = packS s
+-- mini = packSChunkSize 10 s
+-- (sa,sb) = splitAt m s
+-- (ta,tb) = splitAtS m t
+-- l = length s
+-- m | l == 0 = n
+-- | otherwise = n `mod` l
+-- n = fromIntegral w
+-- eql d a b
+-- | a =^= b = True
+-- | otherwise = trace (d ++ ": " ++ show a ++ " /= " ++ show b) False
+
+
+instance Arbitrary B.ByteString where
+ arbitrary = B.pack `fmap` arbitrary
+
+genUnicode :: IsString a => Gen a
+genUnicode = fmap fromString string where
+ string = sized $ \n ->
+ do k <- choose (0,n)
+ sequence [ char | _ <- [1..k] ]
+
+ excluding :: [a -> Bool] -> Gen a -> Gen a
+ excluding bad gen = loop
+ where
+ loop = do
+ x <- gen
+ if or (map ($ x) bad)
+ then loop
+ else return x
+
+ reserved = [lowSurrogate, highSurrogate, noncharacter]
+ lowSurrogate c = c >= 0xDC00 && c <= 0xDFFF
+ highSurrogate c = c >= 0xD800 && c <= 0xDBFF
+ noncharacter c = masked == 0xFFFE || masked == 0xFFFF
+ where
+ masked = c .&. 0xFFFF
+
+ ascii = choose (0,0x7F)
+ plane0 = choose (0xF0, 0xFFFF)
+ plane1 = oneof [ choose (0x10000, 0x10FFF)
+ , choose (0x11000, 0x11FFF)
+ , choose (0x12000, 0x12FFF)
+ , choose (0x13000, 0x13FFF)
+ , choose (0x1D000, 0x1DFFF)
+ , choose (0x1F000, 0x1FFFF)
+ ]
+ plane2 = oneof [ choose (0x20000, 0x20FFF)
+ , choose (0x21000, 0x21FFF)
+ , choose (0x22000, 0x22FFF)
+ , choose (0x23000, 0x23FFF)
+ , choose (0x24000, 0x24FFF)
+ , choose (0x25000, 0x25FFF)
+ , choose (0x26000, 0x26FFF)
+ , choose (0x27000, 0x27FFF)
+ , choose (0x28000, 0x28FFF)
+ , choose (0x29000, 0x29FFF)
+ , choose (0x2A000, 0x2AFFF)
+ , choose (0x2B000, 0x2BFFF)
+ , choose (0x2F000, 0x2FFFF)
+ ]
+ plane14 = choose (0xE0000, 0xE0FFF)
+ planes = [ascii, plane0, plane1, plane2, plane14]
+
+ char = chr `fmap` excluding reserved (oneof planes)