From: michaelt Date: Mon, 23 Dec 2013 18:02:49 +0000 (-0500) Subject: variant using text internals in place of text streamDecodeUtf8 X-Git-Url: https://git.immae.eu/?a=commitdiff_plain;h=8c48280926efffc0ca52a5d9ca796d639d053379;hp=8853a440e37523bae8cb46827d0d2d356bad5c46;p=github%2Ffretlink%2Ftext-pipes.git variant using text internals in place of text streamDecodeUtf8 --- diff --git a/Pipes/Text.hs b/Pipes/Text.hs index a5859a3..6845dd3 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs @@ -81,10 +81,6 @@ module Pipes.Text ( filter, scan, encodeUtf8, -#if MIN_VERSION_text(0,11,4) - pipeDecodeUtf8, - pipeDecodeUtf8With, -#endif pack, unpack, toCaseFold, @@ -119,10 +115,8 @@ module Pipes.Text ( group, lines, words, -#if MIN_VERSION_text(0,11,4) decodeUtf8, decodeUtf8With, -#endif -- * Transformations intersperse, @@ -167,6 +161,7 @@ import qualified GHC.IO.Exception as G import Pipes import qualified Pipes.ByteString as PB import qualified Pipes.ByteString.Parse as PBP +import qualified Pipes.Text.Internal as PE import Pipes.Text.Parse ( nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) import Pipes.Core (respond, Server') @@ -214,43 +209,60 @@ fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) {-# INLINABLE fromLazy #-} -- | Stream text from 'stdin' -stdin :: MonadIO m => Producer' Text m () +stdin :: MonadIO m => Producer' Text m (Producer ByteString m ()) stdin = fromHandle IO.stdin {-# INLINABLE stdin #-} {-| Convert a 'IO.Handle' into a text stream using a text size determined by the good sense of the text library. - -} -fromHandle :: MonadIO m => IO.Handle -> Producer' Text m () -#if MIN_VERSION_text(0,11,4) -fromHandle h = go TE.streamDecodeUtf8 where +fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ()) +-- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)` +-- if only so that mistakes can be concentrated in one place. +-- This modifies something that was faster on an earlier iteration. +-- Note also that the `text` replacement system is being ignored; +-- with a replacement scheme one could have `Producer Text m ()` +-- the relation to the replacement business needs to be thought out. +-- The complicated type seems overmuch for the toy stdin above +fromHandle h = go PE.streamDecodeUtf8 B.empty where act = B.hGetSome h defaultChunkSize - go dec = do chunk <- liftIO act - case dec chunk of - TE.Some text _ dec' -> do yield text - unless (B.null chunk) (go dec') + go dec old = do chunk <- liftIO act + if B.null chunk + then if B.null old then return (return ()) + else return (yield old >> return ()) + else case dec chunk of + PE.Some text bs dec' -> + if T.null text then go dec' (B.append old bs) + else do yield text + go dec' B.empty + PE.Other text bs -> + if T.null text then return (do yield old + yield bs + PB.fromHandle h) + else do yield text + return (do yield bs + PB.fromHandle h) {-# INLINE fromHandle#-} -- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as -- the dedicated Text IO function 'hGetChunk' ; -- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut" -- runs the same as the conduit equivalent, only slightly slower -- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut" -#else -fromHandle h = go where - go = do txt <- liftIO (T.hGetChunk h) - unless (T.null txt) $ do yield txt - go -{-# INLINABLE fromHandle#-} -#endif +-- #else +-- fromHandle h = go where +-- go = do txt <- liftIO (T.hGetChunk h) +-- unless (T.null txt) $ do yield txt +-- go +-- {-# INLINABLE fromHandle#-} +-- #endif {-| Stream text from a file using Pipes.Safe >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout MAIN = PUTSTRLN "HELLO WORLD" -} -readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m () +readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ()) readFile file = Safe.withFile file IO.ReadMode fromHandle {-# INLINABLE readFile #-} @@ -610,74 +622,44 @@ count :: (Monad m, Num n) => Text -> Producer Text m () -> m n count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) {-# INLINABLE count #-} -#if MIN_VERSION_text(0,11,4) -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded -- into a Pipe of Text decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeUtf8 = go TE.streamDecodeUtf8 - where go dec p = do - x <- lift (next p) - case x of - Left r -> return (return r) - Right (chunk, p') -> do - let TE.Some text l dec' = dec chunk - if B.null l - then do - yield text - go dec' p' - else return $ do - yield l - p' +decodeUtf8 = decodeUtf8With Nothing {-# INLINEABLE decodeUtf8 #-} -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded -- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@ -- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\" -decodeUtf8With +decodeUtf8With :: Monad m - => TE.OnDecodeError + => Maybe TE.OnDecodeError -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) -decodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) - where go dec p = do - x <- lift (next p) - case x of - Left r -> return (return r) - Right (chunk, p') -> do - let TE.Some text l dec' = dec chunk - if B.null l - then do - yield text - go dec' p' - else return $ do - yield l - p' +decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where + go dec old p = do + x <- lift (next p) + case x of + Left r -> if B.null old then return (return r) + else return (do yield old + return r) + Right (chunk, p') -> + case dec chunk of + PE.Some text l dec' -> + if T.null text then go dec' (B.append old l) p' + else do yield text + go dec' B.empty p' + PE.Other text bs -> + if T.null text then return (do yield old + yield bs + p') + else do yield text + return (do yield bs + p') {-# INLINEABLE decodeUtf8With #-} --- | A simple pipe from 'ByteString' to 'Text'; a decoding error will arise --- with any chunk that contains a sequence of bytes that is unreadable. Otherwise --- only few bytes will only be moved from one chunk to the next before decoding. -pipeDecodeUtf8 :: Monad m => Pipe ByteString Text m r -pipeDecodeUtf8 = go TE.streamDecodeUtf8 - where go dec = do chunk <- await - case dec chunk of - TE.Some text l dec' -> do yield text - go dec' -{-# INLINEABLE pipeDecodeUtf8 #-} - --- | A simple pipe from 'ByteString' to 'Text' using a replacement function. -pipeDecodeUtf8With - :: Monad m - => TE.OnDecodeError - -> Pipe ByteString Text m r -pipeDecodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) - where go dec = do chunk <- await - case dec chunk of - TE.Some text l dec' -> do yield text - go dec' -{-# INLINEABLE pipeDecodeUtf8With #-} -#endif + -- | Splits a 'Producer' after the given number of characters splitAt diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs new file mode 100644 index 0000000..05d9887 --- /dev/null +++ b/Pipes/Text/Internal.hs @@ -0,0 +1,157 @@ +{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, + UnliftedFFITypes #-} +-- This module lifts material from Brian O'Sullivan's text package +-- especially Data.Text.Encoding in order to define a pipes-appropriate +-- streamDecodeUtf8 +module Pipes.Text.Internal + ( Decoding(..) + , streamDecodeUtf8With + , streamDecodeUtf8 + ) where + +import Control.Exception (evaluate, try) +#if __GLASGOW_HASKELL__ >= 702 +import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) +import Control.Monad.ST (ST, runST) +#else +import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST) +#endif +import Data.Bits ((.&.)) +import Data.ByteString as B +import Data.ByteString.Internal as B +import Data.Text () +import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) +import Data.Text.Internal (Text(..), safe, textP) +import Data.Word (Word8, Word32) +import Foreign.C.Types (CSize) +import Foreign.ForeignPtr (withForeignPtr) +import Foreign.Marshal.Utils (with) +import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) +import Foreign.Storable (Storable, peek, poke) +import GHC.Base hiding (ord) +import GHC.Word +import qualified Data.Text.Array as A +import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#) +import GHC.Word (Word8(..), Word16(..), Word32(..)) + +import Data.Text.Unsafe (unsafeDupablePerformIO) + +#include "pipes_text_cbits.h" + +-- | A stream oriented decoding result. +data Decoding = Some Text ByteString (ByteString -> Decoding) + | Other Text ByteString +instance Show Decoding where + showsPrec d (Some t bs _) = showParen (d > prec) $ + showString "Some " . showsPrec prec' t . + showChar ' ' . showsPrec prec' bs . + showString " _" + where prec = 10; prec' = prec + 1 + showsPrec d (Other t bs) = showParen (d > prec) $ + showString "Other " . showsPrec prec' t . + showChar ' ' . showsPrec prec' bs . + showString " _" + where prec = 10; prec' = prec + 1 + +newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) +newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) + +-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 +-- encoded text that is known to be valid. +-- +-- If the input contains any invalid UTF-8 data, an exception will be +-- thrown (either by this function or a continuation) that cannot be +-- caught in pure code. For more control over the handling of invalid +-- data, use 'streamDecodeUtf8With'. +streamDecodeUtf8 :: ByteString -> Decoding +streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode) + +-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 +-- encoded text. +streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding +streamDecodeUtf8With mErr = case mErr of + Nothing -> decodeWith False strictDecode + Just onErr -> decodeWith True onErr + where + -- We create a slightly larger than necessary buffer to accommodate a + -- potential surrogate pair started in the last buffer + decodeWith replace onErr = decodeChunk 0 0 + where + decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding + decodeChunk codepoint0 state0 bs@(PS fp off len) = + runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1) + where + decodeChunkToBuffer :: A.MArray s -> IO Decoding + decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> + with (0::CSize) $ \destOffPtr -> + with codepoint0 $ \codepointPtr -> + with state0 $ \statePtr -> + with nullPtr $ \curPtrPtr -> + let end = ptr `plusPtr` (off + len) + loop curPtr = do + poke curPtrPtr curPtr + curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr + curPtrPtr end codepointPtr statePtr + state <- peek statePtr + case state of + UTF8_REJECT -> + -- We encountered an encoding error + if replace + then do + x <- peek curPtr' + case onErr desc (Just x) of + Nothing -> loop $ curPtr' `plusPtr` 1 + Just c -> do + destOff <- peek destOffPtr + w <- unsafeSTToIO $ + unsafeWrite dest (fromIntegral destOff) (safe c) + poke destOffPtr (destOff + fromIntegral w) + poke statePtr 0 + loop $ curPtr' `plusPtr` 1 + else do + n <- peek destOffPtr + chunkText <- unsafeSTToIO $ do + arr <- A.unsafeFreeze dest + return $! textP arr 0 (fromIntegral n) + lastPtr <- peek curPtrPtr + let left = lastPtr `minusPtr` curPtr + return $ Other chunkText (B.drop left bs) + _ -> do + -- We encountered the end of the buffer while decoding + n <- peek destOffPtr + codepoint <- peek codepointPtr + chunkText <- unsafeSTToIO $ do + arr <- A.unsafeFreeze dest + return $! textP arr 0 (fromIntegral n) + lastPtr <- peek curPtrPtr + let left = lastPtr `minusPtr` curPtr + return $ Some chunkText (B.drop left bs) + (decodeChunk codepoint state) + in loop (ptr `plusPtr` off) + desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream" + +ord :: Char -> Int +ord (C# c#) = I# (ord# c#) +{-# INLINE ord #-} + + +unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int +unsafeWrite marr i c + | n < 0x10000 = do + A.unsafeWrite marr i (fromIntegral n) + return 1 + | otherwise = do + A.unsafeWrite marr i lo + A.unsafeWrite marr (i+1) hi + return 2 + where n = ord c + m = n - 0x10000 + lo = fromIntegral $ (m `shiftR` 10) + 0xD800 + hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 + shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) +{-# INLINE unsafeWrite #-} + +foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state + :: MutableByteArray# s -> Ptr CSize + -> Ptr (Ptr Word8) -> Ptr Word8 + -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file diff --git a/cbits/cbits.c b/cbits/cbits.c new file mode 100644 index 0000000..e0fdfd5 --- /dev/null +++ b/cbits/cbits.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2011 Bryan O'Sullivan . + * + * Portions copyright (c) 2008-2010 Björn Höhrmann . + * + * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. + */ + +#include +#include +#include +#include "pipes_text_cbits.h" + + + +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 12 + +static const uint8_t utf8d[] = { + /* + * The first part of the table maps bytes to character classes that + * to reduce the size of the transition table and create bitmasks. + */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + + /* + * The second part is a transition table that maps a combination of + * a state of the automaton and a character class to a state. + */ + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; + +static inline uint32_t +decode(uint32_t *state, uint32_t* codep, uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + return *state = utf8d[256 + *state + type]; +} + +/* + * A best-effort decoder. Runs until it hits either end of input or + * the start of an invalid byte sequence. + * + * At exit, we update *destoff with the next offset to write to, *src + * with the next source location past the last one successfully + * decoded, and return the next source location to read from. + * + * Moreover, we expose the internal decoder state (state0 and + * codepoint0), allowing one to restart the decoder after it + * terminates (say, due to a partial codepoint). + * + * In particular, there are a few possible outcomes, + * + * 1) We decoded the buffer entirely: + * In this case we return srcend + * state0 == UTF8_ACCEPT + * + * 2) We met an invalid encoding + * In this case we return the address of the first invalid byte + * state0 == UTF8_REJECT + * + * 3) We reached the end of the buffer while decoding a codepoint + * In this case we return a pointer to the first byte of the partial codepoint + * state0 != UTF8_ACCEPT, UTF8_REJECT + * + */ +const uint8_t * +_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, + const uint8_t **const src, + const uint8_t *const srcend, + uint32_t *codepoint0, uint32_t *state0) +{ + uint16_t *d = dest + *destoff; + const uint8_t *s = *src, *last = *src; + uint32_t state = *state0; + uint32_t codepoint = *codepoint0; + + while (s < srcend) { +#if defined(__i386__) || defined(__x86_64__) + /* + * This code will only work on a little-endian system that + * supports unaligned loads. + * + * It gives a substantial speed win on data that is purely or + * partly ASCII (e.g. HTML), at only a slight cost on purely + * non-ASCII text. + */ + + if (state == UTF8_ACCEPT) { + while (s < srcend - 4) { + codepoint = *((uint32_t *) s); + if ((codepoint & 0x80808080) != 0) + break; + s += 4; + + /* + * Tried 32-bit stores here, but the extra bit-twiddling + * slowed the code down. + */ + + *d++ = (uint16_t) (codepoint & 0xff); + *d++ = (uint16_t) ((codepoint >> 8) & 0xff); + *d++ = (uint16_t) ((codepoint >> 16) & 0xff); + *d++ = (uint16_t) ((codepoint >> 24) & 0xff); + } + last = s; + } +#endif + + if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { + if (state != UTF8_REJECT) + continue; + break; + } + + if (codepoint <= 0xffff) + *d++ = (uint16_t) codepoint; + else { + *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); + *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); + } + last = s; + } + + /* Invalid encoding, back up to the errant character */ + if (state == UTF8_REJECT) + s -= 1; + + *destoff = d - dest; + *codepoint0 = codepoint; + *state0 = state; + *src = last; + + return s; +} + diff --git a/include/pipes_text_cbits.h b/include/pipes_text_cbits.h new file mode 100644 index 0000000..b9ab670 --- /dev/null +++ b/include/pipes_text_cbits.h @@ -0,0 +1,11 @@ +/* + * Copyright (c) 2013 Bryan O'Sullivan . + */ + +#ifndef _pipes_text_cbits_h +#define _pipes_text_cbits_h + +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 12 + +#endif diff --git a/pipes-text.cabal b/pipes-text.cabal index e79f168..86fbab8 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal @@ -12,7 +12,9 @@ build-type: Simple cabal-version: >=1.10 library - exposed-modules: Pipes.Text, Pipes.Text.Parse + c-sources: cbits/cbits.c + include-dirs: include + exposed-modules: Pipes.Text, Pipes.Text.Parse, Pipes.Text.Internal -- other-modules: other-extensions: RankNTypes build-depends: base >= 4 && < 5 , diff --git a/test/Test.hs b/test/Test.hs new file mode 100644 index 0000000..1579f2b --- /dev/null +++ b/test/Test.hs @@ -0,0 +1,60 @@ +import Utils + +import Test.QuickCheck +import Test.QuickCheck.Monadic +import Test.Framework (Test, testGroup, defaultMain) +import Test.Framework.Providers.QuickCheck2 (testProperty) + +import Control.Exception (catch) +import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord) +import Data.Monoid (Monoid(..)) +import Data.String (fromString) +import Data.Text.Encoding.Error +import qualified Data.List as L + +import qualified Data.Bits as Bits (shiftL, shiftR) +import qualified Data.ByteString as B +import qualified Data.ByteString.Lazy as BL +import qualified Data.Text as T +import qualified Data.Text.Lazy as TL +import qualified Data.Text.Encoding as E +import qualified Pipes.Text.Internal as PE + +main :: IO () +main = defaultMain [tests] +-- >>> :main -a 10000 + +tests = testGroup "stream_decode" [ + + testProperty "t_utf8_incr_valid" t_utf8_incr_valid, + testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed] + +t_utf8_incr_valid = do + Positive n <- arbitrary + forAll genUnicode $ recode n `eq` id + where recode n = T.concat . feedChunksOf n PE.streamDecodeUtf8 . E.encodeUtf8 + feedChunksOf :: Int -> (B.ByteString -> PE.Decoding) -> B.ByteString + -> [T.Text] + feedChunksOf n f bs + | B.null bs = [] + | otherwise = let (a,b) = B.splitAt n bs + PE.Some t _ f' = f a + in case f a of + PE.Some t _ f' -> t : feedChunksOf n f' b + _ -> [] + +t_utf8_incr_mixed = do + Positive n <- arbitrary + txt <- genUnicode + forAll (vector 9) $ (roundtrip . chunk (mod n 7 + 1) . appendBytes txt) `eq` appendBytes txt + where + roundtrip :: [B.ByteString] -> B.ByteString + roundtrip bss = go (PE.streamDecodeUtf8With Nothing) B.empty B.empty bss where + go dec acc old [] = acc <> old + go dec acc old (bs:bss) = case dec bs of + PE.Some t new dec' -> if T.null t then go dec' (acc <> E.encodeUtf8 t) (old <> new) bss + else go dec' (acc <> E.encodeUtf8 t) new bss + PE.Other t bs' -> if T.null t then acc <> old <> bs <> B.concat bss + else acc <> E.encodeUtf8 t <> bs' <> B.concat bss + chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b + appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append diff --git a/test/Utils.hs b/test/Utils.hs new file mode 100644 index 0000000..75cd1db --- /dev/null +++ b/test/Utils.hs @@ -0,0 +1,109 @@ +{-#LANGUAGE ScopedTypeVariables#-} +module Utils where +import Control.Exception (SomeException, bracket, bracket_, evaluate, try) +import System.IO.Unsafe (unsafePerformIO) +import Debug.Trace (trace) +import Data.Bits ((.&.)) +import Data.Char (chr) +import Data.String (IsString, fromString) +import System.Random (Random (..), RandomGen) +import Test.QuickCheck hiding ((.&.)) +import Test.QuickCheck.Monadic (assert, monadicIO, run) +import qualified Data.ByteString as B +import Pipes.Text.Internal + + + + + +-- Ensure that two potentially bottom values (in the sense of crashing +-- for some inputs, not looping infinitely) either both crash, or both +-- give comparable results for some input. +(=^=) :: (Eq a, Show a) => a -> a -> Bool +i =^= j = unsafePerformIO $ do + x <- try (evaluate i) + y <- try (evaluate j) + case (x,y) of + (Left (_ :: SomeException), Left (_ :: SomeException)) + -> return True + (Right a, Right b) -> return (a == b) + e -> trace ("*** Divergence: " ++ show e) return False +infix 4 =^= +{-# NOINLINE (=^=) #-} + +-- Do two functions give the same answer? +eq :: (Eq a, Show a) => (t -> a) -> (t -> a) -> t -> Bool +eq a b s = a s =^= b s + +-- What about with the RHS packed? +-- eqP :: (Eq a, Show a, Stringy s) => +-- (String -> a) -> (s -> a) -> String -> Word8 -> Bool +-- eqP f g s w = eql "orig" (f s) (g t) && +-- eql "mini" (f s) (g mini) && +-- eql "head" (f sa) (g ta) && +-- eql "tail" (f sb) (g tb) +-- where t = packS s +-- mini = packSChunkSize 10 s +-- (sa,sb) = splitAt m s +-- (ta,tb) = splitAtS m t +-- l = length s +-- m | l == 0 = n +-- | otherwise = n `mod` l +-- n = fromIntegral w +-- eql d a b +-- | a =^= b = True +-- | otherwise = trace (d ++ ": " ++ show a ++ " /= " ++ show b) False + + +instance Arbitrary B.ByteString where + arbitrary = B.pack `fmap` arbitrary + +genUnicode :: IsString a => Gen a +genUnicode = fmap fromString string where + string = sized $ \n -> + do k <- choose (0,n) + sequence [ char | _ <- [1..k] ] + + excluding :: [a -> Bool] -> Gen a -> Gen a + excluding bad gen = loop + where + loop = do + x <- gen + if or (map ($ x) bad) + then loop + else return x + + reserved = [lowSurrogate, highSurrogate, noncharacter] + lowSurrogate c = c >= 0xDC00 && c <= 0xDFFF + highSurrogate c = c >= 0xD800 && c <= 0xDBFF + noncharacter c = masked == 0xFFFE || masked == 0xFFFF + where + masked = c .&. 0xFFFF + + ascii = choose (0,0x7F) + plane0 = choose (0xF0, 0xFFFF) + plane1 = oneof [ choose (0x10000, 0x10FFF) + , choose (0x11000, 0x11FFF) + , choose (0x12000, 0x12FFF) + , choose (0x13000, 0x13FFF) + , choose (0x1D000, 0x1DFFF) + , choose (0x1F000, 0x1FFFF) + ] + plane2 = oneof [ choose (0x20000, 0x20FFF) + , choose (0x21000, 0x21FFF) + , choose (0x22000, 0x22FFF) + , choose (0x23000, 0x23FFF) + , choose (0x24000, 0x24FFF) + , choose (0x25000, 0x25FFF) + , choose (0x26000, 0x26FFF) + , choose (0x27000, 0x27FFF) + , choose (0x28000, 0x28FFF) + , choose (0x29000, 0x29FFF) + , choose (0x2A000, 0x2AFFF) + , choose (0x2B000, 0x2BFFF) + , choose (0x2F000, 0x2FFFF) + ] + plane14 = choose (0xE0000, 0xE0FFF) + planes = [ascii, plane0, plane1, plane2, plane14] + + char = chr `fmap` excluding reserved (oneof planes)