aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authormichaelt <what_is_it_to_do_anything@yahoo.com>2014-01-14 22:17:25 -0500
committermichaelt <what_is_it_to_do_anything@yahoo.com>2014-01-14 22:17:25 -0500
commit7381c94f47c76833972565ee8d15d86216b214ce (patch)
tree38ddadda59a3808422fc432d37b886c456adcb1d
parentca6f90a05bee6471d6837d629ddaee9b0a75bd50 (diff)
parent3694350ac7b9c42fd64e0092a74cf0471a080058 (diff)
downloadtext-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.gz
text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.zst
text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.zip
merge home made decodeUtf8
-rw-r--r--Pipes/Text/Internal.hs163
-rw-r--r--bench/IO.hs20
-rw-r--r--cbits/cbits.c168
-rw-r--r--include/pipes_text_cbits.h11
-rw-r--r--pipes-text.cabal11
-rw-r--r--test/Test.hs101
-rw-r--r--test/Utils.hs109
7 files changed, 580 insertions, 3 deletions
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
new file mode 100644
index 0000000..7e5b044
--- /dev/null
+++ b/Pipes/Text/Internal.hs
@@ -0,0 +1,163 @@
1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
2 UnliftedFFITypes #-}
3-- This module lifts assorted materials from Brian O'Sullivan's text package
4-- especially Data.Text.Encoding in order to define a pipes-appropriate
5-- streamDecodeUtf8
6module Pipes.Text.Internal
7 ( Decoding(..)
8 , streamDecodeUtf8
9 , decodeSomeUtf8
10 ) where
11import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
12import Control.Monad.ST (ST, runST)
13import Data.Bits ((.&.))
14import Data.ByteString as B
15import Data.ByteString.Internal as B
16import qualified Data.Text as T (null)
17import Data.Text.Encoding.Error ()
18import Data.Text.Internal (Text, textP)
19import Foreign.C.Types (CSize)
20import Foreign.ForeignPtr (withForeignPtr)
21import Foreign.Marshal.Utils (with)
22import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
23import Foreign.Storable (Storable, peek, poke)
24import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
25import GHC.Word (Word8, Word32)
26import qualified Data.Text.Array as A
27
28#include "pipes_text_cbits.h"
29
30-- | A stream oriented decoding result.
31data Decoding = Some Text ByteString (ByteString -> Decoding)
32 | Other Text ByteString
33instance Show Decoding where
34 showsPrec d (Some t bs _) = showParen (d > prec) $
35 showString "Some " . showsPrec prec' t .
36 showChar ' ' . showsPrec prec' bs .
37 showString " _"
38 where prec = 10; prec' = prec + 1
39 showsPrec d (Other t bs) = showParen (d > prec) $
40 showString "Other " . showsPrec prec' t .
41 showChar ' ' . showsPrec prec' bs .
42 showString " _"
43 where prec = 10; prec' = prec + 1
44
45newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
46newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
47
48streamDecodeUtf8 :: ByteString -> Decoding
49streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
50 where
51 decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
52 decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
53 runST $ do marray <- A.new (len+1)
54 unsafeIOToST (decodeChunkToBuffer marray)
55 where
56 decodeChunkToBuffer :: A.MArray s -> IO Decoding
57 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
58 with (0::CSize) $ \destOffPtr ->
59 with codepoint0 $ \codepointPtr ->
60 with state0 $ \statePtr ->
61 with nullPtr $ \curPtrPtr ->
62 do let end = ptr `plusPtr` (off + len)
63 curPtr = ptr `plusPtr` off
64 poke curPtrPtr curPtr
65 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
66 state <- peek statePtr
67 lastPtr <- peek curPtrPtr
68 codepoint <- peek codepointPtr
69 n <- peek destOffPtr
70 chunkText <- mkText dest n
71 let left = lastPtr `minusPtr` curPtr
72 remaining = B.drop left bs
73 accum = if T.null chunkText then B.append old remaining else remaining
74 return $! case state of
75 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
76 _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
77 {-# INLINE decodeChunkToBuffer #-}
78 {-# INLINE decodeChunkUtf8 #-}
79{-# INLINE streamDecodeUtf8 #-}
80
81decodeSomeUtf8 :: ByteString -> (Text, ByteString)
82decodeSomeUtf8 bs@(PS fp off len) = runST $ do
83 dest <- A.new (len+1)
84 unsafeIOToST $
85 withForeignPtr fp $ \ptr ->
86 with (0::CSize) $ \destOffPtr ->
87 with (0::CodePoint) $ \codepointPtr ->
88 with (0::DecoderState) $ \statePtr ->
89 with nullPtr $ \curPtrPtr ->
90 do let end = ptr `plusPtr` (off + len)
91 curPtr = ptr `plusPtr` off
92 poke curPtrPtr curPtr
93 c_decode_utf8_with_state (A.maBA dest) destOffPtr
94 curPtrPtr end codepointPtr statePtr
95 state <- peek statePtr
96 lastPtr <- peek curPtrPtr
97 codepoint <- peek codepointPtr
98 n <- peek destOffPtr
99 chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
100 return $! textP arr 0 (fromIntegral n)
101 let left = lastPtr `minusPtr` curPtr
102 remaining = B.drop left bs
103 return $! (chunkText, remaining)
104{-# INLINE decodeSomeUtf8 #-}
105
106-- decodeSomeUtf8 :: ByteString -> (Text, ByteString)
107-- decodeSomeUtf8 bs@(PS fp off len) =
108-- runST $ do marray <- A.new (len+1)
109-- unsafeIOToST (decodeChunkToBuffer marray)
110--
111-- where
112-- decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString)
113-- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
114-- with (0::CSize) $ \destOffPtr ->
115-- with (0::CodePoint) $ \codepointPtr ->
116-- with (0::DecoderState) $ \statePtr ->
117-- with nullPtr $ \curPtrPtr ->
118-- do let end = ptr `plusPtr` (off + len)
119-- curPtr = ptr `plusPtr` off
120-- poke curPtrPtr curPtr
121-- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
122-- state <- peek statePtr
123-- lastPtr <- peek curPtrPtr
124-- codepoint <- peek codepointPtr
125-- n <- peek destOffPtr
126-- chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
127-- return $! textP arr 0 (fromIntegral n)
128-- let left = lastPtr `minusPtr` curPtr
129-- remaining = B.drop left bs
130-- return $! (chunkText, remaining)
131-- {-# INLINE decodeChunkToBuffer #-}
132-- {-# INLINE decodeSomeUtf8 #-}
133
134
135
136mkText :: A.MArray s -> CSize -> IO Text
137mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
138 return $! textP arr 0 (fromIntegral n)
139{-# INLINE mkText #-}
140
141ord :: Char -> Int
142ord (C# c#) = I# (ord# c#)
143{-# INLINE ord #-}
144
145unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
146unsafeWrite marr i c
147 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
148 return 1
149 | otherwise = do A.unsafeWrite marr i lo
150 A.unsafeWrite marr (i+1) hi
151 return 2
152 where n = ord c
153 m = n - 0x10000
154 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
155 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
156 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
157 {-# INLINE shiftR #-}
158{-# INLINE unsafeWrite #-}
159
160foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
161 :: MutableByteArray# s -> Ptr CSize
162 -> Ptr (Ptr Word8) -> Ptr Word8
163 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file
diff --git a/bench/IO.hs b/bench/IO.hs
new file mode 100644
index 0000000..b3a52f6
--- /dev/null
+++ b/bench/IO.hs
@@ -0,0 +1,20 @@
1import qualified Data.Text.IO as T
2import qualified Data.Text as T
3import qualified Data.Text.Lazy.IO as TL
4import qualified Data.Text.Lazy as TL
5
6import Pipes
7import qualified Pipes.Text as TP
8import qualified Pipes.ByteString as BP
9import Pipes.Safe
10
11main = textaction
12big = "../../examples/txt/words2.txt"
13
14textaction = T.readFile big >>= T.putStrLn
15pipeaction = runEffect $ for ((TP.readFile big) >> return ()) (lift . T.putStrLn)
16
17
18
19
20
diff --git a/cbits/cbits.c b/cbits/cbits.c
new file mode 100644
index 0000000..c11645b
--- /dev/null
+++ b/cbits/cbits.c
@@ -0,0 +1,168 @@
1/*
2 * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
3 *
4 * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
5 *
6 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
7 */
8
9#include <string.h>
10#include <stdint.h>
11#include <stdio.h>
12#include "pipes_text_cbits.h"
13
14
15
16#define UTF8_ACCEPT 0
17#define UTF8_REJECT 12
18
19static const uint8_t utf8d[] = {
20 /*
21 * The first part of the table maps bytes to character classes that
22 * to reduce the size of the transition table and create bitmasks.
23 */
24 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
25 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
26 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
27 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
28 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
29 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
30 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
31 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
32
33 /*
34 * The second part is a transition table that maps a combination of
35 * a state of the automaton and a character class to a state.
36 */
37 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
38 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
39 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
40 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
41 12,36,12,12,12,12,12,12,12,12,12,12,
42};
43
44static inline uint32_t
45decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
46 uint32_t type = utf8d[byte];
47
48 *codep = (*state != UTF8_ACCEPT) ?
49 (byte & 0x3fu) | (*codep << 6) :
50 (0xff >> type) & (byte);
51
52 return *state = utf8d[256 + *state + type];
53}
54
55/*
56 * A best-effort decoder. Runs until it hits either end of input or
57 * the start of an invalid byte sequence.
58 *
59 * At exit, we update *destoff with the next offset to write to, *src
60 * with the next source location past the last one successfully
61 * decoded, and return the next source location to read from.
62 *
63 * Moreover, we expose the internal decoder state (state0 and
64 * codepoint0), allowing one to restart the decoder after it
65 * terminates (say, due to a partial codepoint).
66 *
67 * In particular, there are a few possible outcomes,
68 *
69 * 1) We decoded the buffer entirely:
70 * In this case we return srcend
71 * state0 == UTF8_ACCEPT
72 *
73 * 2) We met an invalid encoding
74 * In this case we return the address of the first invalid byte
75 * state0 == UTF8_REJECT
76 *
77 * 3) We reached the end of the buffer while decoding a codepoint
78 * In this case we return a pointer to the first byte of the partial codepoint
79 * state0 != UTF8_ACCEPT, UTF8_REJECT
80 *
81 */
82
83 #if defined(__GNUC__) || defined(__clang__)
84 static inline uint8_t const *
85 _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
86 const uint8_t const **src, const uint8_t const *srcend,
87 uint32_t *codepoint0, uint32_t *state0)
88 __attribute((always_inline));
89 #endif
90
91static inline uint8_t const *
92_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
93 const uint8_t const **src, const uint8_t const *srcend,
94 uint32_t *codepoint0, uint32_t *state0)
95{
96 uint16_t *d = dest + *destoff;
97 const uint8_t *s = *src, *last = *src;
98 uint32_t state = *state0;
99 uint32_t codepoint = *codepoint0;
100
101 while (s < srcend) {
102#if defined(__i386__) || defined(__x86_64__)
103 /*
104 * This code will only work on a little-endian system that
105 * supports unaligned loads.
106 *
107 * It gives a substantial speed win on data that is purely or
108 * partly ASCII (e.g. HTML), at only a slight cost on purely
109 * non-ASCII text.
110 */
111
112 if (state == UTF8_ACCEPT) {
113 while (s < srcend - 4) {
114 codepoint = *((uint32_t *) s);
115 if ((codepoint & 0x80808080) != 0)
116 break;
117 s += 4;
118
119 /*
120 * Tried 32-bit stores here, but the extra bit-twiddling
121 * slowed the code down.
122 */
123
124 *d++ = (uint16_t) (codepoint & 0xff);
125 *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
126 *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
127 *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
128 }
129 last = s;
130 }
131#endif
132
133 if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
134 if (state != UTF8_REJECT)
135 continue;
136 break;
137 }
138
139 if (codepoint <= 0xffff)
140 *d++ = (uint16_t) codepoint;
141 else {
142 *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
143 *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
144 }
145 last = s;
146 }
147
148 *destoff = d - dest;
149 *codepoint0 = codepoint;
150 *state0 = state;
151 *src = last;
152
153 return s;
154}
155
156uint8_t const *
157_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
158 const uint8_t const **src,
159 const uint8_t const *srcend,
160 uint32_t *codepoint0, uint32_t *state0)
161{
162 uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
163 codepoint0, state0);
164 if (*state0 == UTF8_REJECT)
165 ret -=1;
166 return ret;
167}
168
diff --git a/include/pipes_text_cbits.h b/include/pipes_text_cbits.h
new file mode 100644
index 0000000..b9ab670
--- /dev/null
+++ b/include/pipes_text_cbits.h
@@ -0,0 +1,11 @@
1/*
2 * Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>.
3 */
4
5#ifndef _pipes_text_cbits_h
6#define _pipes_text_cbits_h
7
8#define UTF8_ACCEPT 0
9#define UTF8_REJECT 12
10
11#endif
diff --git a/pipes-text.cabal b/pipes-text.cabal
index e79f168..b4388be 100644
--- a/pipes-text.cabal
+++ b/pipes-text.cabal
@@ -12,7 +12,9 @@ build-type: Simple
12cabal-version: >=1.10 12cabal-version: >=1.10
13 13
14library 14library
15 exposed-modules: Pipes.Text, Pipes.Text.Parse 15 c-sources: cbits/cbits.c
16 include-dirs: include
17 exposed-modules: Pipes.Text, Pipes.Text.Parse, Pipes.Text.Internal
16 -- other-modules: 18 -- other-modules:
17 other-extensions: RankNTypes 19 other-extensions: RankNTypes
18 build-depends: base >= 4 && < 5 , 20 build-depends: base >= 4 && < 5 ,
@@ -23,6 +25,9 @@ library
23 pipes-bytestring >= 1.0 && < 1.2, 25 pipes-bytestring >= 1.0 && < 1.2,
24 transformers >= 0.3 && < 0.4, 26 transformers >= 0.3 && < 0.4,
25 text >=0.11 && < 0.12, 27 text >=0.11 && < 0.12,
26 bytestring >=0.10 && < 0.11 28 bytestring >=0.10 && < 0.11,
29 vector,
30 void
27 -- hs-source-dirs: 31 -- hs-source-dirs:
28 default-language: Haskell2010 \ No newline at end of file 32 default-language: Haskell2010
33 ghc-options: -O2
diff --git a/test/Test.hs b/test/Test.hs
new file mode 100644
index 0000000..373bafb
--- /dev/null
+++ b/test/Test.hs
@@ -0,0 +1,101 @@
1import Utils
2
3import Test.QuickCheck
4import Test.QuickCheck.Monadic
5import Test.Framework (Test, testGroup, defaultMain)
6import Test.Framework.Providers.QuickCheck2 (testProperty)
7
8import Control.Exception (catch)
9import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord)
10import Data.Monoid (Monoid(..))
11import Control.Monad
12import Data.String (fromString)
13import Data.Text.Encoding.Error
14import qualified Data.List as L
15
16import qualified Data.Bits as Bits (shiftL, shiftR)
17import qualified Data.ByteString as B
18import qualified Data.ByteString.Lazy as BL
19import qualified Data.Text as T
20import qualified Data.Text.Lazy as TL
21import qualified Data.Text.Encoding as E
22import qualified Pipes.Text.Internal as PE
23import qualified Pipes.Text as TP
24import qualified Pipes.ByteString as BP
25import qualified Pipes as P
26
27main :: IO ()
28main = defaultMain [tests]
29-- >>> :main -a 10000
30tests = testGroup "stream_decode" [
31 -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
32 testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed ,
33 testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe,
34 testProperty "t_utf8_dec_some" t_utf8_dec_some]
35
36t_utf8_incr_valid = do
37 Positive n <- arbitrary
38 forAll genUnicode $ recode n `eq` id
39 where recode n = T.concat . feedChunksOf n PE.streamDecodeUtf8 . E.encodeUtf8
40 feedChunksOf :: Int -> (B.ByteString -> PE.Decoding) -> B.ByteString
41 -> [T.Text]
42 feedChunksOf n f bs
43 | B.null bs = []
44 | otherwise = let (a,b) = B.splitAt n bs
45 PE.Some t _ f' = f a
46 in case f a of
47 PE.Some t _ f' -> t : feedChunksOf n f' b
48 _ -> []
49
50t_utf8_incr_mixed = do
51 Positive n <- arbitrary
52 txt <- genUnicode
53 let chunkSize = mod n 7 + 1
54 forAll (vector 9) $
55 (roundtrip . chunk chunkSize . appendBytes txt) `eq` (appendBytes txt)
56 where
57 roundtrip :: [B.ByteString] -> B.ByteString
58 roundtrip bss = go PE.streamDecodeUtf8 B.empty bss where
59 go dec acc [] = acc
60 go dec acc [bs] = case dec bs of
61 PE.Some t l dec' -> acc <> E.encodeUtf8 t <> l
62 PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs'
63 go dec acc (bs:bss) = case dec bs of
64 PE.Some t l dec' -> go dec' (acc <> E.encodeUtf8 t) bss
65 PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs' <> B.concat bss
66 chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
67 appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
68
69t_utf8_incr_pipe = do
70 Positive m <- arbitrary
71 Positive n <- arbitrary
72 txt <- genUnicode
73 let chunkSize = mod n 7 + 1
74 bytesLength = mod 10 m
75 forAll (vector bytesLength) $
76 (BL.toStrict . BP.toLazy . roundtrip . P.each . chunk chunkSize . appendBytes txt)
77 `eq`
78 appendBytes txt
79 where
80 roundtrip :: Monad m => P.Producer B.ByteString m r -> P.Producer B.ByteString m r
81 roundtrip p = join (TP.decodeUtf8 p P.>-> TP.encodeUtf8)
82 chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
83 appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
84
85--
86t_utf8_dec_some = do
87 Positive m <- arbitrary
88 txt <- genUnicode
89 let bytesLength = mod 10 m :: Int
90 forAll (vector bytesLength) $
91 (roundtrip . appendBytes txt)
92 `eq`
93 appendBytes txt
94 where
95 roundtrip bs = case PE.decodeSomeUtf8 bs of
96 (txt,bys) -> E.encodeUtf8 txt <> bys
97 appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
98
99
100
101
diff --git a/test/Utils.hs b/test/Utils.hs
new file mode 100644
index 0000000..75cd1db
--- /dev/null
+++ b/test/Utils.hs
@@ -0,0 +1,109 @@
1{-#LANGUAGE ScopedTypeVariables#-}
2module Utils where
3import Control.Exception (SomeException, bracket, bracket_, evaluate, try)
4import System.IO.Unsafe (unsafePerformIO)
5import Debug.Trace (trace)
6import Data.Bits ((.&.))
7import Data.Char (chr)
8import Data.String (IsString, fromString)
9import System.Random (Random (..), RandomGen)
10import Test.QuickCheck hiding ((.&.))
11import Test.QuickCheck.Monadic (assert, monadicIO, run)
12import qualified Data.ByteString as B
13import Pipes.Text.Internal
14
15
16
17
18
19-- Ensure that two potentially bottom values (in the sense of crashing
20-- for some inputs, not looping infinitely) either both crash, or both
21-- give comparable results for some input.
22(=^=) :: (Eq a, Show a) => a -> a -> Bool
23i =^= j = unsafePerformIO $ do
24 x <- try (evaluate i)
25 y <- try (evaluate j)
26 case (x,y) of
27 (Left (_ :: SomeException), Left (_ :: SomeException))
28 -> return True
29 (Right a, Right b) -> return (a == b)
30 e -> trace ("*** Divergence: " ++ show e) return False
31infix 4 =^=
32{-# NOINLINE (=^=) #-}
33
34-- Do two functions give the same answer?
35eq :: (Eq a, Show a) => (t -> a) -> (t -> a) -> t -> Bool
36eq a b s = a s =^= b s
37
38-- What about with the RHS packed?
39-- eqP :: (Eq a, Show a, Stringy s) =>
40-- (String -> a) -> (s -> a) -> String -> Word8 -> Bool
41-- eqP f g s w = eql "orig" (f s) (g t) &&
42-- eql "mini" (f s) (g mini) &&
43-- eql "head" (f sa) (g ta) &&
44-- eql "tail" (f sb) (g tb)
45-- where t = packS s
46-- mini = packSChunkSize 10 s
47-- (sa,sb) = splitAt m s
48-- (ta,tb) = splitAtS m t
49-- l = length s
50-- m | l == 0 = n
51-- | otherwise = n `mod` l
52-- n = fromIntegral w
53-- eql d a b
54-- | a =^= b = True
55-- | otherwise = trace (d ++ ": " ++ show a ++ " /= " ++ show b) False
56
57
58instance Arbitrary B.ByteString where
59 arbitrary = B.pack `fmap` arbitrary
60
61genUnicode :: IsString a => Gen a
62genUnicode = fmap fromString string where
63 string = sized $ \n ->
64 do k <- choose (0,n)
65 sequence [ char | _ <- [1..k] ]
66
67 excluding :: [a -> Bool] -> Gen a -> Gen a
68 excluding bad gen = loop
69 where
70 loop = do
71 x <- gen
72 if or (map ($ x) bad)
73 then loop
74 else return x
75
76 reserved = [lowSurrogate, highSurrogate, noncharacter]
77 lowSurrogate c = c >= 0xDC00 && c <= 0xDFFF
78 highSurrogate c = c >= 0xD800 && c <= 0xDBFF
79 noncharacter c = masked == 0xFFFE || masked == 0xFFFF
80 where
81 masked = c .&. 0xFFFF
82
83 ascii = choose (0,0x7F)
84 plane0 = choose (0xF0, 0xFFFF)
85 plane1 = oneof [ choose (0x10000, 0x10FFF)
86 , choose (0x11000, 0x11FFF)
87 , choose (0x12000, 0x12FFF)
88 , choose (0x13000, 0x13FFF)
89 , choose (0x1D000, 0x1DFFF)
90 , choose (0x1F000, 0x1FFFF)
91 ]
92 plane2 = oneof [ choose (0x20000, 0x20FFF)
93 , choose (0x21000, 0x21FFF)
94 , choose (0x22000, 0x22FFF)
95 , choose (0x23000, 0x23FFF)
96 , choose (0x24000, 0x24FFF)
97 , choose (0x25000, 0x25FFF)
98 , choose (0x26000, 0x26FFF)
99 , choose (0x27000, 0x27FFF)
100 , choose (0x28000, 0x28FFF)
101 , choose (0x29000, 0x29FFF)
102 , choose (0x2A000, 0x2AFFF)
103 , choose (0x2B000, 0x2BFFF)
104 , choose (0x2F000, 0x2FFFF)
105 ]
106 plane14 = choose (0xE0000, 0xE0FFF)
107 planes = [ascii, plane0, plane1, plane2, plane14]
108
109 char = chr `fmap` excluding reserved (oneof planes)