diff options
author | michaelt <what_is_it_to_do_anything@yahoo.com> | 2014-01-14 22:17:25 -0500 |
---|---|---|
committer | michaelt <what_is_it_to_do_anything@yahoo.com> | 2014-01-14 22:17:25 -0500 |
commit | 7381c94f47c76833972565ee8d15d86216b214ce (patch) | |
tree | 38ddadda59a3808422fc432d37b886c456adcb1d | |
parent | ca6f90a05bee6471d6837d629ddaee9b0a75bd50 (diff) | |
parent | 3694350ac7b9c42fd64e0092a74cf0471a080058 (diff) | |
download | text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.gz text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.tar.zst text-pipes-7381c94f47c76833972565ee8d15d86216b214ce.zip |
merge home made decodeUtf8
-rw-r--r-- | Pipes/Text/Internal.hs | 163 | ||||
-rw-r--r-- | bench/IO.hs | 20 | ||||
-rw-r--r-- | cbits/cbits.c | 168 | ||||
-rw-r--r-- | include/pipes_text_cbits.h | 11 | ||||
-rw-r--r-- | pipes-text.cabal | 11 | ||||
-rw-r--r-- | test/Test.hs | 101 | ||||
-rw-r--r-- | test/Utils.hs | 109 |
7 files changed, 580 insertions, 3 deletions
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs new file mode 100644 index 0000000..7e5b044 --- /dev/null +++ b/Pipes/Text/Internal.hs | |||
@@ -0,0 +1,163 @@ | |||
1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, | ||
2 | UnliftedFFITypes #-} | ||
3 | -- This module lifts assorted materials from Brian O'Sullivan's text package | ||
4 | -- especially Data.Text.Encoding in order to define a pipes-appropriate | ||
5 | -- streamDecodeUtf8 | ||
6 | module Pipes.Text.Internal | ||
7 | ( Decoding(..) | ||
8 | , streamDecodeUtf8 | ||
9 | , decodeSomeUtf8 | ||
10 | ) where | ||
11 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | ||
12 | import Control.Monad.ST (ST, runST) | ||
13 | import Data.Bits ((.&.)) | ||
14 | import Data.ByteString as B | ||
15 | import Data.ByteString.Internal as B | ||
16 | import qualified Data.Text as T (null) | ||
17 | import Data.Text.Encoding.Error () | ||
18 | import Data.Text.Internal (Text, textP) | ||
19 | import Foreign.C.Types (CSize) | ||
20 | import Foreign.ForeignPtr (withForeignPtr) | ||
21 | import Foreign.Marshal.Utils (with) | ||
22 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | ||
23 | import Foreign.Storable (Storable, peek, poke) | ||
24 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) | ||
25 | import GHC.Word (Word8, Word32) | ||
26 | import qualified Data.Text.Array as A | ||
27 | |||
28 | #include "pipes_text_cbits.h" | ||
29 | |||
30 | -- | A stream oriented decoding result. | ||
31 | data Decoding = Some Text ByteString (ByteString -> Decoding) | ||
32 | | Other Text ByteString | ||
33 | instance Show Decoding where | ||
34 | showsPrec d (Some t bs _) = showParen (d > prec) $ | ||
35 | showString "Some " . showsPrec prec' t . | ||
36 | showChar ' ' . showsPrec prec' bs . | ||
37 | showString " _" | ||
38 | where prec = 10; prec' = prec + 1 | ||
39 | showsPrec d (Other t bs) = showParen (d > prec) $ | ||
40 | showString "Other " . showsPrec prec' t . | ||
41 | showChar ' ' . showsPrec prec' bs . | ||
42 | showString " _" | ||
43 | where prec = 10; prec' = prec + 1 | ||
44 | |||
45 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | ||
46 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | ||
47 | |||
48 | streamDecodeUtf8 :: ByteString -> Decoding | ||
49 | streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 | ||
50 | where | ||
51 | decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding | ||
52 | decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = | ||
53 | runST $ do marray <- A.new (len+1) | ||
54 | unsafeIOToST (decodeChunkToBuffer marray) | ||
55 | where | ||
56 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | ||
57 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
58 | with (0::CSize) $ \destOffPtr -> | ||
59 | with codepoint0 $ \codepointPtr -> | ||
60 | with state0 $ \statePtr -> | ||
61 | with nullPtr $ \curPtrPtr -> | ||
62 | do let end = ptr `plusPtr` (off + len) | ||
63 | curPtr = ptr `plusPtr` off | ||
64 | poke curPtrPtr curPtr | ||
65 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | ||
66 | state <- peek statePtr | ||
67 | lastPtr <- peek curPtrPtr | ||
68 | codepoint <- peek codepointPtr | ||
69 | n <- peek destOffPtr | ||
70 | chunkText <- mkText dest n | ||
71 | let left = lastPtr `minusPtr` curPtr | ||
72 | remaining = B.drop left bs | ||
73 | accum = if T.null chunkText then B.append old remaining else remaining | ||
74 | return $! case state of | ||
75 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error | ||
76 | _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) | ||
77 | {-# INLINE decodeChunkToBuffer #-} | ||
78 | {-# INLINE decodeChunkUtf8 #-} | ||
79 | {-# INLINE streamDecodeUtf8 #-} | ||
80 | |||
81 | decodeSomeUtf8 :: ByteString -> (Text, ByteString) | ||
82 | decodeSomeUtf8 bs@(PS fp off len) = runST $ do | ||
83 | dest <- A.new (len+1) | ||
84 | unsafeIOToST $ | ||
85 | withForeignPtr fp $ \ptr -> | ||
86 | with (0::CSize) $ \destOffPtr -> | ||
87 | with (0::CodePoint) $ \codepointPtr -> | ||
88 | with (0::DecoderState) $ \statePtr -> | ||
89 | with nullPtr $ \curPtrPtr -> | ||
90 | do let end = ptr `plusPtr` (off + len) | ||
91 | curPtr = ptr `plusPtr` off | ||
92 | poke curPtrPtr curPtr | ||
93 | c_decode_utf8_with_state (A.maBA dest) destOffPtr | ||
94 | curPtrPtr end codepointPtr statePtr | ||
95 | state <- peek statePtr | ||
96 | lastPtr <- peek curPtrPtr | ||
97 | codepoint <- peek codepointPtr | ||
98 | n <- peek destOffPtr | ||
99 | chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
100 | return $! textP arr 0 (fromIntegral n) | ||
101 | let left = lastPtr `minusPtr` curPtr | ||
102 | remaining = B.drop left bs | ||
103 | return $! (chunkText, remaining) | ||
104 | {-# INLINE decodeSomeUtf8 #-} | ||
105 | |||
106 | -- decodeSomeUtf8 :: ByteString -> (Text, ByteString) | ||
107 | -- decodeSomeUtf8 bs@(PS fp off len) = | ||
108 | -- runST $ do marray <- A.new (len+1) | ||
109 | -- unsafeIOToST (decodeChunkToBuffer marray) | ||
110 | -- | ||
111 | -- where | ||
112 | -- decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString) | ||
113 | -- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
114 | -- with (0::CSize) $ \destOffPtr -> | ||
115 | -- with (0::CodePoint) $ \codepointPtr -> | ||
116 | -- with (0::DecoderState) $ \statePtr -> | ||
117 | -- with nullPtr $ \curPtrPtr -> | ||
118 | -- do let end = ptr `plusPtr` (off + len) | ||
119 | -- curPtr = ptr `plusPtr` off | ||
120 | -- poke curPtrPtr curPtr | ||
121 | -- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | ||
122 | -- state <- peek statePtr | ||
123 | -- lastPtr <- peek curPtrPtr | ||
124 | -- codepoint <- peek codepointPtr | ||
125 | -- n <- peek destOffPtr | ||
126 | -- chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
127 | -- return $! textP arr 0 (fromIntegral n) | ||
128 | -- let left = lastPtr `minusPtr` curPtr | ||
129 | -- remaining = B.drop left bs | ||
130 | -- return $! (chunkText, remaining) | ||
131 | -- {-# INLINE decodeChunkToBuffer #-} | ||
132 | -- {-# INLINE decodeSomeUtf8 #-} | ||
133 | |||
134 | |||
135 | |||
136 | mkText :: A.MArray s -> CSize -> IO Text | ||
137 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
138 | return $! textP arr 0 (fromIntegral n) | ||
139 | {-# INLINE mkText #-} | ||
140 | |||
141 | ord :: Char -> Int | ||
142 | ord (C# c#) = I# (ord# c#) | ||
143 | {-# INLINE ord #-} | ||
144 | |||
145 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int | ||
146 | unsafeWrite marr i c | ||
147 | | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) | ||
148 | return 1 | ||
149 | | otherwise = do A.unsafeWrite marr i lo | ||
150 | A.unsafeWrite marr (i+1) hi | ||
151 | return 2 | ||
152 | where n = ord c | ||
153 | m = n - 0x10000 | ||
154 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | ||
155 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | ||
156 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | ||
157 | {-# INLINE shiftR #-} | ||
158 | {-# INLINE unsafeWrite #-} | ||
159 | |||
160 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | ||
161 | :: MutableByteArray# s -> Ptr CSize | ||
162 | -> Ptr (Ptr Word8) -> Ptr Word8 | ||
163 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file | ||
diff --git a/bench/IO.hs b/bench/IO.hs new file mode 100644 index 0000000..b3a52f6 --- /dev/null +++ b/bench/IO.hs | |||
@@ -0,0 +1,20 @@ | |||
1 | import qualified Data.Text.IO as T | ||
2 | import qualified Data.Text as T | ||
3 | import qualified Data.Text.Lazy.IO as TL | ||
4 | import qualified Data.Text.Lazy as TL | ||
5 | |||
6 | import Pipes | ||
7 | import qualified Pipes.Text as TP | ||
8 | import qualified Pipes.ByteString as BP | ||
9 | import Pipes.Safe | ||
10 | |||
11 | main = textaction | ||
12 | big = "../../examples/txt/words2.txt" | ||
13 | |||
14 | textaction = T.readFile big >>= T.putStrLn | ||
15 | pipeaction = runEffect $ for ((TP.readFile big) >> return ()) (lift . T.putStrLn) | ||
16 | |||
17 | |||
18 | |||
19 | |||
20 | |||
diff --git a/cbits/cbits.c b/cbits/cbits.c new file mode 100644 index 0000000..c11645b --- /dev/null +++ b/cbits/cbits.c | |||
@@ -0,0 +1,168 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>. | ||
3 | * | ||
4 | * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>. | ||
5 | * | ||
6 | * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. | ||
7 | */ | ||
8 | |||
9 | #include <string.h> | ||
10 | #include <stdint.h> | ||
11 | #include <stdio.h> | ||
12 | #include "pipes_text_cbits.h" | ||
13 | |||
14 | |||
15 | |||
16 | #define UTF8_ACCEPT 0 | ||
17 | #define UTF8_REJECT 12 | ||
18 | |||
19 | static const uint8_t utf8d[] = { | ||
20 | /* | ||
21 | * The first part of the table maps bytes to character classes that | ||
22 | * to reduce the size of the transition table and create bitmasks. | ||
23 | */ | ||
24 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
25 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
26 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
27 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
28 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, | ||
29 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, | ||
30 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | ||
31 | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, | ||
32 | |||
33 | /* | ||
34 | * The second part is a transition table that maps a combination of | ||
35 | * a state of the automaton and a character class to a state. | ||
36 | */ | ||
37 | 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, | ||
38 | 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, | ||
39 | 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, | ||
40 | 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, | ||
41 | 12,36,12,12,12,12,12,12,12,12,12,12, | ||
42 | }; | ||
43 | |||
44 | static inline uint32_t | ||
45 | decode(uint32_t *state, uint32_t* codep, uint32_t byte) { | ||
46 | uint32_t type = utf8d[byte]; | ||
47 | |||
48 | *codep = (*state != UTF8_ACCEPT) ? | ||
49 | (byte & 0x3fu) | (*codep << 6) : | ||
50 | (0xff >> type) & (byte); | ||
51 | |||
52 | return *state = utf8d[256 + *state + type]; | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * A best-effort decoder. Runs until it hits either end of input or | ||
57 | * the start of an invalid byte sequence. | ||
58 | * | ||
59 | * At exit, we update *destoff with the next offset to write to, *src | ||
60 | * with the next source location past the last one successfully | ||
61 | * decoded, and return the next source location to read from. | ||
62 | * | ||
63 | * Moreover, we expose the internal decoder state (state0 and | ||
64 | * codepoint0), allowing one to restart the decoder after it | ||
65 | * terminates (say, due to a partial codepoint). | ||
66 | * | ||
67 | * In particular, there are a few possible outcomes, | ||
68 | * | ||
69 | * 1) We decoded the buffer entirely: | ||
70 | * In this case we return srcend | ||
71 | * state0 == UTF8_ACCEPT | ||
72 | * | ||
73 | * 2) We met an invalid encoding | ||
74 | * In this case we return the address of the first invalid byte | ||
75 | * state0 == UTF8_REJECT | ||
76 | * | ||
77 | * 3) We reached the end of the buffer while decoding a codepoint | ||
78 | * In this case we return a pointer to the first byte of the partial codepoint | ||
79 | * state0 != UTF8_ACCEPT, UTF8_REJECT | ||
80 | * | ||
81 | */ | ||
82 | |||
83 | #if defined(__GNUC__) || defined(__clang__) | ||
84 | static inline uint8_t const * | ||
85 | _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, | ||
86 | const uint8_t const **src, const uint8_t const *srcend, | ||
87 | uint32_t *codepoint0, uint32_t *state0) | ||
88 | __attribute((always_inline)); | ||
89 | #endif | ||
90 | |||
91 | static inline uint8_t const * | ||
92 | _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, | ||
93 | const uint8_t const **src, const uint8_t const *srcend, | ||
94 | uint32_t *codepoint0, uint32_t *state0) | ||
95 | { | ||
96 | uint16_t *d = dest + *destoff; | ||
97 | const uint8_t *s = *src, *last = *src; | ||
98 | uint32_t state = *state0; | ||
99 | uint32_t codepoint = *codepoint0; | ||
100 | |||
101 | while (s < srcend) { | ||
102 | #if defined(__i386__) || defined(__x86_64__) | ||
103 | /* | ||
104 | * This code will only work on a little-endian system that | ||
105 | * supports unaligned loads. | ||
106 | * | ||
107 | * It gives a substantial speed win on data that is purely or | ||
108 | * partly ASCII (e.g. HTML), at only a slight cost on purely | ||
109 | * non-ASCII text. | ||
110 | */ | ||
111 | |||
112 | if (state == UTF8_ACCEPT) { | ||
113 | while (s < srcend - 4) { | ||
114 | codepoint = *((uint32_t *) s); | ||
115 | if ((codepoint & 0x80808080) != 0) | ||
116 | break; | ||
117 | s += 4; | ||
118 | |||
119 | /* | ||
120 | * Tried 32-bit stores here, but the extra bit-twiddling | ||
121 | * slowed the code down. | ||
122 | */ | ||
123 | |||
124 | *d++ = (uint16_t) (codepoint & 0xff); | ||
125 | *d++ = (uint16_t) ((codepoint >> 8) & 0xff); | ||
126 | *d++ = (uint16_t) ((codepoint >> 16) & 0xff); | ||
127 | *d++ = (uint16_t) ((codepoint >> 24) & 0xff); | ||
128 | } | ||
129 | last = s; | ||
130 | } | ||
131 | #endif | ||
132 | |||
133 | if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { | ||
134 | if (state != UTF8_REJECT) | ||
135 | continue; | ||
136 | break; | ||
137 | } | ||
138 | |||
139 | if (codepoint <= 0xffff) | ||
140 | *d++ = (uint16_t) codepoint; | ||
141 | else { | ||
142 | *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); | ||
143 | *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); | ||
144 | } | ||
145 | last = s; | ||
146 | } | ||
147 | |||
148 | *destoff = d - dest; | ||
149 | *codepoint0 = codepoint; | ||
150 | *state0 = state; | ||
151 | *src = last; | ||
152 | |||
153 | return s; | ||
154 | } | ||
155 | |||
156 | uint8_t const * | ||
157 | _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, | ||
158 | const uint8_t const **src, | ||
159 | const uint8_t const *srcend, | ||
160 | uint32_t *codepoint0, uint32_t *state0) | ||
161 | { | ||
162 | uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend, | ||
163 | codepoint0, state0); | ||
164 | if (*state0 == UTF8_REJECT) | ||
165 | ret -=1; | ||
166 | return ret; | ||
167 | } | ||
168 | |||
diff --git a/include/pipes_text_cbits.h b/include/pipes_text_cbits.h new file mode 100644 index 0000000..b9ab670 --- /dev/null +++ b/include/pipes_text_cbits.h | |||
@@ -0,0 +1,11 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>. | ||
3 | */ | ||
4 | |||
5 | #ifndef _pipes_text_cbits_h | ||
6 | #define _pipes_text_cbits_h | ||
7 | |||
8 | #define UTF8_ACCEPT 0 | ||
9 | #define UTF8_REJECT 12 | ||
10 | |||
11 | #endif | ||
diff --git a/pipes-text.cabal b/pipes-text.cabal index e79f168..b4388be 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal | |||
@@ -12,7 +12,9 @@ build-type: Simple | |||
12 | cabal-version: >=1.10 | 12 | cabal-version: >=1.10 |
13 | 13 | ||
14 | library | 14 | library |
15 | exposed-modules: Pipes.Text, Pipes.Text.Parse | 15 | c-sources: cbits/cbits.c |
16 | include-dirs: include | ||
17 | exposed-modules: Pipes.Text, Pipes.Text.Parse, Pipes.Text.Internal | ||
16 | -- other-modules: | 18 | -- other-modules: |
17 | other-extensions: RankNTypes | 19 | other-extensions: RankNTypes |
18 | build-depends: base >= 4 && < 5 , | 20 | build-depends: base >= 4 && < 5 , |
@@ -23,6 +25,9 @@ library | |||
23 | pipes-bytestring >= 1.0 && < 1.2, | 25 | pipes-bytestring >= 1.0 && < 1.2, |
24 | transformers >= 0.3 && < 0.4, | 26 | transformers >= 0.3 && < 0.4, |
25 | text >=0.11 && < 0.12, | 27 | text >=0.11 && < 0.12, |
26 | bytestring >=0.10 && < 0.11 | 28 | bytestring >=0.10 && < 0.11, |
29 | vector, | ||
30 | void | ||
27 | -- hs-source-dirs: | 31 | -- hs-source-dirs: |
28 | default-language: Haskell2010 \ No newline at end of file | 32 | default-language: Haskell2010 |
33 | ghc-options: -O2 | ||
diff --git a/test/Test.hs b/test/Test.hs new file mode 100644 index 0000000..373bafb --- /dev/null +++ b/test/Test.hs | |||
@@ -0,0 +1,101 @@ | |||
1 | import Utils | ||
2 | |||
3 | import Test.QuickCheck | ||
4 | import Test.QuickCheck.Monadic | ||
5 | import Test.Framework (Test, testGroup, defaultMain) | ||
6 | import Test.Framework.Providers.QuickCheck2 (testProperty) | ||
7 | |||
8 | import Control.Exception (catch) | ||
9 | import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord) | ||
10 | import Data.Monoid (Monoid(..)) | ||
11 | import Control.Monad | ||
12 | import Data.String (fromString) | ||
13 | import Data.Text.Encoding.Error | ||
14 | import qualified Data.List as L | ||
15 | |||
16 | import qualified Data.Bits as Bits (shiftL, shiftR) | ||
17 | import qualified Data.ByteString as B | ||
18 | import qualified Data.ByteString.Lazy as BL | ||
19 | import qualified Data.Text as T | ||
20 | import qualified Data.Text.Lazy as TL | ||
21 | import qualified Data.Text.Encoding as E | ||
22 | import qualified Pipes.Text.Internal as PE | ||
23 | import qualified Pipes.Text as TP | ||
24 | import qualified Pipes.ByteString as BP | ||
25 | import qualified Pipes as P | ||
26 | |||
27 | main :: IO () | ||
28 | main = defaultMain [tests] | ||
29 | -- >>> :main -a 10000 | ||
30 | tests = testGroup "stream_decode" [ | ||
31 | -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid, | ||
32 | testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed , | ||
33 | testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe, | ||
34 | testProperty "t_utf8_dec_some" t_utf8_dec_some] | ||
35 | |||
36 | t_utf8_incr_valid = do | ||
37 | Positive n <- arbitrary | ||
38 | forAll genUnicode $ recode n `eq` id | ||
39 | where recode n = T.concat . feedChunksOf n PE.streamDecodeUtf8 . E.encodeUtf8 | ||
40 | feedChunksOf :: Int -> (B.ByteString -> PE.Decoding) -> B.ByteString | ||
41 | -> [T.Text] | ||
42 | feedChunksOf n f bs | ||
43 | | B.null bs = [] | ||
44 | | otherwise = let (a,b) = B.splitAt n bs | ||
45 | PE.Some t _ f' = f a | ||
46 | in case f a of | ||
47 | PE.Some t _ f' -> t : feedChunksOf n f' b | ||
48 | _ -> [] | ||
49 | |||
50 | t_utf8_incr_mixed = do | ||
51 | Positive n <- arbitrary | ||
52 | txt <- genUnicode | ||
53 | let chunkSize = mod n 7 + 1 | ||
54 | forAll (vector 9) $ | ||
55 | (roundtrip . chunk chunkSize . appendBytes txt) `eq` (appendBytes txt) | ||
56 | where | ||
57 | roundtrip :: [B.ByteString] -> B.ByteString | ||
58 | roundtrip bss = go PE.streamDecodeUtf8 B.empty bss where | ||
59 | go dec acc [] = acc | ||
60 | go dec acc [bs] = case dec bs of | ||
61 | PE.Some t l dec' -> acc <> E.encodeUtf8 t <> l | ||
62 | PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs' | ||
63 | go dec acc (bs:bss) = case dec bs of | ||
64 | PE.Some t l dec' -> go dec' (acc <> E.encodeUtf8 t) bss | ||
65 | PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs' <> B.concat bss | ||
66 | chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b | ||
67 | appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append | ||
68 | |||
69 | t_utf8_incr_pipe = do | ||
70 | Positive m <- arbitrary | ||
71 | Positive n <- arbitrary | ||
72 | txt <- genUnicode | ||
73 | let chunkSize = mod n 7 + 1 | ||
74 | bytesLength = mod 10 m | ||
75 | forAll (vector bytesLength) $ | ||
76 | (BL.toStrict . BP.toLazy . roundtrip . P.each . chunk chunkSize . appendBytes txt) | ||
77 | `eq` | ||
78 | appendBytes txt | ||
79 | where | ||
80 | roundtrip :: Monad m => P.Producer B.ByteString m r -> P.Producer B.ByteString m r | ||
81 | roundtrip p = join (TP.decodeUtf8 p P.>-> TP.encodeUtf8) | ||
82 | chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b | ||
83 | appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append | ||
84 | |||
85 | -- | ||
86 | t_utf8_dec_some = do | ||
87 | Positive m <- arbitrary | ||
88 | txt <- genUnicode | ||
89 | let bytesLength = mod 10 m :: Int | ||
90 | forAll (vector bytesLength) $ | ||
91 | (roundtrip . appendBytes txt) | ||
92 | `eq` | ||
93 | appendBytes txt | ||
94 | where | ||
95 | roundtrip bs = case PE.decodeSomeUtf8 bs of | ||
96 | (txt,bys) -> E.encodeUtf8 txt <> bys | ||
97 | appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append | ||
98 | |||
99 | |||
100 | |||
101 | |||
diff --git a/test/Utils.hs b/test/Utils.hs new file mode 100644 index 0000000..75cd1db --- /dev/null +++ b/test/Utils.hs | |||
@@ -0,0 +1,109 @@ | |||
1 | {-#LANGUAGE ScopedTypeVariables#-} | ||
2 | module Utils where | ||
3 | import Control.Exception (SomeException, bracket, bracket_, evaluate, try) | ||
4 | import System.IO.Unsafe (unsafePerformIO) | ||
5 | import Debug.Trace (trace) | ||
6 | import Data.Bits ((.&.)) | ||
7 | import Data.Char (chr) | ||
8 | import Data.String (IsString, fromString) | ||
9 | import System.Random (Random (..), RandomGen) | ||
10 | import Test.QuickCheck hiding ((.&.)) | ||
11 | import Test.QuickCheck.Monadic (assert, monadicIO, run) | ||
12 | import qualified Data.ByteString as B | ||
13 | import Pipes.Text.Internal | ||
14 | |||
15 | |||
16 | |||
17 | |||
18 | |||
19 | -- Ensure that two potentially bottom values (in the sense of crashing | ||
20 | -- for some inputs, not looping infinitely) either both crash, or both | ||
21 | -- give comparable results for some input. | ||
22 | (=^=) :: (Eq a, Show a) => a -> a -> Bool | ||
23 | i =^= j = unsafePerformIO $ do | ||
24 | x <- try (evaluate i) | ||
25 | y <- try (evaluate j) | ||
26 | case (x,y) of | ||
27 | (Left (_ :: SomeException), Left (_ :: SomeException)) | ||
28 | -> return True | ||
29 | (Right a, Right b) -> return (a == b) | ||
30 | e -> trace ("*** Divergence: " ++ show e) return False | ||
31 | infix 4 =^= | ||
32 | {-# NOINLINE (=^=) #-} | ||
33 | |||
34 | -- Do two functions give the same answer? | ||
35 | eq :: (Eq a, Show a) => (t -> a) -> (t -> a) -> t -> Bool | ||
36 | eq a b s = a s =^= b s | ||
37 | |||
38 | -- What about with the RHS packed? | ||
39 | -- eqP :: (Eq a, Show a, Stringy s) => | ||
40 | -- (String -> a) -> (s -> a) -> String -> Word8 -> Bool | ||
41 | -- eqP f g s w = eql "orig" (f s) (g t) && | ||
42 | -- eql "mini" (f s) (g mini) && | ||
43 | -- eql "head" (f sa) (g ta) && | ||
44 | -- eql "tail" (f sb) (g tb) | ||
45 | -- where t = packS s | ||
46 | -- mini = packSChunkSize 10 s | ||
47 | -- (sa,sb) = splitAt m s | ||
48 | -- (ta,tb) = splitAtS m t | ||
49 | -- l = length s | ||
50 | -- m | l == 0 = n | ||
51 | -- | otherwise = n `mod` l | ||
52 | -- n = fromIntegral w | ||
53 | -- eql d a b | ||
54 | -- | a =^= b = True | ||
55 | -- | otherwise = trace (d ++ ": " ++ show a ++ " /= " ++ show b) False | ||
56 | |||
57 | |||
58 | instance Arbitrary B.ByteString where | ||
59 | arbitrary = B.pack `fmap` arbitrary | ||
60 | |||
61 | genUnicode :: IsString a => Gen a | ||
62 | genUnicode = fmap fromString string where | ||
63 | string = sized $ \n -> | ||
64 | do k <- choose (0,n) | ||
65 | sequence [ char | _ <- [1..k] ] | ||
66 | |||
67 | excluding :: [a -> Bool] -> Gen a -> Gen a | ||
68 | excluding bad gen = loop | ||
69 | where | ||
70 | loop = do | ||
71 | x <- gen | ||
72 | if or (map ($ x) bad) | ||
73 | then loop | ||
74 | else return x | ||
75 | |||
76 | reserved = [lowSurrogate, highSurrogate, noncharacter] | ||
77 | lowSurrogate c = c >= 0xDC00 && c <= 0xDFFF | ||
78 | highSurrogate c = c >= 0xD800 && c <= 0xDBFF | ||
79 | noncharacter c = masked == 0xFFFE || masked == 0xFFFF | ||
80 | where | ||
81 | masked = c .&. 0xFFFF | ||
82 | |||
83 | ascii = choose (0,0x7F) | ||
84 | plane0 = choose (0xF0, 0xFFFF) | ||
85 | plane1 = oneof [ choose (0x10000, 0x10FFF) | ||
86 | , choose (0x11000, 0x11FFF) | ||
87 | , choose (0x12000, 0x12FFF) | ||
88 | , choose (0x13000, 0x13FFF) | ||
89 | , choose (0x1D000, 0x1DFFF) | ||
90 | , choose (0x1F000, 0x1FFFF) | ||
91 | ] | ||
92 | plane2 = oneof [ choose (0x20000, 0x20FFF) | ||
93 | , choose (0x21000, 0x21FFF) | ||
94 | , choose (0x22000, 0x22FFF) | ||
95 | , choose (0x23000, 0x23FFF) | ||
96 | , choose (0x24000, 0x24FFF) | ||
97 | , choose (0x25000, 0x25FFF) | ||
98 | , choose (0x26000, 0x26FFF) | ||
99 | , choose (0x27000, 0x27FFF) | ||
100 | , choose (0x28000, 0x28FFF) | ||
101 | , choose (0x29000, 0x29FFF) | ||
102 | , choose (0x2A000, 0x2AFFF) | ||
103 | , choose (0x2B000, 0x2BFFF) | ||
104 | , choose (0x2F000, 0x2FFFF) | ||
105 | ] | ||
106 | plane14 = choose (0xE0000, 0xE0FFF) | ||
107 | planes = [ascii, plane0, plane1, plane2, plane14] | ||
108 | |||
109 | char = chr `fmap` excluding reserved (oneof planes) | ||