aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authormichaelt <what_is_it_to_do_anything@yahoo.com>2013-12-23 13:02:49 -0500
committermichaelt <what_is_it_to_do_anything@yahoo.com>2013-12-23 13:02:49 -0500
commit8c48280926efffc0ca52a5d9ca796d639d053379 (patch)
tree972ca8955b5581d634663424e973e56fa4487fe5
parent8853a440e37523bae8cb46827d0d2d356bad5c46 (diff)
downloadtext-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.gz
text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.zst
text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.zip
variant using text internals in place of text streamDecodeUtf8
-rw-r--r--Pipes/Text.hs136
-rw-r--r--Pipes/Text/Internal.hs157
-rw-r--r--cbits/cbits.c151
-rw-r--r--include/pipes_text_cbits.h11
-rw-r--r--pipes-text.cabal4
-rw-r--r--test/Test.hs60
-rw-r--r--test/Utils.hs109
7 files changed, 550 insertions, 78 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs
index a5859a3..6845dd3 100644
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -81,10 +81,6 @@ module Pipes.Text (
81 filter, 81 filter,
82 scan, 82 scan,
83 encodeUtf8, 83 encodeUtf8,
84#if MIN_VERSION_text(0,11,4)
85 pipeDecodeUtf8,
86 pipeDecodeUtf8With,
87#endif
88 pack, 84 pack,
89 unpack, 85 unpack,
90 toCaseFold, 86 toCaseFold,
@@ -119,10 +115,8 @@ module Pipes.Text (
119 group, 115 group,
120 lines, 116 lines,
121 words, 117 words,
122#if MIN_VERSION_text(0,11,4)
123 decodeUtf8, 118 decodeUtf8,
124 decodeUtf8With, 119 decodeUtf8With,
125#endif
126 -- * Transformations 120 -- * Transformations
127 intersperse, 121 intersperse,
128 122
@@ -167,6 +161,7 @@ import qualified GHC.IO.Exception as G
167import Pipes 161import Pipes
168import qualified Pipes.ByteString as PB 162import qualified Pipes.ByteString as PB
169import qualified Pipes.ByteString.Parse as PBP 163import qualified Pipes.ByteString.Parse as PBP
164import qualified Pipes.Text.Internal as PE
170import Pipes.Text.Parse ( 165import Pipes.Text.Parse (
171 nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) 166 nextChar, drawChar, unDrawChar, peekChar, isEndOfChars )
172import Pipes.Core (respond, Server') 167import Pipes.Core (respond, Server')
@@ -214,43 +209,60 @@ fromLazy = foldrChunks (\e a -> yield e >> a) (return ())
214{-# INLINABLE fromLazy #-} 209{-# INLINABLE fromLazy #-}
215 210
216-- | Stream text from 'stdin' 211-- | Stream text from 'stdin'
217stdin :: MonadIO m => Producer' Text m () 212stdin :: MonadIO m => Producer' Text m (Producer ByteString m ())
218stdin = fromHandle IO.stdin 213stdin = fromHandle IO.stdin
219{-# INLINABLE stdin #-} 214{-# INLINABLE stdin #-}
220 215
221{-| Convert a 'IO.Handle' into a text stream using a text size 216{-| Convert a 'IO.Handle' into a text stream using a text size
222 determined by the good sense of the text library. 217 determined by the good sense of the text library.
223
224-} 218-}
225 219
226fromHandle :: MonadIO m => IO.Handle -> Producer' Text m () 220fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ())
227#if MIN_VERSION_text(0,11,4) 221-- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)`
228fromHandle h = go TE.streamDecodeUtf8 where 222-- if only so that mistakes can be concentrated in one place.
223-- This modifies something that was faster on an earlier iteration.
224-- Note also that the `text` replacement system is being ignored;
225-- with a replacement scheme one could have `Producer Text m ()`
226-- the relation to the replacement business needs to be thought out.
227-- The complicated type seems overmuch for the toy stdin above
228fromHandle h = go PE.streamDecodeUtf8 B.empty where
229 act = B.hGetSome h defaultChunkSize 229 act = B.hGetSome h defaultChunkSize
230 go dec = do chunk <- liftIO act 230 go dec old = do chunk <- liftIO act
231 case dec chunk of 231 if B.null chunk
232 TE.Some text _ dec' -> do yield text 232 then if B.null old then return (return ())
233 unless (B.null chunk) (go dec') 233 else return (yield old >> return ())
234 else case dec chunk of
235 PE.Some text bs dec' ->
236 if T.null text then go dec' (B.append old bs)
237 else do yield text
238 go dec' B.empty
239 PE.Other text bs ->
240 if T.null text then return (do yield old
241 yield bs
242 PB.fromHandle h)
243 else do yield text
244 return (do yield bs
245 PB.fromHandle h)
234{-# INLINE fromHandle#-} 246{-# INLINE fromHandle#-}
235-- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as 247-- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as
236-- the dedicated Text IO function 'hGetChunk' ; 248-- the dedicated Text IO function 'hGetChunk' ;
237-- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut" 249-- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut"
238-- runs the same as the conduit equivalent, only slightly slower 250-- runs the same as the conduit equivalent, only slightly slower
239-- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut" 251-- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut"
240#else 252-- #else
241fromHandle h = go where 253-- fromHandle h = go where
242 go = do txt <- liftIO (T.hGetChunk h) 254-- go = do txt <- liftIO (T.hGetChunk h)
243 unless (T.null txt) $ do yield txt 255-- unless (T.null txt) $ do yield txt
244 go 256-- go
245{-# INLINABLE fromHandle#-} 257-- {-# INLINABLE fromHandle#-}
246#endif 258-- #endif
247{-| Stream text from a file using Pipes.Safe 259{-| Stream text from a file using Pipes.Safe
248 260
249>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout 261>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
250MAIN = PUTSTRLN "HELLO WORLD" 262MAIN = PUTSTRLN "HELLO WORLD"
251-} 263-}
252 264
253readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m () 265readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ())
254readFile file = Safe.withFile file IO.ReadMode fromHandle 266readFile file = Safe.withFile file IO.ReadMode fromHandle
255{-# INLINABLE readFile #-} 267{-# INLINABLE readFile #-}
256 268
@@ -610,74 +622,44 @@ count :: (Monad m, Num n) => Text -> Producer Text m () -> m n
610count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) 622count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
611{-# INLINABLE count #-} 623{-# INLINABLE count #-}
612 624
613#if MIN_VERSION_text(0,11,4)
614-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded 625-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
615-- into a Pipe of Text 626-- into a Pipe of Text
616decodeUtf8 627decodeUtf8
617 :: Monad m 628 :: Monad m
618 => Producer ByteString m r -> Producer Text m (Producer ByteString m r) 629 => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
619decodeUtf8 = go TE.streamDecodeUtf8 630decodeUtf8 = decodeUtf8With Nothing
620 where go dec p = do
621 x <- lift (next p)
622 case x of
623 Left r -> return (return r)
624 Right (chunk, p') -> do
625 let TE.Some text l dec' = dec chunk
626 if B.null l
627 then do
628 yield text
629 go dec' p'
630 else return $ do
631 yield l
632 p'
633{-# INLINEABLE decodeUtf8 #-} 631{-# INLINEABLE decodeUtf8 #-}
634 632
635-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded 633-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
636-- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@ 634-- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@
637-- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\" 635-- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\"
638decodeUtf8With 636decodeUtf8With
639 :: Monad m 637 :: Monad m
640 => TE.OnDecodeError 638 => Maybe TE.OnDecodeError
641 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) 639 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
642decodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) 640decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where
643 where go dec p = do 641 go dec old p = do
644 x <- lift (next p) 642 x <- lift (next p)
645 case x of 643 case x of
646 Left r -> return (return r) 644 Left r -> if B.null old then return (return r)
647 Right (chunk, p') -> do 645 else return (do yield old
648 let TE.Some text l dec' = dec chunk 646 return r)
649 if B.null l 647 Right (chunk, p') ->
650 then do 648 case dec chunk of
651 yield text 649 PE.Some text l dec' ->
652 go dec' p' 650 if T.null text then go dec' (B.append old l) p'
653 else return $ do 651 else do yield text
654 yield l 652 go dec' B.empty p'
655 p' 653 PE.Other text bs ->
654 if T.null text then return (do yield old
655 yield bs
656 p')
657 else do yield text
658 return (do yield bs
659 p')
656{-# INLINEABLE decodeUtf8With #-} 660{-# INLINEABLE decodeUtf8With #-}
657 661
658-- | A simple pipe from 'ByteString' to 'Text'; a decoding error will arise 662
659-- with any chunk that contains a sequence of bytes that is unreadable. Otherwise
660-- only few bytes will only be moved from one chunk to the next before decoding.
661pipeDecodeUtf8 :: Monad m => Pipe ByteString Text m r
662pipeDecodeUtf8 = go TE.streamDecodeUtf8
663 where go dec = do chunk <- await
664 case dec chunk of
665 TE.Some text l dec' -> do yield text
666 go dec'
667{-# INLINEABLE pipeDecodeUtf8 #-}
668
669-- | A simple pipe from 'ByteString' to 'Text' using a replacement function.
670pipeDecodeUtf8With
671 :: Monad m
672 => TE.OnDecodeError
673 -> Pipe ByteString Text m r
674pipeDecodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr)
675 where go dec = do chunk <- await
676 case dec chunk of
677 TE.Some text l dec' -> do yield text
678 go dec'
679{-# INLINEABLE pipeDecodeUtf8With #-}
680#endif
681 663
682-- | Splits a 'Producer' after the given number of characters 664-- | Splits a 'Producer' after the given number of characters
683splitAt 665splitAt
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
new file mode 100644
index 0000000..05d9887
--- /dev/null
+++ b/Pipes/Text/Internal.hs
@@ -0,0 +1,157 @@
1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
2 UnliftedFFITypes #-}
3-- This module lifts material from Brian O'Sullivan's text package
4-- especially Data.Text.Encoding in order to define a pipes-appropriate
5-- streamDecodeUtf8
6module Pipes.Text.Internal
7 ( Decoding(..)
8 , streamDecodeUtf8With
9 , streamDecodeUtf8
10 ) where
11
12import Control.Exception (evaluate, try)
13#if __GLASGOW_HASKELL__ >= 702
14import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
15import Control.Monad.ST (ST, runST)
16#else
17import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
18#endif
19import Data.Bits ((.&.))
20import Data.ByteString as B
21import Data.ByteString.Internal as B
22import Data.Text ()
23import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
24import Data.Text.Internal (Text(..), safe, textP)
25import Data.Word (Word8, Word32)
26import Foreign.C.Types (CSize)
27import Foreign.ForeignPtr (withForeignPtr)
28import Foreign.Marshal.Utils (with)
29import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
30import Foreign.Storable (Storable, peek, poke)
31import GHC.Base hiding (ord)
32import GHC.Word
33import qualified Data.Text.Array as A
34import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
35import GHC.Word (Word8(..), Word16(..), Word32(..))
36
37import Data.Text.Unsafe (unsafeDupablePerformIO)
38
39#include "pipes_text_cbits.h"
40
41-- | A stream oriented decoding result.
42data Decoding = Some Text ByteString (ByteString -> Decoding)
43 | Other Text ByteString
44instance Show Decoding where
45 showsPrec d (Some t bs _) = showParen (d > prec) $
46 showString "Some " . showsPrec prec' t .
47 showChar ' ' . showsPrec prec' bs .
48 showString " _"
49 where prec = 10; prec' = prec + 1
50 showsPrec d (Other t bs) = showParen (d > prec) $
51 showString "Other " . showsPrec prec' t .
52 showChar ' ' . showsPrec prec' bs .
53 showString " _"
54 where prec = 10; prec' = prec + 1
55
56newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
57newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
58
59-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
60-- encoded text that is known to be valid.
61--
62-- If the input contains any invalid UTF-8 data, an exception will be
63-- thrown (either by this function or a continuation) that cannot be
64-- caught in pure code. For more control over the handling of invalid
65-- data, use 'streamDecodeUtf8With'.
66streamDecodeUtf8 :: ByteString -> Decoding
67streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode)
68
69-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
70-- encoded text.
71streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding
72streamDecodeUtf8With mErr = case mErr of
73 Nothing -> decodeWith False strictDecode
74 Just onErr -> decodeWith True onErr
75 where
76 -- We create a slightly larger than necessary buffer to accommodate a
77 -- potential surrogate pair started in the last buffer
78 decodeWith replace onErr = decodeChunk 0 0
79 where
80 decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
81 decodeChunk codepoint0 state0 bs@(PS fp off len) =
82 runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
83 where
84 decodeChunkToBuffer :: A.MArray s -> IO Decoding
85 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
86 with (0::CSize) $ \destOffPtr ->
87 with codepoint0 $ \codepointPtr ->
88 with state0 $ \statePtr ->
89 with nullPtr $ \curPtrPtr ->
90 let end = ptr `plusPtr` (off + len)
91 loop curPtr = do
92 poke curPtrPtr curPtr
93 curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
94 curPtrPtr end codepointPtr statePtr
95 state <- peek statePtr
96 case state of
97 UTF8_REJECT ->
98 -- We encountered an encoding error
99 if replace
100 then do
101 x <- peek curPtr'
102 case onErr desc (Just x) of
103 Nothing -> loop $ curPtr' `plusPtr` 1
104 Just c -> do
105 destOff <- peek destOffPtr
106 w <- unsafeSTToIO $
107 unsafeWrite dest (fromIntegral destOff) (safe c)
108 poke destOffPtr (destOff + fromIntegral w)
109 poke statePtr 0
110 loop $ curPtr' `plusPtr` 1
111 else do
112 n <- peek destOffPtr
113 chunkText <- unsafeSTToIO $ do
114 arr <- A.unsafeFreeze dest
115 return $! textP arr 0 (fromIntegral n)
116 lastPtr <- peek curPtrPtr
117 let left = lastPtr `minusPtr` curPtr
118 return $ Other chunkText (B.drop left bs)
119 _ -> do
120 -- We encountered the end of the buffer while decoding
121 n <- peek destOffPtr
122 codepoint <- peek codepointPtr
123 chunkText <- unsafeSTToIO $ do
124 arr <- A.unsafeFreeze dest
125 return $! textP arr 0 (fromIntegral n)
126 lastPtr <- peek curPtrPtr
127 let left = lastPtr `minusPtr` curPtr
128 return $ Some chunkText (B.drop left bs)
129 (decodeChunk codepoint state)
130 in loop (ptr `plusPtr` off)
131 desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
132
133ord :: Char -> Int
134ord (C# c#) = I# (ord# c#)
135{-# INLINE ord #-}
136
137
138unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
139unsafeWrite marr i c
140 | n < 0x10000 = do
141 A.unsafeWrite marr i (fromIntegral n)
142 return 1
143 | otherwise = do
144 A.unsafeWrite marr i lo
145 A.unsafeWrite marr (i+1) hi
146 return 2
147 where n = ord c
148 m = n - 0x10000
149 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
150 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
151 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
152{-# INLINE unsafeWrite #-}
153
154foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
155 :: MutableByteArray# s -> Ptr CSize
156 -> Ptr (Ptr Word8) -> Ptr Word8
157 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file
diff --git a/cbits/cbits.c b/cbits/cbits.c
new file mode 100644
index 0000000..e0fdfd5
--- /dev/null
+++ b/cbits/cbits.c
@@ -0,0 +1,151 @@
1/*
2 * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
3 *
4 * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
5 *
6 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
7 */
8
9#include <string.h>
10#include <stdint.h>
11#include <stdio.h>
12#include "pipes_text_cbits.h"
13
14
15
16#define UTF8_ACCEPT 0
17#define UTF8_REJECT 12
18
19static const uint8_t utf8d[] = {
20 /*
21 * The first part of the table maps bytes to character classes that
22 * to reduce the size of the transition table and create bitmasks.
23 */
24 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
25 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
26 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
27 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
28 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
29 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
30 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
31 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
32
33 /*
34 * The second part is a transition table that maps a combination of
35 * a state of the automaton and a character class to a state.
36 */
37 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
38 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
39 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
40 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
41 12,36,12,12,12,12,12,12,12,12,12,12,
42};
43
44static inline uint32_t
45decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
46 uint32_t type = utf8d[byte];
47
48 *codep = (*state != UTF8_ACCEPT) ?
49 (byte & 0x3fu) | (*codep << 6) :
50 (0xff >> type) & (byte);
51
52 return *state = utf8d[256 + *state + type];
53}
54
55/*
56 * A best-effort decoder. Runs until it hits either end of input or
57 * the start of an invalid byte sequence.
58 *
59 * At exit, we update *destoff with the next offset to write to, *src
60 * with the next source location past the last one successfully
61 * decoded, and return the next source location to read from.
62 *
63 * Moreover, we expose the internal decoder state (state0 and
64 * codepoint0), allowing one to restart the decoder after it
65 * terminates (say, due to a partial codepoint).
66 *
67 * In particular, there are a few possible outcomes,
68 *
69 * 1) We decoded the buffer entirely:
70 * In this case we return srcend
71 * state0 == UTF8_ACCEPT
72 *
73 * 2) We met an invalid encoding
74 * In this case we return the address of the first invalid byte
75 * state0 == UTF8_REJECT
76 *
77 * 3) We reached the end of the buffer while decoding a codepoint
78 * In this case we return a pointer to the first byte of the partial codepoint
79 * state0 != UTF8_ACCEPT, UTF8_REJECT
80 *
81 */
82const uint8_t *
83_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
84 const uint8_t **const src,
85 const uint8_t *const srcend,
86 uint32_t *codepoint0, uint32_t *state0)
87{
88 uint16_t *d = dest + *destoff;
89 const uint8_t *s = *src, *last = *src;
90 uint32_t state = *state0;
91 uint32_t codepoint = *codepoint0;
92
93 while (s < srcend) {
94#if defined(__i386__) || defined(__x86_64__)
95 /*
96 * This code will only work on a little-endian system that
97 * supports unaligned loads.
98 *
99 * It gives a substantial speed win on data that is purely or
100 * partly ASCII (e.g. HTML), at only a slight cost on purely
101 * non-ASCII text.
102 */
103
104 if (state == UTF8_ACCEPT) {
105 while (s < srcend - 4) {
106 codepoint = *((uint32_t *) s);
107 if ((codepoint & 0x80808080) != 0)
108 break;
109 s += 4;
110
111 /*
112 * Tried 32-bit stores here, but the extra bit-twiddling
113 * slowed the code down.
114 */
115
116 *d++ = (uint16_t) (codepoint & 0xff);
117 *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
118 *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
119 *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
120 }
121 last = s;
122 }
123#endif
124
125 if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
126 if (state != UTF8_REJECT)
127 continue;
128 break;
129 }
130
131 if (codepoint <= 0xffff)
132 *d++ = (uint16_t) codepoint;
133 else {
134 *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
135 *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
136 }
137 last = s;
138 }
139
140 /* Invalid encoding, back up to the errant character */
141 if (state == UTF8_REJECT)
142 s -= 1;
143
144 *destoff = d - dest;
145 *codepoint0 = codepoint;
146 *state0 = state;
147 *src = last;
148
149 return s;
150}
151
diff --git a/include/pipes_text_cbits.h b/include/pipes_text_cbits.h
new file mode 100644
index 0000000..b9ab670
--- /dev/null
+++ b/include/pipes_text_cbits.h
@@ -0,0 +1,11 @@
1/*
2 * Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>.
3 */
4
5#ifndef _pipes_text_cbits_h
6#define _pipes_text_cbits_h
7
8#define UTF8_ACCEPT 0
9#define UTF8_REJECT 12
10
11#endif
diff --git a/pipes-text.cabal b/pipes-text.cabal
index e79f168..86fbab8 100644
--- a/pipes-text.cabal
+++ b/pipes-text.cabal
@@ -12,7 +12,9 @@ build-type: Simple
12cabal-version: >=1.10 12cabal-version: >=1.10
13 13
14library 14library
15 exposed-modules: Pipes.Text, Pipes.Text.Parse 15 c-sources: cbits/cbits.c
16 include-dirs: include
17 exposed-modules: Pipes.Text, Pipes.Text.Parse, Pipes.Text.Internal
16 -- other-modules: 18 -- other-modules:
17 other-extensions: RankNTypes 19 other-extensions: RankNTypes
18 build-depends: base >= 4 && < 5 , 20 build-depends: base >= 4 && < 5 ,
diff --git a/test/Test.hs b/test/Test.hs
new file mode 100644
index 0000000..1579f2b
--- /dev/null
+++ b/test/Test.hs
@@ -0,0 +1,60 @@
1import Utils
2
3import Test.QuickCheck
4import Test.QuickCheck.Monadic
5import Test.Framework (Test, testGroup, defaultMain)
6import Test.Framework.Providers.QuickCheck2 (testProperty)
7
8import Control.Exception (catch)
9import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord)
10import Data.Monoid (Monoid(..))
11import Data.String (fromString)
12import Data.Text.Encoding.Error
13import qualified Data.List as L
14
15import qualified Data.Bits as Bits (shiftL, shiftR)
16import qualified Data.ByteString as B
17import qualified Data.ByteString.Lazy as BL
18import qualified Data.Text as T
19import qualified Data.Text.Lazy as TL
20import qualified Data.Text.Encoding as E
21import qualified Pipes.Text.Internal as PE
22
23main :: IO ()
24main = defaultMain [tests]
25-- >>> :main -a 10000
26
27tests = testGroup "stream_decode" [
28
29 testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
30 testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed]
31
32t_utf8_incr_valid = do
33 Positive n <- arbitrary
34 forAll genUnicode $ recode n `eq` id
35 where recode n = T.concat . feedChunksOf n PE.streamDecodeUtf8 . E.encodeUtf8
36 feedChunksOf :: Int -> (B.ByteString -> PE.Decoding) -> B.ByteString
37 -> [T.Text]
38 feedChunksOf n f bs
39 | B.null bs = []
40 | otherwise = let (a,b) = B.splitAt n bs
41 PE.Some t _ f' = f a
42 in case f a of
43 PE.Some t _ f' -> t : feedChunksOf n f' b
44 _ -> []
45
46t_utf8_incr_mixed = do
47 Positive n <- arbitrary
48 txt <- genUnicode
49 forAll (vector 9) $ (roundtrip . chunk (mod n 7 + 1) . appendBytes txt) `eq` appendBytes txt
50 where
51 roundtrip :: [B.ByteString] -> B.ByteString
52 roundtrip bss = go (PE.streamDecodeUtf8With Nothing) B.empty B.empty bss where
53 go dec acc old [] = acc <> old
54 go dec acc old (bs:bss) = case dec bs of
55 PE.Some t new dec' -> if T.null t then go dec' (acc <> E.encodeUtf8 t) (old <> new) bss
56 else go dec' (acc <> E.encodeUtf8 t) new bss
57 PE.Other t bs' -> if T.null t then acc <> old <> bs <> B.concat bss
58 else acc <> E.encodeUtf8 t <> bs' <> B.concat bss
59 chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
60 appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
diff --git a/test/Utils.hs b/test/Utils.hs
new file mode 100644
index 0000000..75cd1db
--- /dev/null
+++ b/test/Utils.hs
@@ -0,0 +1,109 @@
1{-#LANGUAGE ScopedTypeVariables#-}
2module Utils where
3import Control.Exception (SomeException, bracket, bracket_, evaluate, try)
4import System.IO.Unsafe (unsafePerformIO)
5import Debug.Trace (trace)
6import Data.Bits ((.&.))
7import Data.Char (chr)
8import Data.String (IsString, fromString)
9import System.Random (Random (..), RandomGen)
10import Test.QuickCheck hiding ((.&.))
11import Test.QuickCheck.Monadic (assert, monadicIO, run)
12import qualified Data.ByteString as B
13import Pipes.Text.Internal
14
15
16
17
18
19-- Ensure that two potentially bottom values (in the sense of crashing
20-- for some inputs, not looping infinitely) either both crash, or both
21-- give comparable results for some input.
22(=^=) :: (Eq a, Show a) => a -> a -> Bool
23i =^= j = unsafePerformIO $ do
24 x <- try (evaluate i)
25 y <- try (evaluate j)
26 case (x,y) of
27 (Left (_ :: SomeException), Left (_ :: SomeException))
28 -> return True
29 (Right a, Right b) -> return (a == b)
30 e -> trace ("*** Divergence: " ++ show e) return False
31infix 4 =^=
32{-# NOINLINE (=^=) #-}
33
34-- Do two functions give the same answer?
35eq :: (Eq a, Show a) => (t -> a) -> (t -> a) -> t -> Bool
36eq a b s = a s =^= b s
37
38-- What about with the RHS packed?
39-- eqP :: (Eq a, Show a, Stringy s) =>
40-- (String -> a) -> (s -> a) -> String -> Word8 -> Bool
41-- eqP f g s w = eql "orig" (f s) (g t) &&
42-- eql "mini" (f s) (g mini) &&
43-- eql "head" (f sa) (g ta) &&
44-- eql "tail" (f sb) (g tb)
45-- where t = packS s
46-- mini = packSChunkSize 10 s
47-- (sa,sb) = splitAt m s
48-- (ta,tb) = splitAtS m t
49-- l = length s
50-- m | l == 0 = n
51-- | otherwise = n `mod` l
52-- n = fromIntegral w
53-- eql d a b
54-- | a =^= b = True
55-- | otherwise = trace (d ++ ": " ++ show a ++ " /= " ++ show b) False
56
57
58instance Arbitrary B.ByteString where
59 arbitrary = B.pack `fmap` arbitrary
60
61genUnicode :: IsString a => Gen a
62genUnicode = fmap fromString string where
63 string = sized $ \n ->
64 do k <- choose (0,n)
65 sequence [ char | _ <- [1..k] ]
66
67 excluding :: [a -> Bool] -> Gen a -> Gen a
68 excluding bad gen = loop
69 where
70 loop = do
71 x <- gen
72 if or (map ($ x) bad)
73 then loop
74 else return x
75
76 reserved = [lowSurrogate, highSurrogate, noncharacter]
77 lowSurrogate c = c >= 0xDC00 && c <= 0xDFFF
78 highSurrogate c = c >= 0xD800 && c <= 0xDBFF
79 noncharacter c = masked == 0xFFFE || masked == 0xFFFF
80 where
81 masked = c .&. 0xFFFF
82
83 ascii = choose (0,0x7F)
84 plane0 = choose (0xF0, 0xFFFF)
85 plane1 = oneof [ choose (0x10000, 0x10FFF)
86 , choose (0x11000, 0x11FFF)
87 , choose (0x12000, 0x12FFF)
88 , choose (0x13000, 0x13FFF)
89 , choose (0x1D000, 0x1DFFF)
90 , choose (0x1F000, 0x1FFFF)
91 ]
92 plane2 = oneof [ choose (0x20000, 0x20FFF)
93 , choose (0x21000, 0x21FFF)
94 , choose (0x22000, 0x22FFF)
95 , choose (0x23000, 0x23FFF)
96 , choose (0x24000, 0x24FFF)
97 , choose (0x25000, 0x25FFF)
98 , choose (0x26000, 0x26FFF)
99 , choose (0x27000, 0x27FFF)
100 , choose (0x28000, 0x28FFF)
101 , choose (0x29000, 0x29FFF)
102 , choose (0x2A000, 0x2AFFF)
103 , choose (0x2B000, 0x2BFFF)
104 , choose (0x2F000, 0x2FFFF)
105 ]
106 plane14 = choose (0xE0000, 0xE0FFF)
107 planes = [ascii, plane0, plane1, plane2, plane14]
108
109 char = chr `fmap` excluding reserved (oneof planes)