diff options
-rw-r--r-- | Pipes/Text.hs | 136 | ||||
-rw-r--r-- | Pipes/Text/Internal.hs | 157 | ||||
-rw-r--r-- | cbits/cbits.c | 151 | ||||
-rw-r--r-- | include/pipes_text_cbits.h | 11 | ||||
-rw-r--r-- | pipes-text.cabal | 4 | ||||
-rw-r--r-- | test/Test.hs | 60 | ||||
-rw-r--r-- | test/Utils.hs | 109 |
7 files changed, 550 insertions, 78 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs index a5859a3..6845dd3 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs | |||
@@ -81,10 +81,6 @@ module Pipes.Text ( | |||
81 | filter, | 81 | filter, |
82 | scan, | 82 | scan, |
83 | encodeUtf8, | 83 | encodeUtf8, |
84 | #if MIN_VERSION_text(0,11,4) | ||
85 | pipeDecodeUtf8, | ||
86 | pipeDecodeUtf8With, | ||
87 | #endif | ||
88 | pack, | 84 | pack, |
89 | unpack, | 85 | unpack, |
90 | toCaseFold, | 86 | toCaseFold, |
@@ -119,10 +115,8 @@ module Pipes.Text ( | |||
119 | group, | 115 | group, |
120 | lines, | 116 | lines, |
121 | words, | 117 | words, |
122 | #if MIN_VERSION_text(0,11,4) | ||
123 | decodeUtf8, | 118 | decodeUtf8, |
124 | decodeUtf8With, | 119 | decodeUtf8With, |
125 | #endif | ||
126 | -- * Transformations | 120 | -- * Transformations |
127 | intersperse, | 121 | intersperse, |
128 | 122 | ||
@@ -167,6 +161,7 @@ import qualified GHC.IO.Exception as G | |||
167 | import Pipes | 161 | import Pipes |
168 | import qualified Pipes.ByteString as PB | 162 | import qualified Pipes.ByteString as PB |
169 | import qualified Pipes.ByteString.Parse as PBP | 163 | import qualified Pipes.ByteString.Parse as PBP |
164 | import qualified Pipes.Text.Internal as PE | ||
170 | import Pipes.Text.Parse ( | 165 | import Pipes.Text.Parse ( |
171 | nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) | 166 | nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) |
172 | import Pipes.Core (respond, Server') | 167 | import Pipes.Core (respond, Server') |
@@ -214,43 +209,60 @@ fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) | |||
214 | {-# INLINABLE fromLazy #-} | 209 | {-# INLINABLE fromLazy #-} |
215 | 210 | ||
216 | -- | Stream text from 'stdin' | 211 | -- | Stream text from 'stdin' |
217 | stdin :: MonadIO m => Producer' Text m () | 212 | stdin :: MonadIO m => Producer' Text m (Producer ByteString m ()) |
218 | stdin = fromHandle IO.stdin | 213 | stdin = fromHandle IO.stdin |
219 | {-# INLINABLE stdin #-} | 214 | {-# INLINABLE stdin #-} |
220 | 215 | ||
221 | {-| Convert a 'IO.Handle' into a text stream using a text size | 216 | {-| Convert a 'IO.Handle' into a text stream using a text size |
222 | determined by the good sense of the text library. | 217 | determined by the good sense of the text library. |
223 | |||
224 | -} | 218 | -} |
225 | 219 | ||
226 | fromHandle :: MonadIO m => IO.Handle -> Producer' Text m () | 220 | fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ()) |
227 | #if MIN_VERSION_text(0,11,4) | 221 | -- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)` |
228 | fromHandle h = go TE.streamDecodeUtf8 where | 222 | -- if only so that mistakes can be concentrated in one place. |
223 | -- This modifies something that was faster on an earlier iteration. | ||
224 | -- Note also that the `text` replacement system is being ignored; | ||
225 | -- with a replacement scheme one could have `Producer Text m ()` | ||
226 | -- the relation to the replacement business needs to be thought out. | ||
227 | -- The complicated type seems overmuch for the toy stdin above | ||
228 | fromHandle h = go PE.streamDecodeUtf8 B.empty where | ||
229 | act = B.hGetSome h defaultChunkSize | 229 | act = B.hGetSome h defaultChunkSize |
230 | go dec = do chunk <- liftIO act | 230 | go dec old = do chunk <- liftIO act |
231 | case dec chunk of | 231 | if B.null chunk |
232 | TE.Some text _ dec' -> do yield text | 232 | then if B.null old then return (return ()) |
233 | unless (B.null chunk) (go dec') | 233 | else return (yield old >> return ()) |
234 | else case dec chunk of | ||
235 | PE.Some text bs dec' -> | ||
236 | if T.null text then go dec' (B.append old bs) | ||
237 | else do yield text | ||
238 | go dec' B.empty | ||
239 | PE.Other text bs -> | ||
240 | if T.null text then return (do yield old | ||
241 | yield bs | ||
242 | PB.fromHandle h) | ||
243 | else do yield text | ||
244 | return (do yield bs | ||
245 | PB.fromHandle h) | ||
234 | {-# INLINE fromHandle#-} | 246 | {-# INLINE fromHandle#-} |
235 | -- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as | 247 | -- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as |
236 | -- the dedicated Text IO function 'hGetChunk' ; | 248 | -- the dedicated Text IO function 'hGetChunk' ; |
237 | -- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut" | 249 | -- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut" |
238 | -- runs the same as the conduit equivalent, only slightly slower | 250 | -- runs the same as the conduit equivalent, only slightly slower |
239 | -- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut" | 251 | -- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut" |
240 | #else | 252 | -- #else |
241 | fromHandle h = go where | 253 | -- fromHandle h = go where |
242 | go = do txt <- liftIO (T.hGetChunk h) | 254 | -- go = do txt <- liftIO (T.hGetChunk h) |
243 | unless (T.null txt) $ do yield txt | 255 | -- unless (T.null txt) $ do yield txt |
244 | go | 256 | -- go |
245 | {-# INLINABLE fromHandle#-} | 257 | -- {-# INLINABLE fromHandle#-} |
246 | #endif | 258 | -- #endif |
247 | {-| Stream text from a file using Pipes.Safe | 259 | {-| Stream text from a file using Pipes.Safe |
248 | 260 | ||
249 | >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout | 261 | >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout |
250 | MAIN = PUTSTRLN "HELLO WORLD" | 262 | MAIN = PUTSTRLN "HELLO WORLD" |
251 | -} | 263 | -} |
252 | 264 | ||
253 | readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m () | 265 | readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ()) |
254 | readFile file = Safe.withFile file IO.ReadMode fromHandle | 266 | readFile file = Safe.withFile file IO.ReadMode fromHandle |
255 | {-# INLINABLE readFile #-} | 267 | {-# INLINABLE readFile #-} |
256 | 268 | ||
@@ -610,74 +622,44 @@ count :: (Monad m, Num n) => Text -> Producer Text m () -> m n | |||
610 | count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) | 622 | count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) |
611 | {-# INLINABLE count #-} | 623 | {-# INLINABLE count #-} |
612 | 624 | ||
613 | #if MIN_VERSION_text(0,11,4) | ||
614 | -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded | 625 | -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded |
615 | -- into a Pipe of Text | 626 | -- into a Pipe of Text |
616 | decodeUtf8 | 627 | decodeUtf8 |
617 | :: Monad m | 628 | :: Monad m |
618 | => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 629 | => Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
619 | decodeUtf8 = go TE.streamDecodeUtf8 | 630 | decodeUtf8 = decodeUtf8With Nothing |
620 | where go dec p = do | ||
621 | x <- lift (next p) | ||
622 | case x of | ||
623 | Left r -> return (return r) | ||
624 | Right (chunk, p') -> do | ||
625 | let TE.Some text l dec' = dec chunk | ||
626 | if B.null l | ||
627 | then do | ||
628 | yield text | ||
629 | go dec' p' | ||
630 | else return $ do | ||
631 | yield l | ||
632 | p' | ||
633 | {-# INLINEABLE decodeUtf8 #-} | 631 | {-# INLINEABLE decodeUtf8 #-} |
634 | 632 | ||
635 | -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded | 633 | -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded |
636 | -- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@ | 634 | -- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@ |
637 | -- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\" | 635 | -- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\" |
638 | decodeUtf8With | 636 | decodeUtf8With |
639 | :: Monad m | 637 | :: Monad m |
640 | => TE.OnDecodeError | 638 | => Maybe TE.OnDecodeError |
641 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 639 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
642 | decodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) | 640 | decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where |
643 | where go dec p = do | 641 | go dec old p = do |
644 | x <- lift (next p) | 642 | x <- lift (next p) |
645 | case x of | 643 | case x of |
646 | Left r -> return (return r) | 644 | Left r -> if B.null old then return (return r) |
647 | Right (chunk, p') -> do | 645 | else return (do yield old |
648 | let TE.Some text l dec' = dec chunk | 646 | return r) |
649 | if B.null l | 647 | Right (chunk, p') -> |
650 | then do | 648 | case dec chunk of |
651 | yield text | 649 | PE.Some text l dec' -> |
652 | go dec' p' | 650 | if T.null text then go dec' (B.append old l) p' |
653 | else return $ do | 651 | else do yield text |
654 | yield l | 652 | go dec' B.empty p' |
655 | p' | 653 | PE.Other text bs -> |
654 | if T.null text then return (do yield old | ||
655 | yield bs | ||
656 | p') | ||
657 | else do yield text | ||
658 | return (do yield bs | ||
659 | p') | ||
656 | {-# INLINEABLE decodeUtf8With #-} | 660 | {-# INLINEABLE decodeUtf8With #-} |
657 | 661 | ||
658 | -- | A simple pipe from 'ByteString' to 'Text'; a decoding error will arise | 662 | |
659 | -- with any chunk that contains a sequence of bytes that is unreadable. Otherwise | ||
660 | -- only few bytes will only be moved from one chunk to the next before decoding. | ||
661 | pipeDecodeUtf8 :: Monad m => Pipe ByteString Text m r | ||
662 | pipeDecodeUtf8 = go TE.streamDecodeUtf8 | ||
663 | where go dec = do chunk <- await | ||
664 | case dec chunk of | ||
665 | TE.Some text l dec' -> do yield text | ||
666 | go dec' | ||
667 | {-# INLINEABLE pipeDecodeUtf8 #-} | ||
668 | |||
669 | -- | A simple pipe from 'ByteString' to 'Text' using a replacement function. | ||
670 | pipeDecodeUtf8With | ||
671 | :: Monad m | ||
672 | => TE.OnDecodeError | ||
673 | -> Pipe ByteString Text m r | ||
674 | pipeDecodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) | ||
675 | where go dec = do chunk <- await | ||
676 | case dec chunk of | ||
677 | TE.Some text l dec' -> do yield text | ||
678 | go dec' | ||
679 | {-# INLINEABLE pipeDecodeUtf8With #-} | ||
680 | #endif | ||
681 | 663 | ||
682 | -- | Splits a 'Producer' after the given number of characters | 664 | -- | Splits a 'Producer' after the given number of characters |
683 | splitAt | 665 | splitAt |
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs new file mode 100644 index 0000000..05d9887 --- /dev/null +++ b/Pipes/Text/Internal.hs | |||
@@ -0,0 +1,157 @@ | |||
1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, | ||
2 | UnliftedFFITypes #-} | ||
3 | -- This module lifts material from Brian O'Sullivan's text package | ||
4 | -- especially Data.Text.Encoding in order to define a pipes-appropriate | ||
5 | -- streamDecodeUtf8 | ||
6 | module Pipes.Text.Internal | ||
7 | ( Decoding(..) | ||
8 | , streamDecodeUtf8With | ||
9 | , streamDecodeUtf8 | ||
10 | ) where | ||
11 | |||
12 | import Control.Exception (evaluate, try) | ||
13 | #if __GLASGOW_HASKELL__ >= 702 | ||
14 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | ||
15 | import Control.Monad.ST (ST, runST) | ||
16 | #else | ||
17 | import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST) | ||
18 | #endif | ||
19 | import Data.Bits ((.&.)) | ||
20 | import Data.ByteString as B | ||
21 | import Data.ByteString.Internal as B | ||
22 | import Data.Text () | ||
23 | import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) | ||
24 | import Data.Text.Internal (Text(..), safe, textP) | ||
25 | import Data.Word (Word8, Word32) | ||
26 | import Foreign.C.Types (CSize) | ||
27 | import Foreign.ForeignPtr (withForeignPtr) | ||
28 | import Foreign.Marshal.Utils (with) | ||
29 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | ||
30 | import Foreign.Storable (Storable, peek, poke) | ||
31 | import GHC.Base hiding (ord) | ||
32 | import GHC.Word | ||
33 | import qualified Data.Text.Array as A | ||
34 | import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#) | ||
35 | import GHC.Word (Word8(..), Word16(..), Word32(..)) | ||
36 | |||
37 | import Data.Text.Unsafe (unsafeDupablePerformIO) | ||
38 | |||
39 | #include "pipes_text_cbits.h" | ||
40 | |||
41 | -- | A stream oriented decoding result. | ||
42 | data Decoding = Some Text ByteString (ByteString -> Decoding) | ||
43 | | Other Text ByteString | ||
44 | instance Show Decoding where | ||
45 | showsPrec d (Some t bs _) = showParen (d > prec) $ | ||
46 | showString "Some " . showsPrec prec' t . | ||
47 | showChar ' ' . showsPrec prec' bs . | ||
48 | showString " _" | ||
49 | where prec = 10; prec' = prec + 1 | ||
50 | showsPrec d (Other t bs) = showParen (d > prec) $ | ||
51 | showString "Other " . showsPrec prec' t . | ||
52 | showChar ' ' . showsPrec prec' bs . | ||
53 | showString " _" | ||
54 | where prec = 10; prec' = prec + 1 | ||
55 | |||
56 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | ||
57 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | ||
58 | |||
59 | -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 | ||
60 | -- encoded text that is known to be valid. | ||
61 | -- | ||
62 | -- If the input contains any invalid UTF-8 data, an exception will be | ||
63 | -- thrown (either by this function or a continuation) that cannot be | ||
64 | -- caught in pure code. For more control over the handling of invalid | ||
65 | -- data, use 'streamDecodeUtf8With'. | ||
66 | streamDecodeUtf8 :: ByteString -> Decoding | ||
67 | streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode) | ||
68 | |||
69 | -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 | ||
70 | -- encoded text. | ||
71 | streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding | ||
72 | streamDecodeUtf8With mErr = case mErr of | ||
73 | Nothing -> decodeWith False strictDecode | ||
74 | Just onErr -> decodeWith True onErr | ||
75 | where | ||
76 | -- We create a slightly larger than necessary buffer to accommodate a | ||
77 | -- potential surrogate pair started in the last buffer | ||
78 | decodeWith replace onErr = decodeChunk 0 0 | ||
79 | where | ||
80 | decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding | ||
81 | decodeChunk codepoint0 state0 bs@(PS fp off len) = | ||
82 | runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1) | ||
83 | where | ||
84 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | ||
85 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
86 | with (0::CSize) $ \destOffPtr -> | ||
87 | with codepoint0 $ \codepointPtr -> | ||
88 | with state0 $ \statePtr -> | ||
89 | with nullPtr $ \curPtrPtr -> | ||
90 | let end = ptr `plusPtr` (off + len) | ||
91 | loop curPtr = do | ||
92 | poke curPtrPtr curPtr | ||
93 | curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr | ||
94 | curPtrPtr end codepointPtr statePtr | ||
95 | state <- peek statePtr | ||
96 | case state of | ||
97 | UTF8_REJECT -> | ||
98 | -- We encountered an encoding error | ||
99 | if replace | ||
100 | then do | ||
101 | x <- peek curPtr' | ||
102 | case onErr desc (Just x) of | ||
103 | Nothing -> loop $ curPtr' `plusPtr` 1 | ||
104 | Just c -> do | ||
105 | destOff <- peek destOffPtr | ||
106 | w <- unsafeSTToIO $ | ||
107 | unsafeWrite dest (fromIntegral destOff) (safe c) | ||
108 | poke destOffPtr (destOff + fromIntegral w) | ||
109 | poke statePtr 0 | ||
110 | loop $ curPtr' `plusPtr` 1 | ||
111 | else do | ||
112 | n <- peek destOffPtr | ||
113 | chunkText <- unsafeSTToIO $ do | ||
114 | arr <- A.unsafeFreeze dest | ||
115 | return $! textP arr 0 (fromIntegral n) | ||
116 | lastPtr <- peek curPtrPtr | ||
117 | let left = lastPtr `minusPtr` curPtr | ||
118 | return $ Other chunkText (B.drop left bs) | ||
119 | _ -> do | ||
120 | -- We encountered the end of the buffer while decoding | ||
121 | n <- peek destOffPtr | ||
122 | codepoint <- peek codepointPtr | ||
123 | chunkText <- unsafeSTToIO $ do | ||
124 | arr <- A.unsafeFreeze dest | ||
125 | return $! textP arr 0 (fromIntegral n) | ||
126 | lastPtr <- peek curPtrPtr | ||
127 | let left = lastPtr `minusPtr` curPtr | ||
128 | return $ Some chunkText (B.drop left bs) | ||
129 | (decodeChunk codepoint state) | ||
130 | in loop (ptr `plusPtr` off) | ||
131 | desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream" | ||
132 | |||
133 | ord :: Char -> Int | ||
134 | ord (C# c#) = I# (ord# c#) | ||
135 | {-# INLINE ord #-} | ||
136 | |||
137 | |||
138 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int | ||
139 | unsafeWrite marr i c | ||
140 | | n < 0x10000 = do | ||
141 | A.unsafeWrite marr i (fromIntegral n) | ||
142 | return 1 | ||
143 | | otherwise = do | ||
144 | A.unsafeWrite marr i lo | ||
145 | A.unsafeWrite marr (i+1) hi | ||
146 | return 2 | ||
147 | where n = ord c | ||
148 | m = n - 0x10000 | ||
149 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | ||
150 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | ||
151 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | ||
152 | {-# INLINE unsafeWrite #-} | ||
153 | |||
154 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | ||
155 | :: MutableByteArray# s -> Ptr CSize | ||
156 | -> Ptr (Ptr Word8) -> Ptr Word8 | ||
157 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file | ||
diff --git a/cbits/cbits.c b/cbits/cbits.c new file mode 100644 index 0000000..e0fdfd5 --- /dev/null +++ b/cbits/cbits.c | |||
@@ -0,0 +1,151 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>. | ||
3 | * | ||
4 | * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>. | ||
5 | * | ||
6 | * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. | ||
7 | */ | ||
8 | |||
9 | #include <string.h> | ||
10 | #include <stdint.h> | ||
11 | #include <stdio.h> | ||
12 | #include "pipes_text_cbits.h" | ||
13 | |||
14 | |||
15 | |||
16 | #define UTF8_ACCEPT 0 | ||
17 | #define UTF8_REJECT 12 | ||
18 | |||
19 | static const uint8_t utf8d[] = { | ||
20 | /* | ||
21 | * The first part of the table maps bytes to character classes that | ||
22 | * to reduce the size of the transition table and create bitmasks. | ||
23 | */ | ||
24 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
25 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
26 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
27 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
28 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, | ||
29 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, | ||
30 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | ||
31 | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, | ||
32 | |||
33 | /* | ||
34 | * The second part is a transition table that maps a combination of | ||
35 | * a state of the automaton and a character class to a state. | ||
36 | */ | ||
37 | 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, | ||
38 | 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, | ||
39 | 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, | ||
40 | 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, | ||
41 | 12,36,12,12,12,12,12,12,12,12,12,12, | ||
42 | }; | ||
43 | |||
44 | static inline uint32_t | ||
45 | decode(uint32_t *state, uint32_t* codep, uint32_t byte) { | ||
46 | uint32_t type = utf8d[byte]; | ||
47 | |||
48 | *codep = (*state != UTF8_ACCEPT) ? | ||
49 | (byte & 0x3fu) | (*codep << 6) : | ||
50 | (0xff >> type) & (byte); | ||
51 | |||
52 | return *state = utf8d[256 + *state + type]; | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * A best-effort decoder. Runs until it hits either end of input or | ||
57 | * the start of an invalid byte sequence. | ||
58 | * | ||
59 | * At exit, we update *destoff with the next offset to write to, *src | ||
60 | * with the next source location past the last one successfully | ||
61 | * decoded, and return the next source location to read from. | ||
62 | * | ||
63 | * Moreover, we expose the internal decoder state (state0 and | ||
64 | * codepoint0), allowing one to restart the decoder after it | ||
65 | * terminates (say, due to a partial codepoint). | ||
66 | * | ||
67 | * In particular, there are a few possible outcomes, | ||
68 | * | ||
69 | * 1) We decoded the buffer entirely: | ||
70 | * In this case we return srcend | ||
71 | * state0 == UTF8_ACCEPT | ||
72 | * | ||
73 | * 2) We met an invalid encoding | ||
74 | * In this case we return the address of the first invalid byte | ||
75 | * state0 == UTF8_REJECT | ||
76 | * | ||
77 | * 3) We reached the end of the buffer while decoding a codepoint | ||
78 | * In this case we return a pointer to the first byte of the partial codepoint | ||
79 | * state0 != UTF8_ACCEPT, UTF8_REJECT | ||
80 | * | ||
81 | */ | ||
82 | const uint8_t * | ||
83 | _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, | ||
84 | const uint8_t **const src, | ||
85 | const uint8_t *const srcend, | ||
86 | uint32_t *codepoint0, uint32_t *state0) | ||
87 | { | ||
88 | uint16_t *d = dest + *destoff; | ||
89 | const uint8_t *s = *src, *last = *src; | ||
90 | uint32_t state = *state0; | ||
91 | uint32_t codepoint = *codepoint0; | ||
92 | |||
93 | while (s < srcend) { | ||
94 | #if defined(__i386__) || defined(__x86_64__) | ||
95 | /* | ||
96 | * This code will only work on a little-endian system that | ||
97 | * supports unaligned loads. | ||
98 | * | ||
99 | * It gives a substantial speed win on data that is purely or | ||
100 | * partly ASCII (e.g. HTML), at only a slight cost on purely | ||
101 | * non-ASCII text. | ||
102 | */ | ||
103 | |||
104 | if (state == UTF8_ACCEPT) { | ||
105 | while (s < srcend - 4) { | ||
106 | codepoint = *((uint32_t *) s); | ||
107 | if ((codepoint & 0x80808080) != 0) | ||
108 | break; | ||
109 | s += 4; | ||
110 | |||
111 | /* | ||
112 | * Tried 32-bit stores here, but the extra bit-twiddling | ||
113 | * slowed the code down. | ||
114 | */ | ||
115 | |||
116 | *d++ = (uint16_t) (codepoint & 0xff); | ||
117 | *d++ = (uint16_t) ((codepoint >> 8) & 0xff); | ||
118 | *d++ = (uint16_t) ((codepoint >> 16) & 0xff); | ||
119 | *d++ = (uint16_t) ((codepoint >> 24) & 0xff); | ||
120 | } | ||
121 | last = s; | ||
122 | } | ||
123 | #endif | ||
124 | |||
125 | if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { | ||
126 | if (state != UTF8_REJECT) | ||
127 | continue; | ||
128 | break; | ||
129 | } | ||
130 | |||
131 | if (codepoint <= 0xffff) | ||
132 | *d++ = (uint16_t) codepoint; | ||
133 | else { | ||
134 | *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); | ||
135 | *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); | ||
136 | } | ||
137 | last = s; | ||
138 | } | ||
139 | |||
140 | /* Invalid encoding, back up to the errant character */ | ||
141 | if (state == UTF8_REJECT) | ||
142 | s -= 1; | ||
143 | |||
144 | *destoff = d - dest; | ||
145 | *codepoint0 = codepoint; | ||
146 | *state0 = state; | ||
147 | *src = last; | ||
148 | |||
149 | return s; | ||
150 | } | ||
151 | |||
diff --git a/include/pipes_text_cbits.h b/include/pipes_text_cbits.h new file mode 100644 index 0000000..b9ab670 --- /dev/null +++ b/include/pipes_text_cbits.h | |||
@@ -0,0 +1,11 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>. | ||
3 | */ | ||
4 | |||
5 | #ifndef _pipes_text_cbits_h | ||
6 | #define _pipes_text_cbits_h | ||
7 | |||
8 | #define UTF8_ACCEPT 0 | ||
9 | #define UTF8_REJECT 12 | ||
10 | |||
11 | #endif | ||
diff --git a/pipes-text.cabal b/pipes-text.cabal index e79f168..86fbab8 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal | |||
@@ -12,7 +12,9 @@ build-type: Simple | |||
12 | cabal-version: >=1.10 | 12 | cabal-version: >=1.10 |
13 | 13 | ||
14 | library | 14 | library |
15 | exposed-modules: Pipes.Text, Pipes.Text.Parse | 15 | c-sources: cbits/cbits.c |
16 | include-dirs: include | ||
17 | exposed-modules: Pipes.Text, Pipes.Text.Parse, Pipes.Text.Internal | ||
16 | -- other-modules: | 18 | -- other-modules: |
17 | other-extensions: RankNTypes | 19 | other-extensions: RankNTypes |
18 | build-depends: base >= 4 && < 5 , | 20 | build-depends: base >= 4 && < 5 , |
diff --git a/test/Test.hs b/test/Test.hs new file mode 100644 index 0000000..1579f2b --- /dev/null +++ b/test/Test.hs | |||
@@ -0,0 +1,60 @@ | |||
1 | import Utils | ||
2 | |||
3 | import Test.QuickCheck | ||
4 | import Test.QuickCheck.Monadic | ||
5 | import Test.Framework (Test, testGroup, defaultMain) | ||
6 | import Test.Framework.Providers.QuickCheck2 (testProperty) | ||
7 | |||
8 | import Control.Exception (catch) | ||
9 | import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord) | ||
10 | import Data.Monoid (Monoid(..)) | ||
11 | import Data.String (fromString) | ||
12 | import Data.Text.Encoding.Error | ||
13 | import qualified Data.List as L | ||
14 | |||
15 | import qualified Data.Bits as Bits (shiftL, shiftR) | ||
16 | import qualified Data.ByteString as B | ||
17 | import qualified Data.ByteString.Lazy as BL | ||
18 | import qualified Data.Text as T | ||
19 | import qualified Data.Text.Lazy as TL | ||
20 | import qualified Data.Text.Encoding as E | ||
21 | import qualified Pipes.Text.Internal as PE | ||
22 | |||
23 | main :: IO () | ||
24 | main = defaultMain [tests] | ||
25 | -- >>> :main -a 10000 | ||
26 | |||
27 | tests = testGroup "stream_decode" [ | ||
28 | |||
29 | testProperty "t_utf8_incr_valid" t_utf8_incr_valid, | ||
30 | testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed] | ||
31 | |||
32 | t_utf8_incr_valid = do | ||
33 | Positive n <- arbitrary | ||
34 | forAll genUnicode $ recode n `eq` id | ||
35 | where recode n = T.concat . feedChunksOf n PE.streamDecodeUtf8 . E.encodeUtf8 | ||
36 | feedChunksOf :: Int -> (B.ByteString -> PE.Decoding) -> B.ByteString | ||
37 | -> [T.Text] | ||
38 | feedChunksOf n f bs | ||
39 | | B.null bs = [] | ||
40 | | otherwise = let (a,b) = B.splitAt n bs | ||
41 | PE.Some t _ f' = f a | ||
42 | in case f a of | ||
43 | PE.Some t _ f' -> t : feedChunksOf n f' b | ||
44 | _ -> [] | ||
45 | |||
46 | t_utf8_incr_mixed = do | ||
47 | Positive n <- arbitrary | ||
48 | txt <- genUnicode | ||
49 | forAll (vector 9) $ (roundtrip . chunk (mod n 7 + 1) . appendBytes txt) `eq` appendBytes txt | ||
50 | where | ||
51 | roundtrip :: [B.ByteString] -> B.ByteString | ||
52 | roundtrip bss = go (PE.streamDecodeUtf8With Nothing) B.empty B.empty bss where | ||
53 | go dec acc old [] = acc <> old | ||
54 | go dec acc old (bs:bss) = case dec bs of | ||
55 | PE.Some t new dec' -> if T.null t then go dec' (acc <> E.encodeUtf8 t) (old <> new) bss | ||
56 | else go dec' (acc <> E.encodeUtf8 t) new bss | ||
57 | PE.Other t bs' -> if T.null t then acc <> old <> bs <> B.concat bss | ||
58 | else acc <> E.encodeUtf8 t <> bs' <> B.concat bss | ||
59 | chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b | ||
60 | appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append | ||
diff --git a/test/Utils.hs b/test/Utils.hs new file mode 100644 index 0000000..75cd1db --- /dev/null +++ b/test/Utils.hs | |||
@@ -0,0 +1,109 @@ | |||
1 | {-#LANGUAGE ScopedTypeVariables#-} | ||
2 | module Utils where | ||
3 | import Control.Exception (SomeException, bracket, bracket_, evaluate, try) | ||
4 | import System.IO.Unsafe (unsafePerformIO) | ||
5 | import Debug.Trace (trace) | ||
6 | import Data.Bits ((.&.)) | ||
7 | import Data.Char (chr) | ||
8 | import Data.String (IsString, fromString) | ||
9 | import System.Random (Random (..), RandomGen) | ||
10 | import Test.QuickCheck hiding ((.&.)) | ||
11 | import Test.QuickCheck.Monadic (assert, monadicIO, run) | ||
12 | import qualified Data.ByteString as B | ||
13 | import Pipes.Text.Internal | ||
14 | |||
15 | |||
16 | |||
17 | |||
18 | |||
19 | -- Ensure that two potentially bottom values (in the sense of crashing | ||
20 | -- for some inputs, not looping infinitely) either both crash, or both | ||
21 | -- give comparable results for some input. | ||
22 | (=^=) :: (Eq a, Show a) => a -> a -> Bool | ||
23 | i =^= j = unsafePerformIO $ do | ||
24 | x <- try (evaluate i) | ||
25 | y <- try (evaluate j) | ||
26 | case (x,y) of | ||
27 | (Left (_ :: SomeException), Left (_ :: SomeException)) | ||
28 | -> return True | ||
29 | (Right a, Right b) -> return (a == b) | ||
30 | e -> trace ("*** Divergence: " ++ show e) return False | ||
31 | infix 4 =^= | ||
32 | {-# NOINLINE (=^=) #-} | ||
33 | |||
34 | -- Do two functions give the same answer? | ||
35 | eq :: (Eq a, Show a) => (t -> a) -> (t -> a) -> t -> Bool | ||
36 | eq a b s = a s =^= b s | ||
37 | |||
38 | -- What about with the RHS packed? | ||
39 | -- eqP :: (Eq a, Show a, Stringy s) => | ||
40 | -- (String -> a) -> (s -> a) -> String -> Word8 -> Bool | ||
41 | -- eqP f g s w = eql "orig" (f s) (g t) && | ||
42 | -- eql "mini" (f s) (g mini) && | ||
43 | -- eql "head" (f sa) (g ta) && | ||
44 | -- eql "tail" (f sb) (g tb) | ||
45 | -- where t = packS s | ||
46 | -- mini = packSChunkSize 10 s | ||
47 | -- (sa,sb) = splitAt m s | ||
48 | -- (ta,tb) = splitAtS m t | ||
49 | -- l = length s | ||
50 | -- m | l == 0 = n | ||
51 | -- | otherwise = n `mod` l | ||
52 | -- n = fromIntegral w | ||
53 | -- eql d a b | ||
54 | -- | a =^= b = True | ||
55 | -- | otherwise = trace (d ++ ": " ++ show a ++ " /= " ++ show b) False | ||
56 | |||
57 | |||
58 | instance Arbitrary B.ByteString where | ||
59 | arbitrary = B.pack `fmap` arbitrary | ||
60 | |||
61 | genUnicode :: IsString a => Gen a | ||
62 | genUnicode = fmap fromString string where | ||
63 | string = sized $ \n -> | ||
64 | do k <- choose (0,n) | ||
65 | sequence [ char | _ <- [1..k] ] | ||
66 | |||
67 | excluding :: [a -> Bool] -> Gen a -> Gen a | ||
68 | excluding bad gen = loop | ||
69 | where | ||
70 | loop = do | ||
71 | x <- gen | ||
72 | if or (map ($ x) bad) | ||
73 | then loop | ||
74 | else return x | ||
75 | |||
76 | reserved = [lowSurrogate, highSurrogate, noncharacter] | ||
77 | lowSurrogate c = c >= 0xDC00 && c <= 0xDFFF | ||
78 | highSurrogate c = c >= 0xD800 && c <= 0xDBFF | ||
79 | noncharacter c = masked == 0xFFFE || masked == 0xFFFF | ||
80 | where | ||
81 | masked = c .&. 0xFFFF | ||
82 | |||
83 | ascii = choose (0,0x7F) | ||
84 | plane0 = choose (0xF0, 0xFFFF) | ||
85 | plane1 = oneof [ choose (0x10000, 0x10FFF) | ||
86 | , choose (0x11000, 0x11FFF) | ||
87 | , choose (0x12000, 0x12FFF) | ||
88 | , choose (0x13000, 0x13FFF) | ||
89 | , choose (0x1D000, 0x1DFFF) | ||
90 | , choose (0x1F000, 0x1FFFF) | ||
91 | ] | ||
92 | plane2 = oneof [ choose (0x20000, 0x20FFF) | ||
93 | , choose (0x21000, 0x21FFF) | ||
94 | , choose (0x22000, 0x22FFF) | ||
95 | , choose (0x23000, 0x23FFF) | ||
96 | , choose (0x24000, 0x24FFF) | ||
97 | , choose (0x25000, 0x25FFF) | ||
98 | , choose (0x26000, 0x26FFF) | ||
99 | , choose (0x27000, 0x27FFF) | ||
100 | , choose (0x28000, 0x28FFF) | ||
101 | , choose (0x29000, 0x29FFF) | ||
102 | , choose (0x2A000, 0x2AFFF) | ||
103 | , choose (0x2B000, 0x2BFFF) | ||
104 | , choose (0x2F000, 0x2FFFF) | ||
105 | ] | ||
106 | plane14 = choose (0xE0000, 0xE0FFF) | ||
107 | planes = [ascii, plane0, plane1, plane2, plane14] | ||
108 | |||
109 | char = chr `fmap` excluding reserved (oneof planes) | ||