diff options
author | michaelt <what_is_it_to_do_anything@yahoo.com> | 2014-01-14 22:05:12 -0500 |
---|---|---|
committer | michaelt <what_is_it_to_do_anything@yahoo.com> | 2014-01-14 22:05:12 -0500 |
commit | 3694350ac7b9c42fd64e0092a74cf0471a080058 (patch) | |
tree | 38ddadda59a3808422fc432d37b886c456adcb1d | |
parent | cd4fd5dd5405ad8e324f43ee2bc81822bdece16c (diff) | |
download | text-pipes-3694350ac7b9c42fd64e0092a74cf0471a080058.tar.gz text-pipes-3694350ac7b9c42fd64e0092a74cf0471a080058.tar.zst text-pipes-3694350ac7b9c42fd64e0092a74cf0471a080058.zip |
Use clunky Data.Text.IO when bytestring is not explicit
-rw-r--r-- | Pipes/Text.hs | 37 | ||||
-rw-r--r-- | Pipes/Text/Internal.hs | 126 | ||||
-rw-r--r-- | bench/IO.hs | 20 | ||||
-rw-r--r-- | cbits/cbits.c | 113 | ||||
-rw-r--r-- | pipes-text.cabal | 7 | ||||
-rw-r--r-- | test/Test.hs | 17 |
6 files changed, 215 insertions, 105 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 74d2023..cf493e9 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs | |||
@@ -1,4 +1,4 @@ | |||
1 | {-# LANGUAGE RankNTypes, TypeFamilies #-} | 1 | {-# LANGUAGE RankNTypes, TypeFamilies, BangPatterns #-} |
2 | 2 | ||
3 | {-| This module provides @pipes@ utilities for \"text streams\", which are | 3 | {-| This module provides @pipes@ utilities for \"text streams\", which are |
4 | streams of 'Text' chunks. The individual chunks are uniformly @strict@, but | 4 | streams of 'Text' chunks. The individual chunks are uniformly @strict@, but |
@@ -206,30 +206,36 @@ import Prelude hiding ( | |||
206 | -- | Convert a lazy 'TL.Text' into a 'Producer' of strict 'Text's | 206 | -- | Convert a lazy 'TL.Text' into a 'Producer' of strict 'Text's |
207 | fromLazy :: (Monad m) => TL.Text -> Producer' Text m () | 207 | fromLazy :: (Monad m) => TL.Text -> Producer' Text m () |
208 | fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) | 208 | fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) |
209 | {-# INLINABLE fromLazy #-} | 209 | {-# INLINE fromLazy #-} |
210 | 210 | ||
211 | -- | Stream text from 'stdin' | 211 | -- | Stream text from 'stdin' |
212 | stdin :: MonadIO m => Producer Text m (Producer ByteString m ()) | 212 | stdin :: MonadIO m => Producer Text m () |
213 | stdin = fromHandle IO.stdin | 213 | stdin = fromHandle IO.stdin |
214 | {-# INLINABLE stdin #-} | 214 | {-# INLINE stdin #-} |
215 | 215 | ||
216 | {-| Convert a 'IO.Handle' into a text stream using a text size | 216 | {-| Convert a 'IO.Handle' into a text stream using a text size |
217 | determined by the good sense of the text library. | 217 | determined by the good sense of the text library; note that this |
218 | is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@ | ||
219 | but uses the system encoding and has other `Data.Text.IO` features | ||
218 | -} | 220 | -} |
219 | 221 | ||
220 | fromHandle :: MonadIO m => IO.Handle -> Producer Text m (Producer ByteString m ()) | 222 | fromHandle :: MonadIO m => IO.Handle -> Producer Text m () |
221 | fromHandle h = decodeUtf8 (PB.fromHandle h) | 223 | fromHandle h = go where |
222 | {-# INLINE fromHandle#-} | 224 | go = do txt <- liftIO (T.hGetChunk h) |
225 | unless (T.null txt) $ do yield txt | ||
226 | go | ||
227 | {-# INLINABLE fromHandle#-} | ||
223 | 228 | ||
224 | {-| Stream text from a file using Pipes.Safe | 229 | |
230 | {-| Stream text from a file in the simple fashion of @Data.Text.IO@ | ||
225 | 231 | ||
226 | >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout | 232 | >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout |
227 | MAIN = PUTSTRLN "HELLO WORLD" | 233 | MAIN = PUTSTRLN "HELLO WORLD" |
228 | -} | 234 | -} |
229 | 235 | ||
230 | readFile :: (MonadSafe m) => FilePath -> Producer Text m (Producer ByteString m ()) | 236 | readFile :: MonadSafe m => FilePath -> Producer Text m () |
231 | readFile file = Safe.withFile file IO.ReadMode fromHandle | 237 | readFile file = Safe.withFile file IO.ReadMode fromHandle |
232 | {-# INLINABLE readFile #-} | 238 | {-# INLINE readFile #-} |
233 | 239 | ||
234 | {-| Stream lines of text from stdin (for testing in ghci etc.) | 240 | {-| Stream lines of text from stdin (for testing in ghci etc.) |
235 | 241 | ||
@@ -249,7 +255,7 @@ stdinLn = go where | |||
249 | txt <- liftIO (T.hGetLine IO.stdin) | 255 | txt <- liftIO (T.hGetLine IO.stdin) |
250 | yield txt | 256 | yield txt |
251 | go | 257 | go |
252 | 258 | {-# INLINABLE stdinLn #-} | |
253 | 259 | ||
254 | {-| Stream text to 'stdout' | 260 | {-| Stream text to 'stdout' |
255 | 261 | ||
@@ -305,6 +311,7 @@ toHandle h = for cat (liftIO . T.hPutStr h) | |||
305 | -- | Stream text into a file. Uses @pipes-safe@. | 311 | -- | Stream text into a file. Uses @pipes-safe@. |
306 | writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m () | 312 | writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m () |
307 | writeFile file = Safe.withFile file IO.WriteMode toHandle | 313 | writeFile file = Safe.withFile file IO.WriteMode toHandle |
314 | {-# INLINE writeFile #-} | ||
308 | 315 | ||
309 | -- | Apply a transformation to each 'Char' in the stream | 316 | -- | Apply a transformation to each 'Char' in the stream |
310 | map :: (Monad m) => (Char -> Char) -> Pipe Text Text m r | 317 | map :: (Monad m) => (Char -> Char) -> Pipe Text Text m r |
@@ -592,10 +599,10 @@ count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) | |||
592 | 599 | ||
593 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 600 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
594 | decodeUtf8 = go B.empty PE.streamDecodeUtf8 where | 601 | decodeUtf8 = go B.empty PE.streamDecodeUtf8 where |
595 | go carry dec0 p = do | 602 | go !carry dec0 p = do |
596 | x <- lift (next p) | 603 | x <- lift (next p) |
597 | case x of Left r -> if B.null carry | 604 | case x of Left r -> if B.null carry |
598 | then return (return r) -- all input was consumed | 605 | then return (return r) -- all bytestrinput was consumed |
599 | else return (do yield carry -- a potentially valid fragment remains | 606 | else return (do yield carry -- a potentially valid fragment remains |
600 | return r) | 607 | return r) |
601 | 608 | ||
@@ -605,6 +612,8 @@ decodeUtf8 = go B.empty PE.streamDecodeUtf8 where | |||
605 | PE.Other text bs -> do yield text | 612 | PE.Other text bs -> do yield text |
606 | return (do yield bs -- an invalid blob remains | 613 | return (do yield bs -- an invalid blob remains |
607 | p') | 614 | p') |
615 | {-# INLINABLE decodeUtf8 #-} | ||
616 | |||
608 | 617 | ||
609 | -- | Splits a 'Producer' after the given number of characters | 618 | -- | Splits a 'Producer' after the given number of characters |
610 | splitAt | 619 | splitAt |
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index 73d6fa4..7e5b044 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs | |||
@@ -6,36 +6,25 @@ | |||
6 | module Pipes.Text.Internal | 6 | module Pipes.Text.Internal |
7 | ( Decoding(..) | 7 | ( Decoding(..) |
8 | , streamDecodeUtf8 | 8 | , streamDecodeUtf8 |
9 | , decodeSomeUtf8 | ||
9 | ) where | 10 | ) where |
10 | |||
11 | import Control.Exception (evaluate, try) | ||
12 | #if __GLASGOW_HASKELL__ >= 702 | ||
13 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | 11 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) |
14 | import Control.Monad.ST (ST, runST) | 12 | import Control.Monad.ST (ST, runST) |
15 | #else | ||
16 | import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST) | ||
17 | #endif | ||
18 | import Data.Bits ((.&.)) | 13 | import Data.Bits ((.&.)) |
19 | import Data.ByteString as B | 14 | import Data.ByteString as B |
20 | import Data.ByteString.Internal as B | 15 | import Data.ByteString.Internal as B |
21 | import Data.Text () | 16 | import qualified Data.Text as T (null) |
22 | import qualified Data.Text as T | 17 | import Data.Text.Encoding.Error () |
23 | import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) | 18 | import Data.Text.Internal (Text, textP) |
24 | import Data.Text.Internal (Text(..), safe, textP) | ||
25 | import Data.Word (Word8, Word32) | ||
26 | import Foreign.C.Types (CSize) | 19 | import Foreign.C.Types (CSize) |
27 | import Foreign.ForeignPtr (withForeignPtr) | 20 | import Foreign.ForeignPtr (withForeignPtr) |
28 | import Foreign.Marshal.Utils (with) | 21 | import Foreign.Marshal.Utils (with) |
29 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | 22 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) |
30 | import Foreign.Storable (Storable, peek, poke) | 23 | import Foreign.Storable (Storable, peek, poke) |
31 | import GHC.Base hiding (ord) | 24 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) |
32 | import GHC.Word | 25 | import GHC.Word (Word8, Word32) |
33 | import qualified Data.Text.Array as A | 26 | import qualified Data.Text.Array as A |
34 | import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#) | 27 | |
35 | import GHC.Word (Word8(..), Word16(..), Word32(..)) | ||
36 | |||
37 | import Data.Text.Unsafe (unsafeDupablePerformIO) | ||
38 | |||
39 | #include "pipes_text_cbits.h" | 28 | #include "pipes_text_cbits.h" |
40 | 29 | ||
41 | -- | A stream oriented decoding result. | 30 | -- | A stream oriented decoding result. |
@@ -52,44 +41,102 @@ instance Show Decoding where | |||
52 | showChar ' ' . showsPrec prec' bs . | 41 | showChar ' ' . showsPrec prec' bs . |
53 | showString " _" | 42 | showString " _" |
54 | where prec = 10; prec' = prec + 1 | 43 | where prec = 10; prec' = prec + 1 |
55 | 44 | ||
56 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | 45 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) |
57 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | 46 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) |
58 | 47 | ||
59 | streamDecodeUtf8 :: ByteString -> Decoding | 48 | streamDecodeUtf8 :: ByteString -> Decoding |
60 | streamDecodeUtf8 = decodeChunk B.empty 0 0 | 49 | streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 |
50 | where | ||
51 | decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding | ||
52 | decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = | ||
53 | runST $ do marray <- A.new (len+1) | ||
54 | unsafeIOToST (decodeChunkToBuffer marray) | ||
55 | where | ||
56 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | ||
57 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
58 | with (0::CSize) $ \destOffPtr -> | ||
59 | with codepoint0 $ \codepointPtr -> | ||
60 | with state0 $ \statePtr -> | ||
61 | with nullPtr $ \curPtrPtr -> | ||
62 | do let end = ptr `plusPtr` (off + len) | ||
63 | curPtr = ptr `plusPtr` off | ||
64 | poke curPtrPtr curPtr | ||
65 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | ||
66 | state <- peek statePtr | ||
67 | lastPtr <- peek curPtrPtr | ||
68 | codepoint <- peek codepointPtr | ||
69 | n <- peek destOffPtr | ||
70 | chunkText <- mkText dest n | ||
71 | let left = lastPtr `minusPtr` curPtr | ||
72 | remaining = B.drop left bs | ||
73 | accum = if T.null chunkText then B.append old remaining else remaining | ||
74 | return $! case state of | ||
75 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error | ||
76 | _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) | ||
77 | {-# INLINE decodeChunkToBuffer #-} | ||
78 | {-# INLINE decodeChunkUtf8 #-} | ||
79 | {-# INLINE streamDecodeUtf8 #-} | ||
61 | 80 | ||
62 | decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding | 81 | decodeSomeUtf8 :: ByteString -> (Text, ByteString) |
63 | decodeChunk old codepoint0 state0 bs@(PS fp off len) = | 82 | decodeSomeUtf8 bs@(PS fp off len) = runST $ do |
64 | runST $ do marray <- A.new (len+1) | 83 | dest <- A.new (len+1) |
65 | unsafeIOToST (decodeChunkToBuffer marray) | 84 | unsafeIOToST $ |
66 | where | 85 | withForeignPtr fp $ \ptr -> |
67 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | 86 | with (0::CSize) $ \destOffPtr -> |
68 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | 87 | with (0::CodePoint) $ \codepointPtr -> |
69 | with (0::CSize) $ \destOffPtr -> | 88 | with (0::DecoderState) $ \statePtr -> |
70 | with codepoint0 $ \codepointPtr -> | 89 | with nullPtr $ \curPtrPtr -> |
71 | with state0 $ \statePtr -> | ||
72 | with nullPtr $ \curPtrPtr -> | ||
73 | do let end = ptr `plusPtr` (off + len) | 90 | do let end = ptr `plusPtr` (off + len) |
74 | curPtr = ptr `plusPtr` off | 91 | curPtr = ptr `plusPtr` off |
75 | poke curPtrPtr curPtr | 92 | poke curPtrPtr curPtr |
76 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | 93 | c_decode_utf8_with_state (A.maBA dest) destOffPtr |
94 | curPtrPtr end codepointPtr statePtr | ||
77 | state <- peek statePtr | 95 | state <- peek statePtr |
78 | lastPtr <- peek curPtrPtr | 96 | lastPtr <- peek curPtrPtr |
79 | codepoint <- peek codepointPtr | 97 | codepoint <- peek codepointPtr |
80 | n <- peek destOffPtr | 98 | n <- peek destOffPtr |
81 | chunkText <- mkText dest n | 99 | chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest |
100 | return $! textP arr 0 (fromIntegral n) | ||
82 | let left = lastPtr `minusPtr` curPtr | 101 | let left = lastPtr `minusPtr` curPtr |
83 | remaining = B.drop left bs | 102 | remaining = B.drop left bs |
84 | accum = if T.null chunkText then B.append old remaining else remaining | 103 | return $! (chunkText, remaining) |
85 | return $ case state of | 104 | {-# INLINE decodeSomeUtf8 #-} |
86 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error | 105 | |
87 | _ -> Some chunkText accum (decodeChunk accum codepoint state) | 106 | -- decodeSomeUtf8 :: ByteString -> (Text, ByteString) |
107 | -- decodeSomeUtf8 bs@(PS fp off len) = | ||
108 | -- runST $ do marray <- A.new (len+1) | ||
109 | -- unsafeIOToST (decodeChunkToBuffer marray) | ||
110 | -- | ||
111 | -- where | ||
112 | -- decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString) | ||
113 | -- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
114 | -- with (0::CSize) $ \destOffPtr -> | ||
115 | -- with (0::CodePoint) $ \codepointPtr -> | ||
116 | -- with (0::DecoderState) $ \statePtr -> | ||
117 | -- with nullPtr $ \curPtrPtr -> | ||
118 | -- do let end = ptr `plusPtr` (off + len) | ||
119 | -- curPtr = ptr `plusPtr` off | ||
120 | -- poke curPtrPtr curPtr | ||
121 | -- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | ||
122 | -- state <- peek statePtr | ||
123 | -- lastPtr <- peek curPtrPtr | ||
124 | -- codepoint <- peek codepointPtr | ||
125 | -- n <- peek destOffPtr | ||
126 | -- chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
127 | -- return $! textP arr 0 (fromIntegral n) | ||
128 | -- let left = lastPtr `minusPtr` curPtr | ||
129 | -- remaining = B.drop left bs | ||
130 | -- return $! (chunkText, remaining) | ||
131 | -- {-# INLINE decodeChunkToBuffer #-} | ||
132 | -- {-# INLINE decodeSomeUtf8 #-} | ||
133 | |||
88 | 134 | ||
89 | 135 | ||
90 | mkText :: A.MArray s -> CSize -> IO Text | 136 | mkText :: A.MArray s -> CSize -> IO Text |
91 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | 137 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest |
92 | return $! textP arr 0 (fromIntegral n) | 138 | return $! textP arr 0 (fromIntegral n) |
139 | {-# INLINE mkText #-} | ||
93 | 140 | ||
94 | ord :: Char -> Int | 141 | ord :: Char -> Int |
95 | ord (C# c#) = I# (ord# c#) | 142 | ord (C# c#) = I# (ord# c#) |
@@ -107,6 +154,7 @@ unsafeWrite marr i c | |||
107 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | 154 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 |
108 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | 155 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 |
109 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | 156 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) |
157 | {-# INLINE shiftR #-} | ||
110 | {-# INLINE unsafeWrite #-} | 158 | {-# INLINE unsafeWrite #-} |
111 | 159 | ||
112 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | 160 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state |
diff --git a/bench/IO.hs b/bench/IO.hs new file mode 100644 index 0000000..b3a52f6 --- /dev/null +++ b/bench/IO.hs | |||
@@ -0,0 +1,20 @@ | |||
1 | import qualified Data.Text.IO as T | ||
2 | import qualified Data.Text as T | ||
3 | import qualified Data.Text.Lazy.IO as TL | ||
4 | import qualified Data.Text.Lazy as TL | ||
5 | |||
6 | import Pipes | ||
7 | import qualified Pipes.Text as TP | ||
8 | import qualified Pipes.ByteString as BP | ||
9 | import Pipes.Safe | ||
10 | |||
11 | main = textaction | ||
12 | big = "../../examples/txt/words2.txt" | ||
13 | |||
14 | textaction = T.readFile big >>= T.putStrLn | ||
15 | pipeaction = runEffect $ for ((TP.readFile big) >> return ()) (lift . T.putStrLn) | ||
16 | |||
17 | |||
18 | |||
19 | |||
20 | |||
diff --git a/cbits/cbits.c b/cbits/cbits.c index e0fdfd5..c11645b 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c | |||
@@ -79,30 +79,38 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) { | |||
79 | * state0 != UTF8_ACCEPT, UTF8_REJECT | 79 | * state0 != UTF8_ACCEPT, UTF8_REJECT |
80 | * | 80 | * |
81 | */ | 81 | */ |
82 | const uint8_t * | 82 | |
83 | _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, | 83 | #if defined(__GNUC__) || defined(__clang__) |
84 | const uint8_t **const src, | 84 | static inline uint8_t const * |
85 | const uint8_t *const srcend, | 85 | _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, |
86 | uint32_t *codepoint0, uint32_t *state0) | 86 | const uint8_t const **src, const uint8_t const *srcend, |
87 | uint32_t *codepoint0, uint32_t *state0) | ||
88 | __attribute((always_inline)); | ||
89 | #endif | ||
90 | |||
91 | static inline uint8_t const * | ||
92 | _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, | ||
93 | const uint8_t const **src, const uint8_t const *srcend, | ||
94 | uint32_t *codepoint0, uint32_t *state0) | ||
87 | { | 95 | { |
88 | uint16_t *d = dest + *destoff; | 96 | uint16_t *d = dest + *destoff; |
89 | const uint8_t *s = *src, *last = *src; | 97 | const uint8_t *s = *src, *last = *src; |
90 | uint32_t state = *state0; | 98 | uint32_t state = *state0; |
91 | uint32_t codepoint = *codepoint0; | 99 | uint32_t codepoint = *codepoint0; |
92 | 100 | ||
93 | while (s < srcend) { | 101 | while (s < srcend) { |
94 | #if defined(__i386__) || defined(__x86_64__) | 102 | #if defined(__i386__) || defined(__x86_64__) |
95 | /* | 103 | /* |
96 | * This code will only work on a little-endian system that | 104 | * This code will only work on a little-endian system that |
97 | * supports unaligned loads. | 105 | * supports unaligned loads. |
98 | * | 106 | * |
99 | * It gives a substantial speed win on data that is purely or | 107 | * It gives a substantial speed win on data that is purely or |
100 | * partly ASCII (e.g. HTML), at only a slight cost on purely | 108 | * partly ASCII (e.g. HTML), at only a slight cost on purely |
101 | * non-ASCII text. | 109 | * non-ASCII text. |
102 | */ | 110 | */ |
103 | 111 | ||
104 | if (state == UTF8_ACCEPT) { | 112 | if (state == UTF8_ACCEPT) { |
105 | while (s < srcend - 4) { | 113 | while (s < srcend - 4) { |
106 | codepoint = *((uint32_t *) s); | 114 | codepoint = *((uint32_t *) s); |
107 | if ((codepoint & 0x80808080) != 0) | 115 | if ((codepoint & 0x80808080) != 0) |
108 | break; | 116 | break; |
@@ -117,35 +125,44 @@ _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, | |||
117 | *d++ = (uint16_t) ((codepoint >> 8) & 0xff); | 125 | *d++ = (uint16_t) ((codepoint >> 8) & 0xff); |
118 | *d++ = (uint16_t) ((codepoint >> 16) & 0xff); | 126 | *d++ = (uint16_t) ((codepoint >> 16) & 0xff); |
119 | *d++ = (uint16_t) ((codepoint >> 24) & 0xff); | 127 | *d++ = (uint16_t) ((codepoint >> 24) & 0xff); |
120 | } | 128 | } |
121 | last = s; | 129 | last = s; |
122 | } | 130 | } |
123 | #endif | 131 | #endif |
124 | 132 | ||
125 | if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { | 133 | if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { |
126 | if (state != UTF8_REJECT) | 134 | if (state != UTF8_REJECT) |
127 | continue; | 135 | continue; |
128 | break; | 136 | break; |
129 | } | 137 | } |
130 | 138 | ||
131 | if (codepoint <= 0xffff) | 139 | if (codepoint <= 0xffff) |
132 | *d++ = (uint16_t) codepoint; | 140 | *d++ = (uint16_t) codepoint; |
133 | else { | 141 | else { |
134 | *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); | 142 | *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); |
135 | *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); | 143 | *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); |
136 | } | 144 | } |
137 | last = s; | 145 | last = s; |
138 | } | 146 | } |
139 | 147 | ||
140 | /* Invalid encoding, back up to the errant character */ | 148 | *destoff = d - dest; |
141 | if (state == UTF8_REJECT) | 149 | *codepoint0 = codepoint; |
142 | s -= 1; | 150 | *state0 = state; |
143 | 151 | *src = last; | |
144 | *destoff = d - dest; | 152 | |
145 | *codepoint0 = codepoint; | 153 | return s; |
146 | *state0 = state; | 154 | } |
147 | *src = last; | 155 | |
148 | 156 | uint8_t const * | |
149 | return s; | 157 | _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, |
158 | const uint8_t const **src, | ||
159 | const uint8_t const *srcend, | ||
160 | uint32_t *codepoint0, uint32_t *state0) | ||
161 | { | ||
162 | uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend, | ||
163 | codepoint0, state0); | ||
164 | if (*state0 == UTF8_REJECT) | ||
165 | ret -=1; | ||
166 | return ret; | ||
150 | } | 167 | } |
151 | 168 | ||
diff --git a/pipes-text.cabal b/pipes-text.cabal index 86fbab8..b4388be 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal | |||
@@ -25,6 +25,9 @@ library | |||
25 | pipes-bytestring >= 1.0 && < 1.2, | 25 | pipes-bytestring >= 1.0 && < 1.2, |
26 | transformers >= 0.3 && < 0.4, | 26 | transformers >= 0.3 && < 0.4, |
27 | text >=0.11 && < 0.12, | 27 | text >=0.11 && < 0.12, |
28 | bytestring >=0.10 && < 0.11 | 28 | bytestring >=0.10 && < 0.11, |
29 | vector, | ||
30 | void | ||
29 | -- hs-source-dirs: | 31 | -- hs-source-dirs: |
30 | default-language: Haskell2010 \ No newline at end of file | 32 | default-language: Haskell2010 |
33 | ghc-options: -O2 | ||
diff --git a/test/Test.hs b/test/Test.hs index f2bf17b..373bafb 100644 --- a/test/Test.hs +++ b/test/Test.hs | |||
@@ -27,11 +27,11 @@ import qualified Pipes as P | |||
27 | main :: IO () | 27 | main :: IO () |
28 | main = defaultMain [tests] | 28 | main = defaultMain [tests] |
29 | -- >>> :main -a 10000 | 29 | -- >>> :main -a 10000 |
30 | |||
31 | tests = testGroup "stream_decode" [ | 30 | tests = testGroup "stream_decode" [ |
32 | -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid, | 31 | -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid, |
33 | testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed , | 32 | testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed , |
34 | testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe] | 33 | testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe, |
34 | testProperty "t_utf8_dec_some" t_utf8_dec_some] | ||
35 | 35 | ||
36 | t_utf8_incr_valid = do | 36 | t_utf8_incr_valid = do |
37 | Positive n <- arbitrary | 37 | Positive n <- arbitrary |
@@ -82,6 +82,19 @@ t_utf8_incr_pipe = do | |||
82 | chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b | 82 | chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b |
83 | appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append | 83 | appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append |
84 | 84 | ||
85 | -- | ||
86 | t_utf8_dec_some = do | ||
87 | Positive m <- arbitrary | ||
88 | txt <- genUnicode | ||
89 | let bytesLength = mod 10 m :: Int | ||
90 | forAll (vector bytesLength) $ | ||
91 | (roundtrip . appendBytes txt) | ||
92 | `eq` | ||
93 | appendBytes txt | ||
94 | where | ||
95 | roundtrip bs = case PE.decodeSomeUtf8 bs of | ||
96 | (txt,bys) -> E.encodeUtf8 txt <> bys | ||
97 | appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append | ||
85 | 98 | ||
86 | 99 | ||
87 | 100 | ||