diff options
author | michaelt <what_is_it_to_do_anything@yahoo.com> | 2013-12-23 13:02:49 -0500 |
---|---|---|
committer | michaelt <what_is_it_to_do_anything@yahoo.com> | 2013-12-23 13:02:49 -0500 |
commit | 8c48280926efffc0ca52a5d9ca796d639d053379 (patch) | |
tree | 972ca8955b5581d634663424e973e56fa4487fe5 /Pipes | |
parent | 8853a440e37523bae8cb46827d0d2d356bad5c46 (diff) | |
download | text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.gz text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.zst text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.zip |
variant using text internals in place of text streamDecodeUtf8
Diffstat (limited to 'Pipes')
-rw-r--r-- | Pipes/Text.hs | 136 | ||||
-rw-r--r-- | Pipes/Text/Internal.hs | 157 |
2 files changed, 216 insertions, 77 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs index a5859a3..6845dd3 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs | |||
@@ -81,10 +81,6 @@ module Pipes.Text ( | |||
81 | filter, | 81 | filter, |
82 | scan, | 82 | scan, |
83 | encodeUtf8, | 83 | encodeUtf8, |
84 | #if MIN_VERSION_text(0,11,4) | ||
85 | pipeDecodeUtf8, | ||
86 | pipeDecodeUtf8With, | ||
87 | #endif | ||
88 | pack, | 84 | pack, |
89 | unpack, | 85 | unpack, |
90 | toCaseFold, | 86 | toCaseFold, |
@@ -119,10 +115,8 @@ module Pipes.Text ( | |||
119 | group, | 115 | group, |
120 | lines, | 116 | lines, |
121 | words, | 117 | words, |
122 | #if MIN_VERSION_text(0,11,4) | ||
123 | decodeUtf8, | 118 | decodeUtf8, |
124 | decodeUtf8With, | 119 | decodeUtf8With, |
125 | #endif | ||
126 | -- * Transformations | 120 | -- * Transformations |
127 | intersperse, | 121 | intersperse, |
128 | 122 | ||
@@ -167,6 +161,7 @@ import qualified GHC.IO.Exception as G | |||
167 | import Pipes | 161 | import Pipes |
168 | import qualified Pipes.ByteString as PB | 162 | import qualified Pipes.ByteString as PB |
169 | import qualified Pipes.ByteString.Parse as PBP | 163 | import qualified Pipes.ByteString.Parse as PBP |
164 | import qualified Pipes.Text.Internal as PE | ||
170 | import Pipes.Text.Parse ( | 165 | import Pipes.Text.Parse ( |
171 | nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) | 166 | nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) |
172 | import Pipes.Core (respond, Server') | 167 | import Pipes.Core (respond, Server') |
@@ -214,43 +209,60 @@ fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) | |||
214 | {-# INLINABLE fromLazy #-} | 209 | {-# INLINABLE fromLazy #-} |
215 | 210 | ||
216 | -- | Stream text from 'stdin' | 211 | -- | Stream text from 'stdin' |
217 | stdin :: MonadIO m => Producer' Text m () | 212 | stdin :: MonadIO m => Producer' Text m (Producer ByteString m ()) |
218 | stdin = fromHandle IO.stdin | 213 | stdin = fromHandle IO.stdin |
219 | {-# INLINABLE stdin #-} | 214 | {-# INLINABLE stdin #-} |
220 | 215 | ||
221 | {-| Convert a 'IO.Handle' into a text stream using a text size | 216 | {-| Convert a 'IO.Handle' into a text stream using a text size |
222 | determined by the good sense of the text library. | 217 | determined by the good sense of the text library. |
223 | |||
224 | -} | 218 | -} |
225 | 219 | ||
226 | fromHandle :: MonadIO m => IO.Handle -> Producer' Text m () | 220 | fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ()) |
227 | #if MIN_VERSION_text(0,11,4) | 221 | -- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)` |
228 | fromHandle h = go TE.streamDecodeUtf8 where | 222 | -- if only so that mistakes can be concentrated in one place. |
223 | -- This modifies something that was faster on an earlier iteration. | ||
224 | -- Note also that the `text` replacement system is being ignored; | ||
225 | -- with a replacement scheme one could have `Producer Text m ()` | ||
226 | -- the relation to the replacement business needs to be thought out. | ||
227 | -- The complicated type seems overmuch for the toy stdin above | ||
228 | fromHandle h = go PE.streamDecodeUtf8 B.empty where | ||
229 | act = B.hGetSome h defaultChunkSize | 229 | act = B.hGetSome h defaultChunkSize |
230 | go dec = do chunk <- liftIO act | 230 | go dec old = do chunk <- liftIO act |
231 | case dec chunk of | 231 | if B.null chunk |
232 | TE.Some text _ dec' -> do yield text | 232 | then if B.null old then return (return ()) |
233 | unless (B.null chunk) (go dec') | 233 | else return (yield old >> return ()) |
234 | else case dec chunk of | ||
235 | PE.Some text bs dec' -> | ||
236 | if T.null text then go dec' (B.append old bs) | ||
237 | else do yield text | ||
238 | go dec' B.empty | ||
239 | PE.Other text bs -> | ||
240 | if T.null text then return (do yield old | ||
241 | yield bs | ||
242 | PB.fromHandle h) | ||
243 | else do yield text | ||
244 | return (do yield bs | ||
245 | PB.fromHandle h) | ||
234 | {-# INLINE fromHandle#-} | 246 | {-# INLINE fromHandle#-} |
235 | -- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as | 247 | -- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as |
236 | -- the dedicated Text IO function 'hGetChunk' ; | 248 | -- the dedicated Text IO function 'hGetChunk' ; |
237 | -- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut" | 249 | -- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut" |
238 | -- runs the same as the conduit equivalent, only slightly slower | 250 | -- runs the same as the conduit equivalent, only slightly slower |
239 | -- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut" | 251 | -- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut" |
240 | #else | 252 | -- #else |
241 | fromHandle h = go where | 253 | -- fromHandle h = go where |
242 | go = do txt <- liftIO (T.hGetChunk h) | 254 | -- go = do txt <- liftIO (T.hGetChunk h) |
243 | unless (T.null txt) $ do yield txt | 255 | -- unless (T.null txt) $ do yield txt |
244 | go | 256 | -- go |
245 | {-# INLINABLE fromHandle#-} | 257 | -- {-# INLINABLE fromHandle#-} |
246 | #endif | 258 | -- #endif |
247 | {-| Stream text from a file using Pipes.Safe | 259 | {-| Stream text from a file using Pipes.Safe |
248 | 260 | ||
249 | >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout | 261 | >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout |
250 | MAIN = PUTSTRLN "HELLO WORLD" | 262 | MAIN = PUTSTRLN "HELLO WORLD" |
251 | -} | 263 | -} |
252 | 264 | ||
253 | readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m () | 265 | readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ()) |
254 | readFile file = Safe.withFile file IO.ReadMode fromHandle | 266 | readFile file = Safe.withFile file IO.ReadMode fromHandle |
255 | {-# INLINABLE readFile #-} | 267 | {-# INLINABLE readFile #-} |
256 | 268 | ||
@@ -610,74 +622,44 @@ count :: (Monad m, Num n) => Text -> Producer Text m () -> m n | |||
610 | count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) | 622 | count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) |
611 | {-# INLINABLE count #-} | 623 | {-# INLINABLE count #-} |
612 | 624 | ||
613 | #if MIN_VERSION_text(0,11,4) | ||
614 | -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded | 625 | -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded |
615 | -- into a Pipe of Text | 626 | -- into a Pipe of Text |
616 | decodeUtf8 | 627 | decodeUtf8 |
617 | :: Monad m | 628 | :: Monad m |
618 | => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 629 | => Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
619 | decodeUtf8 = go TE.streamDecodeUtf8 | 630 | decodeUtf8 = decodeUtf8With Nothing |
620 | where go dec p = do | ||
621 | x <- lift (next p) | ||
622 | case x of | ||
623 | Left r -> return (return r) | ||
624 | Right (chunk, p') -> do | ||
625 | let TE.Some text l dec' = dec chunk | ||
626 | if B.null l | ||
627 | then do | ||
628 | yield text | ||
629 | go dec' p' | ||
630 | else return $ do | ||
631 | yield l | ||
632 | p' | ||
633 | {-# INLINEABLE decodeUtf8 #-} | 631 | {-# INLINEABLE decodeUtf8 #-} |
634 | 632 | ||
635 | -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded | 633 | -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded |
636 | -- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@ | 634 | -- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@ |
637 | -- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\" | 635 | -- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\" |
638 | decodeUtf8With | 636 | decodeUtf8With |
639 | :: Monad m | 637 | :: Monad m |
640 | => TE.OnDecodeError | 638 | => Maybe TE.OnDecodeError |
641 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 639 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
642 | decodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) | 640 | decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where |
643 | where go dec p = do | 641 | go dec old p = do |
644 | x <- lift (next p) | 642 | x <- lift (next p) |
645 | case x of | 643 | case x of |
646 | Left r -> return (return r) | 644 | Left r -> if B.null old then return (return r) |
647 | Right (chunk, p') -> do | 645 | else return (do yield old |
648 | let TE.Some text l dec' = dec chunk | 646 | return r) |
649 | if B.null l | 647 | Right (chunk, p') -> |
650 | then do | 648 | case dec chunk of |
651 | yield text | 649 | PE.Some text l dec' -> |
652 | go dec' p' | 650 | if T.null text then go dec' (B.append old l) p' |
653 | else return $ do | 651 | else do yield text |
654 | yield l | 652 | go dec' B.empty p' |
655 | p' | 653 | PE.Other text bs -> |
654 | if T.null text then return (do yield old | ||
655 | yield bs | ||
656 | p') | ||
657 | else do yield text | ||
658 | return (do yield bs | ||
659 | p') | ||
656 | {-# INLINEABLE decodeUtf8With #-} | 660 | {-# INLINEABLE decodeUtf8With #-} |
657 | 661 | ||
658 | -- | A simple pipe from 'ByteString' to 'Text'; a decoding error will arise | 662 | |
659 | -- with any chunk that contains a sequence of bytes that is unreadable. Otherwise | ||
660 | -- only few bytes will only be moved from one chunk to the next before decoding. | ||
661 | pipeDecodeUtf8 :: Monad m => Pipe ByteString Text m r | ||
662 | pipeDecodeUtf8 = go TE.streamDecodeUtf8 | ||
663 | where go dec = do chunk <- await | ||
664 | case dec chunk of | ||
665 | TE.Some text l dec' -> do yield text | ||
666 | go dec' | ||
667 | {-# INLINEABLE pipeDecodeUtf8 #-} | ||
668 | |||
669 | -- | A simple pipe from 'ByteString' to 'Text' using a replacement function. | ||
670 | pipeDecodeUtf8With | ||
671 | :: Monad m | ||
672 | => TE.OnDecodeError | ||
673 | -> Pipe ByteString Text m r | ||
674 | pipeDecodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) | ||
675 | where go dec = do chunk <- await | ||
676 | case dec chunk of | ||
677 | TE.Some text l dec' -> do yield text | ||
678 | go dec' | ||
679 | {-# INLINEABLE pipeDecodeUtf8With #-} | ||
680 | #endif | ||
681 | 663 | ||
682 | -- | Splits a 'Producer' after the given number of characters | 664 | -- | Splits a 'Producer' after the given number of characters |
683 | splitAt | 665 | splitAt |
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs new file mode 100644 index 0000000..05d9887 --- /dev/null +++ b/Pipes/Text/Internal.hs | |||
@@ -0,0 +1,157 @@ | |||
1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, | ||
2 | UnliftedFFITypes #-} | ||
3 | -- This module lifts material from Brian O'Sullivan's text package | ||
4 | -- especially Data.Text.Encoding in order to define a pipes-appropriate | ||
5 | -- streamDecodeUtf8 | ||
6 | module Pipes.Text.Internal | ||
7 | ( Decoding(..) | ||
8 | , streamDecodeUtf8With | ||
9 | , streamDecodeUtf8 | ||
10 | ) where | ||
11 | |||
12 | import Control.Exception (evaluate, try) | ||
13 | #if __GLASGOW_HASKELL__ >= 702 | ||
14 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | ||
15 | import Control.Monad.ST (ST, runST) | ||
16 | #else | ||
17 | import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST) | ||
18 | #endif | ||
19 | import Data.Bits ((.&.)) | ||
20 | import Data.ByteString as B | ||
21 | import Data.ByteString.Internal as B | ||
22 | import Data.Text () | ||
23 | import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) | ||
24 | import Data.Text.Internal (Text(..), safe, textP) | ||
25 | import Data.Word (Word8, Word32) | ||
26 | import Foreign.C.Types (CSize) | ||
27 | import Foreign.ForeignPtr (withForeignPtr) | ||
28 | import Foreign.Marshal.Utils (with) | ||
29 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | ||
30 | import Foreign.Storable (Storable, peek, poke) | ||
31 | import GHC.Base hiding (ord) | ||
32 | import GHC.Word | ||
33 | import qualified Data.Text.Array as A | ||
34 | import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#) | ||
35 | import GHC.Word (Word8(..), Word16(..), Word32(..)) | ||
36 | |||
37 | import Data.Text.Unsafe (unsafeDupablePerformIO) | ||
38 | |||
39 | #include "pipes_text_cbits.h" | ||
40 | |||
41 | -- | A stream oriented decoding result. | ||
42 | data Decoding = Some Text ByteString (ByteString -> Decoding) | ||
43 | | Other Text ByteString | ||
44 | instance Show Decoding where | ||
45 | showsPrec d (Some t bs _) = showParen (d > prec) $ | ||
46 | showString "Some " . showsPrec prec' t . | ||
47 | showChar ' ' . showsPrec prec' bs . | ||
48 | showString " _" | ||
49 | where prec = 10; prec' = prec + 1 | ||
50 | showsPrec d (Other t bs) = showParen (d > prec) $ | ||
51 | showString "Other " . showsPrec prec' t . | ||
52 | showChar ' ' . showsPrec prec' bs . | ||
53 | showString " _" | ||
54 | where prec = 10; prec' = prec + 1 | ||
55 | |||
56 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | ||
57 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | ||
58 | |||
59 | -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 | ||
60 | -- encoded text that is known to be valid. | ||
61 | -- | ||
62 | -- If the input contains any invalid UTF-8 data, an exception will be | ||
63 | -- thrown (either by this function or a continuation) that cannot be | ||
64 | -- caught in pure code. For more control over the handling of invalid | ||
65 | -- data, use 'streamDecodeUtf8With'. | ||
66 | streamDecodeUtf8 :: ByteString -> Decoding | ||
67 | streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode) | ||
68 | |||
69 | -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 | ||
70 | -- encoded text. | ||
71 | streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding | ||
72 | streamDecodeUtf8With mErr = case mErr of | ||
73 | Nothing -> decodeWith False strictDecode | ||
74 | Just onErr -> decodeWith True onErr | ||
75 | where | ||
76 | -- We create a slightly larger than necessary buffer to accommodate a | ||
77 | -- potential surrogate pair started in the last buffer | ||
78 | decodeWith replace onErr = decodeChunk 0 0 | ||
79 | where | ||
80 | decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding | ||
81 | decodeChunk codepoint0 state0 bs@(PS fp off len) = | ||
82 | runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1) | ||
83 | where | ||
84 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | ||
85 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
86 | with (0::CSize) $ \destOffPtr -> | ||
87 | with codepoint0 $ \codepointPtr -> | ||
88 | with state0 $ \statePtr -> | ||
89 | with nullPtr $ \curPtrPtr -> | ||
90 | let end = ptr `plusPtr` (off + len) | ||
91 | loop curPtr = do | ||
92 | poke curPtrPtr curPtr | ||
93 | curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr | ||
94 | curPtrPtr end codepointPtr statePtr | ||
95 | state <- peek statePtr | ||
96 | case state of | ||
97 | UTF8_REJECT -> | ||
98 | -- We encountered an encoding error | ||
99 | if replace | ||
100 | then do | ||
101 | x <- peek curPtr' | ||
102 | case onErr desc (Just x) of | ||
103 | Nothing -> loop $ curPtr' `plusPtr` 1 | ||
104 | Just c -> do | ||
105 | destOff <- peek destOffPtr | ||
106 | w <- unsafeSTToIO $ | ||
107 | unsafeWrite dest (fromIntegral destOff) (safe c) | ||
108 | poke destOffPtr (destOff + fromIntegral w) | ||
109 | poke statePtr 0 | ||
110 | loop $ curPtr' `plusPtr` 1 | ||
111 | else do | ||
112 | n <- peek destOffPtr | ||
113 | chunkText <- unsafeSTToIO $ do | ||
114 | arr <- A.unsafeFreeze dest | ||
115 | return $! textP arr 0 (fromIntegral n) | ||
116 | lastPtr <- peek curPtrPtr | ||
117 | let left = lastPtr `minusPtr` curPtr | ||
118 | return $ Other chunkText (B.drop left bs) | ||
119 | _ -> do | ||
120 | -- We encountered the end of the buffer while decoding | ||
121 | n <- peek destOffPtr | ||
122 | codepoint <- peek codepointPtr | ||
123 | chunkText <- unsafeSTToIO $ do | ||
124 | arr <- A.unsafeFreeze dest | ||
125 | return $! textP arr 0 (fromIntegral n) | ||
126 | lastPtr <- peek curPtrPtr | ||
127 | let left = lastPtr `minusPtr` curPtr | ||
128 | return $ Some chunkText (B.drop left bs) | ||
129 | (decodeChunk codepoint state) | ||
130 | in loop (ptr `plusPtr` off) | ||
131 | desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream" | ||
132 | |||
133 | ord :: Char -> Int | ||
134 | ord (C# c#) = I# (ord# c#) | ||
135 | {-# INLINE ord #-} | ||
136 | |||
137 | |||
138 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int | ||
139 | unsafeWrite marr i c | ||
140 | | n < 0x10000 = do | ||
141 | A.unsafeWrite marr i (fromIntegral n) | ||
142 | return 1 | ||
143 | | otherwise = do | ||
144 | A.unsafeWrite marr i lo | ||
145 | A.unsafeWrite marr (i+1) hi | ||
146 | return 2 | ||
147 | where n = ord c | ||
148 | m = n - 0x10000 | ||
149 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | ||
150 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | ||
151 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | ||
152 | {-# INLINE unsafeWrite #-} | ||
153 | |||
154 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | ||
155 | :: MutableByteArray# s -> Ptr CSize | ||
156 | -> Ptr (Ptr Word8) -> Ptr Word8 | ||
157 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file | ||