diff options
author | michaelt <what_is_it_to_do_anything@yahoo.com> | 2013-12-23 13:02:49 -0500 |
---|---|---|
committer | michaelt <what_is_it_to_do_anything@yahoo.com> | 2013-12-23 13:02:49 -0500 |
commit | 8c48280926efffc0ca52a5d9ca796d639d053379 (patch) | |
tree | 972ca8955b5581d634663424e973e56fa4487fe5 /Pipes/Text.hs | |
parent | 8853a440e37523bae8cb46827d0d2d356bad5c46 (diff) | |
download | text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.gz text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.zst text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.zip |
variant using text internals in place of text streamDecodeUtf8
Diffstat (limited to 'Pipes/Text.hs')
-rw-r--r-- | Pipes/Text.hs | 136 |
1 files changed, 59 insertions, 77 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs index a5859a3..6845dd3 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs | |||
@@ -81,10 +81,6 @@ module Pipes.Text ( | |||
81 | filter, | 81 | filter, |
82 | scan, | 82 | scan, |
83 | encodeUtf8, | 83 | encodeUtf8, |
84 | #if MIN_VERSION_text(0,11,4) | ||
85 | pipeDecodeUtf8, | ||
86 | pipeDecodeUtf8With, | ||
87 | #endif | ||
88 | pack, | 84 | pack, |
89 | unpack, | 85 | unpack, |
90 | toCaseFold, | 86 | toCaseFold, |
@@ -119,10 +115,8 @@ module Pipes.Text ( | |||
119 | group, | 115 | group, |
120 | lines, | 116 | lines, |
121 | words, | 117 | words, |
122 | #if MIN_VERSION_text(0,11,4) | ||
123 | decodeUtf8, | 118 | decodeUtf8, |
124 | decodeUtf8With, | 119 | decodeUtf8With, |
125 | #endif | ||
126 | -- * Transformations | 120 | -- * Transformations |
127 | intersperse, | 121 | intersperse, |
128 | 122 | ||
@@ -167,6 +161,7 @@ import qualified GHC.IO.Exception as G | |||
167 | import Pipes | 161 | import Pipes |
168 | import qualified Pipes.ByteString as PB | 162 | import qualified Pipes.ByteString as PB |
169 | import qualified Pipes.ByteString.Parse as PBP | 163 | import qualified Pipes.ByteString.Parse as PBP |
164 | import qualified Pipes.Text.Internal as PE | ||
170 | import Pipes.Text.Parse ( | 165 | import Pipes.Text.Parse ( |
171 | nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) | 166 | nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) |
172 | import Pipes.Core (respond, Server') | 167 | import Pipes.Core (respond, Server') |
@@ -214,43 +209,60 @@ fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) | |||
214 | {-# INLINABLE fromLazy #-} | 209 | {-# INLINABLE fromLazy #-} |
215 | 210 | ||
216 | -- | Stream text from 'stdin' | 211 | -- | Stream text from 'stdin' |
217 | stdin :: MonadIO m => Producer' Text m () | 212 | stdin :: MonadIO m => Producer' Text m (Producer ByteString m ()) |
218 | stdin = fromHandle IO.stdin | 213 | stdin = fromHandle IO.stdin |
219 | {-# INLINABLE stdin #-} | 214 | {-# INLINABLE stdin #-} |
220 | 215 | ||
221 | {-| Convert a 'IO.Handle' into a text stream using a text size | 216 | {-| Convert a 'IO.Handle' into a text stream using a text size |
222 | determined by the good sense of the text library. | 217 | determined by the good sense of the text library. |
223 | |||
224 | -} | 218 | -} |
225 | 219 | ||
226 | fromHandle :: MonadIO m => IO.Handle -> Producer' Text m () | 220 | fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ()) |
227 | #if MIN_VERSION_text(0,11,4) | 221 | -- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)` |
228 | fromHandle h = go TE.streamDecodeUtf8 where | 222 | -- if only so that mistakes can be concentrated in one place. |
223 | -- This modifies something that was faster on an earlier iteration. | ||
224 | -- Note also that the `text` replacement system is being ignored; | ||
225 | -- with a replacement scheme one could have `Producer Text m ()` | ||
226 | -- the relation to the replacement business needs to be thought out. | ||
227 | -- The complicated type seems overmuch for the toy stdin above | ||
228 | fromHandle h = go PE.streamDecodeUtf8 B.empty where | ||
229 | act = B.hGetSome h defaultChunkSize | 229 | act = B.hGetSome h defaultChunkSize |
230 | go dec = do chunk <- liftIO act | 230 | go dec old = do chunk <- liftIO act |
231 | case dec chunk of | 231 | if B.null chunk |
232 | TE.Some text _ dec' -> do yield text | 232 | then if B.null old then return (return ()) |
233 | unless (B.null chunk) (go dec') | 233 | else return (yield old >> return ()) |
234 | else case dec chunk of | ||
235 | PE.Some text bs dec' -> | ||
236 | if T.null text then go dec' (B.append old bs) | ||
237 | else do yield text | ||
238 | go dec' B.empty | ||
239 | PE.Other text bs -> | ||
240 | if T.null text then return (do yield old | ||
241 | yield bs | ||
242 | PB.fromHandle h) | ||
243 | else do yield text | ||
244 | return (do yield bs | ||
245 | PB.fromHandle h) | ||
234 | {-# INLINE fromHandle#-} | 246 | {-# INLINE fromHandle#-} |
235 | -- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as | 247 | -- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as |
236 | -- the dedicated Text IO function 'hGetChunk' ; | 248 | -- the dedicated Text IO function 'hGetChunk' ; |
237 | -- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut" | 249 | -- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut" |
238 | -- runs the same as the conduit equivalent, only slightly slower | 250 | -- runs the same as the conduit equivalent, only slightly slower |
239 | -- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut" | 251 | -- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut" |
240 | #else | 252 | -- #else |
241 | fromHandle h = go where | 253 | -- fromHandle h = go where |
242 | go = do txt <- liftIO (T.hGetChunk h) | 254 | -- go = do txt <- liftIO (T.hGetChunk h) |
243 | unless (T.null txt) $ do yield txt | 255 | -- unless (T.null txt) $ do yield txt |
244 | go | 256 | -- go |
245 | {-# INLINABLE fromHandle#-} | 257 | -- {-# INLINABLE fromHandle#-} |
246 | #endif | 258 | -- #endif |
247 | {-| Stream text from a file using Pipes.Safe | 259 | {-| Stream text from a file using Pipes.Safe |
248 | 260 | ||
249 | >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout | 261 | >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout |
250 | MAIN = PUTSTRLN "HELLO WORLD" | 262 | MAIN = PUTSTRLN "HELLO WORLD" |
251 | -} | 263 | -} |
252 | 264 | ||
253 | readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m () | 265 | readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ()) |
254 | readFile file = Safe.withFile file IO.ReadMode fromHandle | 266 | readFile file = Safe.withFile file IO.ReadMode fromHandle |
255 | {-# INLINABLE readFile #-} | 267 | {-# INLINABLE readFile #-} |
256 | 268 | ||
@@ -610,74 +622,44 @@ count :: (Monad m, Num n) => Text -> Producer Text m () -> m n | |||
610 | count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) | 622 | count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) |
611 | {-# INLINABLE count #-} | 623 | {-# INLINABLE count #-} |
612 | 624 | ||
613 | #if MIN_VERSION_text(0,11,4) | ||
614 | -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded | 625 | -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded |
615 | -- into a Pipe of Text | 626 | -- into a Pipe of Text |
616 | decodeUtf8 | 627 | decodeUtf8 |
617 | :: Monad m | 628 | :: Monad m |
618 | => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 629 | => Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
619 | decodeUtf8 = go TE.streamDecodeUtf8 | 630 | decodeUtf8 = decodeUtf8With Nothing |
620 | where go dec p = do | ||
621 | x <- lift (next p) | ||
622 | case x of | ||
623 | Left r -> return (return r) | ||
624 | Right (chunk, p') -> do | ||
625 | let TE.Some text l dec' = dec chunk | ||
626 | if B.null l | ||
627 | then do | ||
628 | yield text | ||
629 | go dec' p' | ||
630 | else return $ do | ||
631 | yield l | ||
632 | p' | ||
633 | {-# INLINEABLE decodeUtf8 #-} | 631 | {-# INLINEABLE decodeUtf8 #-} |
634 | 632 | ||
635 | -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded | 633 | -- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded |
636 | -- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@ | 634 | -- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@ |
637 | -- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\" | 635 | -- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\" |
638 | decodeUtf8With | 636 | decodeUtf8With |
639 | :: Monad m | 637 | :: Monad m |
640 | => TE.OnDecodeError | 638 | => Maybe TE.OnDecodeError |
641 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | 639 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) |
642 | decodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) | 640 | decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where |
643 | where go dec p = do | 641 | go dec old p = do |
644 | x <- lift (next p) | 642 | x <- lift (next p) |
645 | case x of | 643 | case x of |
646 | Left r -> return (return r) | 644 | Left r -> if B.null old then return (return r) |
647 | Right (chunk, p') -> do | 645 | else return (do yield old |
648 | let TE.Some text l dec' = dec chunk | 646 | return r) |
649 | if B.null l | 647 | Right (chunk, p') -> |
650 | then do | 648 | case dec chunk of |
651 | yield text | 649 | PE.Some text l dec' -> |
652 | go dec' p' | 650 | if T.null text then go dec' (B.append old l) p' |
653 | else return $ do | 651 | else do yield text |
654 | yield l | 652 | go dec' B.empty p' |
655 | p' | 653 | PE.Other text bs -> |
654 | if T.null text then return (do yield old | ||
655 | yield bs | ||
656 | p') | ||
657 | else do yield text | ||
658 | return (do yield bs | ||
659 | p') | ||
656 | {-# INLINEABLE decodeUtf8With #-} | 660 | {-# INLINEABLE decodeUtf8With #-} |
657 | 661 | ||
658 | -- | A simple pipe from 'ByteString' to 'Text'; a decoding error will arise | 662 | |
659 | -- with any chunk that contains a sequence of bytes that is unreadable. Otherwise | ||
660 | -- only few bytes will only be moved from one chunk to the next before decoding. | ||
661 | pipeDecodeUtf8 :: Monad m => Pipe ByteString Text m r | ||
662 | pipeDecodeUtf8 = go TE.streamDecodeUtf8 | ||
663 | where go dec = do chunk <- await | ||
664 | case dec chunk of | ||
665 | TE.Some text l dec' -> do yield text | ||
666 | go dec' | ||
667 | {-# INLINEABLE pipeDecodeUtf8 #-} | ||
668 | |||
669 | -- | A simple pipe from 'ByteString' to 'Text' using a replacement function. | ||
670 | pipeDecodeUtf8With | ||
671 | :: Monad m | ||
672 | => TE.OnDecodeError | ||
673 | -> Pipe ByteString Text m r | ||
674 | pipeDecodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) | ||
675 | where go dec = do chunk <- await | ||
676 | case dec chunk of | ||
677 | TE.Some text l dec' -> do yield text | ||
678 | go dec' | ||
679 | {-# INLINEABLE pipeDecodeUtf8With #-} | ||
680 | #endif | ||
681 | 663 | ||
682 | -- | Splits a 'Producer' after the given number of characters | 664 | -- | Splits a 'Producer' after the given number of characters |
683 | splitAt | 665 | splitAt |