aboutsummaryrefslogtreecommitdiffhomepage
path: root/Pipes
diff options
context:
space:
mode:
authormichaelt <what_is_it_to_do_anything@yahoo.com>2013-12-23 13:02:49 -0500
committermichaelt <what_is_it_to_do_anything@yahoo.com>2013-12-23 13:02:49 -0500
commit8c48280926efffc0ca52a5d9ca796d639d053379 (patch)
tree972ca8955b5581d634663424e973e56fa4487fe5 /Pipes
parent8853a440e37523bae8cb46827d0d2d356bad5c46 (diff)
downloadtext-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.gz
text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.tar.zst
text-pipes-8c48280926efffc0ca52a5d9ca796d639d053379.zip
variant using text internals in place of text streamDecodeUtf8
Diffstat (limited to 'Pipes')
-rw-r--r--Pipes/Text.hs136
-rw-r--r--Pipes/Text/Internal.hs157
2 files changed, 216 insertions, 77 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs
index a5859a3..6845dd3 100644
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -81,10 +81,6 @@ module Pipes.Text (
81 filter, 81 filter,
82 scan, 82 scan,
83 encodeUtf8, 83 encodeUtf8,
84#if MIN_VERSION_text(0,11,4)
85 pipeDecodeUtf8,
86 pipeDecodeUtf8With,
87#endif
88 pack, 84 pack,
89 unpack, 85 unpack,
90 toCaseFold, 86 toCaseFold,
@@ -119,10 +115,8 @@ module Pipes.Text (
119 group, 115 group,
120 lines, 116 lines,
121 words, 117 words,
122#if MIN_VERSION_text(0,11,4)
123 decodeUtf8, 118 decodeUtf8,
124 decodeUtf8With, 119 decodeUtf8With,
125#endif
126 -- * Transformations 120 -- * Transformations
127 intersperse, 121 intersperse,
128 122
@@ -167,6 +161,7 @@ import qualified GHC.IO.Exception as G
167import Pipes 161import Pipes
168import qualified Pipes.ByteString as PB 162import qualified Pipes.ByteString as PB
169import qualified Pipes.ByteString.Parse as PBP 163import qualified Pipes.ByteString.Parse as PBP
164import qualified Pipes.Text.Internal as PE
170import Pipes.Text.Parse ( 165import Pipes.Text.Parse (
171 nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) 166 nextChar, drawChar, unDrawChar, peekChar, isEndOfChars )
172import Pipes.Core (respond, Server') 167import Pipes.Core (respond, Server')
@@ -214,43 +209,60 @@ fromLazy = foldrChunks (\e a -> yield e >> a) (return ())
214{-# INLINABLE fromLazy #-} 209{-# INLINABLE fromLazy #-}
215 210
216-- | Stream text from 'stdin' 211-- | Stream text from 'stdin'
217stdin :: MonadIO m => Producer' Text m () 212stdin :: MonadIO m => Producer' Text m (Producer ByteString m ())
218stdin = fromHandle IO.stdin 213stdin = fromHandle IO.stdin
219{-# INLINABLE stdin #-} 214{-# INLINABLE stdin #-}
220 215
221{-| Convert a 'IO.Handle' into a text stream using a text size 216{-| Convert a 'IO.Handle' into a text stream using a text size
222 determined by the good sense of the text library. 217 determined by the good sense of the text library.
223
224-} 218-}
225 219
226fromHandle :: MonadIO m => IO.Handle -> Producer' Text m () 220fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ())
227#if MIN_VERSION_text(0,11,4) 221-- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)`
228fromHandle h = go TE.streamDecodeUtf8 where 222-- if only so that mistakes can be concentrated in one place.
223-- This modifies something that was faster on an earlier iteration.
224-- Note also that the `text` replacement system is being ignored;
225-- with a replacement scheme one could have `Producer Text m ()`
226-- the relation to the replacement business needs to be thought out.
227-- The complicated type seems overmuch for the toy stdin above
228fromHandle h = go PE.streamDecodeUtf8 B.empty where
229 act = B.hGetSome h defaultChunkSize 229 act = B.hGetSome h defaultChunkSize
230 go dec = do chunk <- liftIO act 230 go dec old = do chunk <- liftIO act
231 case dec chunk of 231 if B.null chunk
232 TE.Some text _ dec' -> do yield text 232 then if B.null old then return (return ())
233 unless (B.null chunk) (go dec') 233 else return (yield old >> return ())
234 else case dec chunk of
235 PE.Some text bs dec' ->
236 if T.null text then go dec' (B.append old bs)
237 else do yield text
238 go dec' B.empty
239 PE.Other text bs ->
240 if T.null text then return (do yield old
241 yield bs
242 PB.fromHandle h)
243 else do yield text
244 return (do yield bs
245 PB.fromHandle h)
234{-# INLINE fromHandle#-} 246{-# INLINE fromHandle#-}
235-- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as 247-- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as
236-- the dedicated Text IO function 'hGetChunk' ; 248-- the dedicated Text IO function 'hGetChunk' ;
237-- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut" 249-- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut"
238-- runs the same as the conduit equivalent, only slightly slower 250-- runs the same as the conduit equivalent, only slightly slower
239-- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut" 251-- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut"
240#else 252-- #else
241fromHandle h = go where 253-- fromHandle h = go where
242 go = do txt <- liftIO (T.hGetChunk h) 254-- go = do txt <- liftIO (T.hGetChunk h)
243 unless (T.null txt) $ do yield txt 255-- unless (T.null txt) $ do yield txt
244 go 256-- go
245{-# INLINABLE fromHandle#-} 257-- {-# INLINABLE fromHandle#-}
246#endif 258-- #endif
247{-| Stream text from a file using Pipes.Safe 259{-| Stream text from a file using Pipes.Safe
248 260
249>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout 261>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
250MAIN = PUTSTRLN "HELLO WORLD" 262MAIN = PUTSTRLN "HELLO WORLD"
251-} 263-}
252 264
253readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m () 265readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ())
254readFile file = Safe.withFile file IO.ReadMode fromHandle 266readFile file = Safe.withFile file IO.ReadMode fromHandle
255{-# INLINABLE readFile #-} 267{-# INLINABLE readFile #-}
256 268
@@ -610,74 +622,44 @@ count :: (Monad m, Num n) => Text -> Producer Text m () -> m n
610count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c)) 622count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
611{-# INLINABLE count #-} 623{-# INLINABLE count #-}
612 624
613#if MIN_VERSION_text(0,11,4)
614-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded 625-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
615-- into a Pipe of Text 626-- into a Pipe of Text
616decodeUtf8 627decodeUtf8
617 :: Monad m 628 :: Monad m
618 => Producer ByteString m r -> Producer Text m (Producer ByteString m r) 629 => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
619decodeUtf8 = go TE.streamDecodeUtf8 630decodeUtf8 = decodeUtf8With Nothing
620 where go dec p = do
621 x <- lift (next p)
622 case x of
623 Left r -> return (return r)
624 Right (chunk, p') -> do
625 let TE.Some text l dec' = dec chunk
626 if B.null l
627 then do
628 yield text
629 go dec' p'
630 else return $ do
631 yield l
632 p'
633{-# INLINEABLE decodeUtf8 #-} 631{-# INLINEABLE decodeUtf8 #-}
634 632
635-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded 633-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
636-- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@ 634-- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@
637-- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\" 635-- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\"
638decodeUtf8With 636decodeUtf8With
639 :: Monad m 637 :: Monad m
640 => TE.OnDecodeError 638 => Maybe TE.OnDecodeError
641 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) 639 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
642decodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr) 640decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where
643 where go dec p = do 641 go dec old p = do
644 x <- lift (next p) 642 x <- lift (next p)
645 case x of 643 case x of
646 Left r -> return (return r) 644 Left r -> if B.null old then return (return r)
647 Right (chunk, p') -> do 645 else return (do yield old
648 let TE.Some text l dec' = dec chunk 646 return r)
649 if B.null l 647 Right (chunk, p') ->
650 then do 648 case dec chunk of
651 yield text 649 PE.Some text l dec' ->
652 go dec' p' 650 if T.null text then go dec' (B.append old l) p'
653 else return $ do 651 else do yield text
654 yield l 652 go dec' B.empty p'
655 p' 653 PE.Other text bs ->
654 if T.null text then return (do yield old
655 yield bs
656 p')
657 else do yield text
658 return (do yield bs
659 p')
656{-# INLINEABLE decodeUtf8With #-} 660{-# INLINEABLE decodeUtf8With #-}
657 661
658-- | A simple pipe from 'ByteString' to 'Text'; a decoding error will arise 662
659-- with any chunk that contains a sequence of bytes that is unreadable. Otherwise
660-- only few bytes will only be moved from one chunk to the next before decoding.
661pipeDecodeUtf8 :: Monad m => Pipe ByteString Text m r
662pipeDecodeUtf8 = go TE.streamDecodeUtf8
663 where go dec = do chunk <- await
664 case dec chunk of
665 TE.Some text l dec' -> do yield text
666 go dec'
667{-# INLINEABLE pipeDecodeUtf8 #-}
668
669-- | A simple pipe from 'ByteString' to 'Text' using a replacement function.
670pipeDecodeUtf8With
671 :: Monad m
672 => TE.OnDecodeError
673 -> Pipe ByteString Text m r
674pipeDecodeUtf8With onErr = go (TE.streamDecodeUtf8With onErr)
675 where go dec = do chunk <- await
676 case dec chunk of
677 TE.Some text l dec' -> do yield text
678 go dec'
679{-# INLINEABLE pipeDecodeUtf8With #-}
680#endif
681 663
682-- | Splits a 'Producer' after the given number of characters 664-- | Splits a 'Producer' after the given number of characters
683splitAt 665splitAt
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
new file mode 100644
index 0000000..05d9887
--- /dev/null
+++ b/Pipes/Text/Internal.hs
@@ -0,0 +1,157 @@
1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
2 UnliftedFFITypes #-}
3-- This module lifts material from Brian O'Sullivan's text package
4-- especially Data.Text.Encoding in order to define a pipes-appropriate
5-- streamDecodeUtf8
6module Pipes.Text.Internal
7 ( Decoding(..)
8 , streamDecodeUtf8With
9 , streamDecodeUtf8
10 ) where
11
12import Control.Exception (evaluate, try)
13#if __GLASGOW_HASKELL__ >= 702
14import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
15import Control.Monad.ST (ST, runST)
16#else
17import Control.Monad.ST (unsafeIOToST, unsafeSTToIO, ST, runST)
18#endif
19import Data.Bits ((.&.))
20import Data.ByteString as B
21import Data.ByteString.Internal as B
22import Data.Text ()
23import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
24import Data.Text.Internal (Text(..), safe, textP)
25import Data.Word (Word8, Word32)
26import Foreign.C.Types (CSize)
27import Foreign.ForeignPtr (withForeignPtr)
28import Foreign.Marshal.Utils (with)
29import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
30import Foreign.Storable (Storable, peek, poke)
31import GHC.Base hiding (ord)
32import GHC.Word
33import qualified Data.Text.Array as A
34import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#)
35import GHC.Word (Word8(..), Word16(..), Word32(..))
36
37import Data.Text.Unsafe (unsafeDupablePerformIO)
38
39#include "pipes_text_cbits.h"
40
41-- | A stream oriented decoding result.
42data Decoding = Some Text ByteString (ByteString -> Decoding)
43 | Other Text ByteString
44instance Show Decoding where
45 showsPrec d (Some t bs _) = showParen (d > prec) $
46 showString "Some " . showsPrec prec' t .
47 showChar ' ' . showsPrec prec' bs .
48 showString " _"
49 where prec = 10; prec' = prec + 1
50 showsPrec d (Other t bs) = showParen (d > prec) $
51 showString "Other " . showsPrec prec' t .
52 showChar ' ' . showsPrec prec' bs .
53 showString " _"
54 where prec = 10; prec' = prec + 1
55
56newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
57newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
58
59-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
60-- encoded text that is known to be valid.
61--
62-- If the input contains any invalid UTF-8 data, an exception will be
63-- thrown (either by this function or a continuation) that cannot be
64-- caught in pure code. For more control over the handling of invalid
65-- data, use 'streamDecodeUtf8With'.
66streamDecodeUtf8 :: ByteString -> Decoding
67streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode)
68
69-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
70-- encoded text.
71streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding
72streamDecodeUtf8With mErr = case mErr of
73 Nothing -> decodeWith False strictDecode
74 Just onErr -> decodeWith True onErr
75 where
76 -- We create a slightly larger than necessary buffer to accommodate a
77 -- potential surrogate pair started in the last buffer
78 decodeWith replace onErr = decodeChunk 0 0
79 where
80 decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
81 decodeChunk codepoint0 state0 bs@(PS fp off len) =
82 runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
83 where
84 decodeChunkToBuffer :: A.MArray s -> IO Decoding
85 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
86 with (0::CSize) $ \destOffPtr ->
87 with codepoint0 $ \codepointPtr ->
88 with state0 $ \statePtr ->
89 with nullPtr $ \curPtrPtr ->
90 let end = ptr `plusPtr` (off + len)
91 loop curPtr = do
92 poke curPtrPtr curPtr
93 curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
94 curPtrPtr end codepointPtr statePtr
95 state <- peek statePtr
96 case state of
97 UTF8_REJECT ->
98 -- We encountered an encoding error
99 if replace
100 then do
101 x <- peek curPtr'
102 case onErr desc (Just x) of
103 Nothing -> loop $ curPtr' `plusPtr` 1
104 Just c -> do
105 destOff <- peek destOffPtr
106 w <- unsafeSTToIO $
107 unsafeWrite dest (fromIntegral destOff) (safe c)
108 poke destOffPtr (destOff + fromIntegral w)
109 poke statePtr 0
110 loop $ curPtr' `plusPtr` 1
111 else do
112 n <- peek destOffPtr
113 chunkText <- unsafeSTToIO $ do
114 arr <- A.unsafeFreeze dest
115 return $! textP arr 0 (fromIntegral n)
116 lastPtr <- peek curPtrPtr
117 let left = lastPtr `minusPtr` curPtr
118 return $ Other chunkText (B.drop left bs)
119 _ -> do
120 -- We encountered the end of the buffer while decoding
121 n <- peek destOffPtr
122 codepoint <- peek codepointPtr
123 chunkText <- unsafeSTToIO $ do
124 arr <- A.unsafeFreeze dest
125 return $! textP arr 0 (fromIntegral n)
126 lastPtr <- peek curPtrPtr
127 let left = lastPtr `minusPtr` curPtr
128 return $ Some chunkText (B.drop left bs)
129 (decodeChunk codepoint state)
130 in loop (ptr `plusPtr` off)
131 desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
132
133ord :: Char -> Int
134ord (C# c#) = I# (ord# c#)
135{-# INLINE ord #-}
136
137
138unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
139unsafeWrite marr i c
140 | n < 0x10000 = do
141 A.unsafeWrite marr i (fromIntegral n)
142 return 1
143 | otherwise = do
144 A.unsafeWrite marr i lo
145 A.unsafeWrite marr (i+1) hi
146 return 2
147 where n = ord c
148 m = n - 0x10000
149 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
150 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
151 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
152{-# INLINE unsafeWrite #-}
153
154foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
155 :: MutableByteArray# s -> Ptr CSize
156 -> Ptr (Ptr Word8) -> Ptr Word8
157 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file