aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authormichaelt <what_is_it_to_do_anything@yahoo.com>2013-12-25 22:25:07 -0500
committermichaelt <what_is_it_to_do_anything@yahoo.com>2013-12-25 22:25:07 -0500
commitc9d1c945a4343d756533b85060c35c04be0c8b02 (patch)
tree0c31ae2e13002c1311ae3cc1e15608750c5a0a9c
parent8c48280926efffc0ca52a5d9ca796d639d053379 (diff)
downloadtext-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.tar.gz
text-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.tar.zst
text-pipes-c9d1c945a4343d756533b85060c35c04be0c8b02.zip
scrap character replacement; simplify
-rw-r--r--Pipes/Text.hs96
-rw-r--r--Pipes/Text/Internal.hs118
-rw-r--r--test/Test.hs60
3 files changed, 103 insertions, 171 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs
index 6845dd3..d62aee7 100644
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -1,4 +1,4 @@
1{-# LANGUAGE RankNTypes, TypeFamilies, CPP #-} 1{-# LANGUAGE RankNTypes, TypeFamilies, NoMonomorphismRestriction #-}
2 2
3{-| This module provides @pipes@ utilities for \"text streams\", which are 3{-| This module provides @pipes@ utilities for \"text streams\", which are
4 streams of 'Text' chunks. The individual chunks are uniformly @strict@, but 4 streams of 'Text' chunks. The individual chunks are uniformly @strict@, but
@@ -116,7 +116,6 @@ module Pipes.Text (
116 lines, 116 lines,
117 words, 117 words,
118 decodeUtf8, 118 decodeUtf8,
119 decodeUtf8With,
120 -- * Transformations 119 -- * Transformations
121 intersperse, 120 intersperse,
122 121
@@ -209,7 +208,7 @@ fromLazy = foldrChunks (\e a -> yield e >> a) (return ())
209{-# INLINABLE fromLazy #-} 208{-# INLINABLE fromLazy #-}
210 209
211-- | Stream text from 'stdin' 210-- | Stream text from 'stdin'
212stdin :: MonadIO m => Producer' Text m (Producer ByteString m ()) 211stdin :: MonadIO m => Producer Text m (Producer ByteString m ())
213stdin = fromHandle IO.stdin 212stdin = fromHandle IO.stdin
214{-# INLINABLE stdin #-} 213{-# INLINABLE stdin #-}
215 214
@@ -217,52 +216,17 @@ stdin = fromHandle IO.stdin
217 determined by the good sense of the text library. 216 determined by the good sense of the text library.
218-} 217-}
219 218
220fromHandle :: MonadIO m => IO.Handle -> Producer' Text m (Producer ByteString m ()) 219fromHandle :: MonadIO m => IO.Handle -> Producer Text m (Producer ByteString m ())
221-- TODO: this should perhaps just be `decodeUtf8 (PB.fromHandle h)` 220fromHandle h = decodeUtf8 (PB.fromHandle h)
222-- if only so that mistakes can be concentrated in one place.
223-- This modifies something that was faster on an earlier iteration.
224-- Note also that the `text` replacement system is being ignored;
225-- with a replacement scheme one could have `Producer Text m ()`
226-- the relation to the replacement business needs to be thought out.
227-- The complicated type seems overmuch for the toy stdin above
228fromHandle h = go PE.streamDecodeUtf8 B.empty where
229 act = B.hGetSome h defaultChunkSize
230 go dec old = do chunk <- liftIO act
231 if B.null chunk
232 then if B.null old then return (return ())
233 else return (yield old >> return ())
234 else case dec chunk of
235 PE.Some text bs dec' ->
236 if T.null text then go dec' (B.append old bs)
237 else do yield text
238 go dec' B.empty
239 PE.Other text bs ->
240 if T.null text then return (do yield old
241 yield bs
242 PB.fromHandle h)
243 else do yield text
244 return (do yield bs
245 PB.fromHandle h)
246{-# INLINE fromHandle#-} 221{-# INLINE fromHandle#-}
247-- bytestring fromHandle + streamDecodeUtf8 is 3 times as fast as 222
248-- the dedicated Text IO function 'hGetChunk' ;
249-- this way "runEffect $ PT.fromHandle hIn >-> PT.toHandle hOut"
250-- runs the same as the conduit equivalent, only slightly slower
251-- than "runEffect $ PB.fromHandle hIn >-> PB.toHandle hOut"
252-- #else
253-- fromHandle h = go where
254-- go = do txt <- liftIO (T.hGetChunk h)
255-- unless (T.null txt) $ do yield txt
256-- go
257-- {-# INLINABLE fromHandle#-}
258-- #endif
259{-| Stream text from a file using Pipes.Safe 223{-| Stream text from a file using Pipes.Safe
260 224
261>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout 225>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
262MAIN = PUTSTRLN "HELLO WORLD" 226MAIN = PUTSTRLN "HELLO WORLD"
263-} 227-}
264 228
265readFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Producer' Text m (Producer ByteString m ()) 229readFile :: (MonadSafe m) => FilePath -> Producer Text m (Producer ByteString m ())
266readFile file = Safe.withFile file IO.ReadMode fromHandle 230readFile file = Safe.withFile file IO.ReadMode fromHandle
267{-# INLINABLE readFile #-} 231{-# INLINABLE readFile #-}
268 232
@@ -338,7 +302,7 @@ toHandle h = for cat (liftIO . T.hPutStr h)
338 302
339 303
340-- | Stream text into a file. Uses @pipes-safe@. 304-- | Stream text into a file. Uses @pipes-safe@.
341writeFile :: (MonadSafe m, Base m ~ IO) => FilePath -> Consumer' Text m () 305writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
342writeFile file = Safe.withFile file IO.WriteMode toHandle 306writeFile file = Safe.withFile file IO.WriteMode toHandle
343 307
344-- | Apply a transformation to each 'Char' in the stream 308-- | Apply a transformation to each 'Char' in the stream
@@ -624,42 +588,18 @@ count c p = P.fold (+) 0 id (p >-> P.map (fromIntegral . T.count c))
624 588
625-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded 589-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
626-- into a Pipe of Text 590-- into a Pipe of Text
627decodeUtf8
628 :: Monad m
629 => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
630decodeUtf8 = decodeUtf8With Nothing
631{-# INLINEABLE decodeUtf8 #-}
632
633-- | Transform a Pipe of 'ByteString's expected to be UTF-8 encoded
634-- into a Pipe of Text with a replacement function of type @String -> Maybe Word8 -> Maybe Char@
635-- E.g. 'Data.Text.Encoding.Error.lenientDecode', which simply replaces bad bytes with \"�\"
636decodeUtf8With
637 :: Monad m
638 => Maybe TE.OnDecodeError
639 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
640decodeUtf8With onErr = go (PE.streamDecodeUtf8With onErr) B.empty where
641 go dec old p = do
642 x <- lift (next p)
643 case x of
644 Left r -> if B.null old then return (return r)
645 else return (do yield old
646 return r)
647 Right (chunk, p') ->
648 case dec chunk of
649 PE.Some text l dec' ->
650 if T.null text then go dec' (B.append old l) p'
651 else do yield text
652 go dec' B.empty p'
653 PE.Other text bs ->
654 if T.null text then return (do yield old
655 yield bs
656 p')
657 else do yield text
658 return (do yield bs
659 p')
660{-# INLINEABLE decodeUtf8With #-}
661
662 591
592decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
593decodeUtf8 = go PE.streamDecodeUtf8 where
594 go dec0 p = do
595 x <- lift (next p)
596 case x of Left r -> return (return r)
597 Right (chunk, p') ->
598 case dec0 chunk of PE.Some text _ dec -> do yield text
599 go dec p'
600 PE.Other text bs -> do yield text
601 return (do yield bs
602 p')
663 603
664-- | Splits a 'Producer' after the given number of characters 604-- | Splits a 'Producer' after the given number of characters
665splitAt 605splitAt
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
index 05d9887..73d6fa4 100644
--- a/Pipes/Text/Internal.hs
+++ b/Pipes/Text/Internal.hs
@@ -1,11 +1,10 @@
1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, 1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
2 UnliftedFFITypes #-} 2 UnliftedFFITypes #-}
3-- This module lifts material from Brian O'Sullivan's text package 3-- This module lifts assorted materials from Brian O'Sullivan's text package
4-- especially Data.Text.Encoding in order to define a pipes-appropriate 4-- especially Data.Text.Encoding in order to define a pipes-appropriate
5-- streamDecodeUtf8 5-- streamDecodeUtf8
6module Pipes.Text.Internal 6module Pipes.Text.Internal
7 ( Decoding(..) 7 ( Decoding(..)
8 , streamDecodeUtf8With
9 , streamDecodeUtf8 8 , streamDecodeUtf8
10 ) where 9 ) where
11 10
@@ -20,6 +19,7 @@ import Data.Bits ((.&.))
20import Data.ByteString as B 19import Data.ByteString as B
21import Data.ByteString.Internal as B 20import Data.ByteString.Internal as B
22import Data.Text () 21import Data.Text ()
22import qualified Data.Text as T
23import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) 23import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
24import Data.Text.Internal (Text(..), safe, textP) 24import Data.Text.Internal (Text(..), safe, textP)
25import Data.Word (Word8, Word32) 25import Data.Word (Word8, Word32)
@@ -56,94 +56,52 @@ instance Show Decoding where
56newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) 56newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
57newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) 57newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
58 58
59-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
60-- encoded text that is known to be valid.
61--
62-- If the input contains any invalid UTF-8 data, an exception will be
63-- thrown (either by this function or a continuation) that cannot be
64-- caught in pure code. For more control over the handling of invalid
65-- data, use 'streamDecodeUtf8With'.
66streamDecodeUtf8 :: ByteString -> Decoding 59streamDecodeUtf8 :: ByteString -> Decoding
67streamDecodeUtf8 = streamDecodeUtf8With (Just strictDecode) 60streamDecodeUtf8 = decodeChunk B.empty 0 0
68 61
69-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 62decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
70-- encoded text. 63decodeChunk old codepoint0 state0 bs@(PS fp off len) =
71streamDecodeUtf8With :: Maybe OnDecodeError -> ByteString -> Decoding 64 runST $ do marray <- A.new (len+1)
72streamDecodeUtf8With mErr = case mErr of 65 unsafeIOToST (decodeChunkToBuffer marray)
73 Nothing -> decodeWith False strictDecode
74 Just onErr -> decodeWith True onErr
75 where
76 -- We create a slightly larger than necessary buffer to accommodate a
77 -- potential surrogate pair started in the last buffer
78 decodeWith replace onErr = decodeChunk 0 0
79 where
80 decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
81 decodeChunk codepoint0 state0 bs@(PS fp off len) =
82 runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
83 where 66 where
84 decodeChunkToBuffer :: A.MArray s -> IO Decoding 67 decodeChunkToBuffer :: A.MArray s -> IO Decoding
85 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> 68 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
86 with (0::CSize) $ \destOffPtr -> 69 with (0::CSize) $ \destOffPtr ->
87 with codepoint0 $ \codepointPtr -> 70 with codepoint0 $ \codepointPtr ->
88 with state0 $ \statePtr -> 71 with state0 $ \statePtr ->
89 with nullPtr $ \curPtrPtr -> 72 with nullPtr $ \curPtrPtr ->
90 let end = ptr `plusPtr` (off + len) 73 do let end = ptr `plusPtr` (off + len)
91 loop curPtr = do 74 curPtr = ptr `plusPtr` off
92 poke curPtrPtr curPtr 75 poke curPtrPtr curPtr
93 curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr 76 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
94 curPtrPtr end codepointPtr statePtr 77 state <- peek statePtr
95 state <- peek statePtr 78 lastPtr <- peek curPtrPtr
96 case state of 79 codepoint <- peek codepointPtr
97 UTF8_REJECT -> 80 n <- peek destOffPtr
98 -- We encountered an encoding error 81 chunkText <- mkText dest n
99 if replace 82 let left = lastPtr `minusPtr` curPtr
100 then do 83 remaining = B.drop left bs
101 x <- peek curPtr' 84 accum = if T.null chunkText then B.append old remaining else remaining
102 case onErr desc (Just x) of 85 return $ case state of
103 Nothing -> loop $ curPtr' `plusPtr` 1 86 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
104 Just c -> do 87 _ -> Some chunkText accum (decodeChunk accum codepoint state)
105 destOff <- peek destOffPtr 88
106 w <- unsafeSTToIO $ 89
107 unsafeWrite dest (fromIntegral destOff) (safe c) 90mkText :: A.MArray s -> CSize -> IO Text
108 poke destOffPtr (destOff + fromIntegral w) 91mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
109 poke statePtr 0 92 return $! textP arr 0 (fromIntegral n)
110 loop $ curPtr' `plusPtr` 1
111 else do
112 n <- peek destOffPtr
113 chunkText <- unsafeSTToIO $ do
114 arr <- A.unsafeFreeze dest
115 return $! textP arr 0 (fromIntegral n)
116 lastPtr <- peek curPtrPtr
117 let left = lastPtr `minusPtr` curPtr
118 return $ Other chunkText (B.drop left bs)
119 _ -> do
120 -- We encountered the end of the buffer while decoding
121 n <- peek destOffPtr
122 codepoint <- peek codepointPtr
123 chunkText <- unsafeSTToIO $ do
124 arr <- A.unsafeFreeze dest
125 return $! textP arr 0 (fromIntegral n)
126 lastPtr <- peek curPtrPtr
127 let left = lastPtr `minusPtr` curPtr
128 return $ Some chunkText (B.drop left bs)
129 (decodeChunk codepoint state)
130 in loop (ptr `plusPtr` off)
131 desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
132 93
133ord :: Char -> Int 94ord :: Char -> Int
134ord (C# c#) = I# (ord# c#) 95ord (C# c#) = I# (ord# c#)
135{-# INLINE ord #-} 96{-# INLINE ord #-}
136 97
137
138unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int 98unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
139unsafeWrite marr i c 99unsafeWrite marr i c
140 | n < 0x10000 = do 100 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
141 A.unsafeWrite marr i (fromIntegral n) 101 return 1
142 return 1 102 | otherwise = do A.unsafeWrite marr i lo
143 | otherwise = do 103 A.unsafeWrite marr (i+1) hi
144 A.unsafeWrite marr i lo 104 return 2
145 A.unsafeWrite marr (i+1) hi
146 return 2
147 where n = ord c 105 where n = ord c
148 m = n - 0x10000 106 m = n - 0x10000
149 lo = fromIntegral $ (m `shiftR` 10) + 0xD800 107 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
diff --git a/test/Test.hs b/test/Test.hs
index 1579f2b..66351d1 100644
--- a/test/Test.hs
+++ b/test/Test.hs
@@ -8,6 +8,7 @@ import Test.Framework.Providers.QuickCheck2 (testProperty)
8import Control.Exception (catch) 8import Control.Exception (catch)
9import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord) 9import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord)
10import Data.Monoid (Monoid(..)) 10import Data.Monoid (Monoid(..))
11import Control.Monad
11import Data.String (fromString) 12import Data.String (fromString)
12import Data.Text.Encoding.Error 13import Data.Text.Encoding.Error
13import qualified Data.List as L 14import qualified Data.List as L
@@ -19,15 +20,20 @@ import qualified Data.Text as T
19import qualified Data.Text.Lazy as TL 20import qualified Data.Text.Lazy as TL
20import qualified Data.Text.Encoding as E 21import qualified Data.Text.Encoding as E
21import qualified Pipes.Text.Internal as PE 22import qualified Pipes.Text.Internal as PE
23import qualified Pipes.Text as TP
24import qualified Pipes.ByteString as BP
25import qualified Pipes as P
22 26
27
28import Debug.Trace
23main :: IO () 29main :: IO ()
24main = defaultMain [tests] 30main = defaultMain [tests]
25-- >>> :main -a 10000 31-- >>> :main -a 10000
26 32
27tests = testGroup "stream_decode" [ 33tests = testGroup "stream_decode" [
28 34 -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid,
29 testProperty "t_utf8_incr_valid" t_utf8_incr_valid, 35 testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed,
30 testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed] 36 testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe]
31 37
32t_utf8_incr_valid = do 38t_utf8_incr_valid = do
33 Positive n <- arbitrary 39 Positive n <- arbitrary
@@ -43,18 +49,46 @@ t_utf8_incr_valid = do
43 PE.Some t _ f' -> t : feedChunksOf n f' b 49 PE.Some t _ f' -> t : feedChunksOf n f' b
44 _ -> [] 50 _ -> []
45 51
46t_utf8_incr_mixed = do 52t_utf8_incr_mixed = do
47 Positive n <- arbitrary 53 Positive n <- arbitrary
48 txt <- genUnicode 54 txt <- genUnicode
49 forAll (vector 9) $ (roundtrip . chunk (mod n 7 + 1) . appendBytes txt) `eq` appendBytes txt 55 let chunkSize = mod n 7 + 1
56 forAll (vector 9) $
57 (roundtrip . chunk chunkSize . appendBytes txt) `eq` (appendBytes txt)
50 where 58 where
51 roundtrip :: [B.ByteString] -> B.ByteString 59 roundtrip :: [B.ByteString] -> B.ByteString
52 roundtrip bss = go (PE.streamDecodeUtf8With Nothing) B.empty B.empty bss where 60 roundtrip bss = go PE.streamDecodeUtf8 B.empty bss where
53 go dec acc old [] = acc <> old 61 go dec acc [] = acc
54 go dec acc old (bs:bss) = case dec bs of 62 go dec acc [bs] = case dec bs of
55 PE.Some t new dec' -> if T.null t then go dec' (acc <> E.encodeUtf8 t) (old <> new) bss 63 PE.Some t l dec' -> acc <> E.encodeUtf8 t <> l
56 else go dec' (acc <> E.encodeUtf8 t) new bss 64 PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs'
57 PE.Other t bs' -> if T.null t then acc <> old <> bs <> B.concat bss 65 go dec acc (bs:bss) = case dec bs of
58 else acc <> E.encodeUtf8 t <> bs' <> B.concat bss 66 PE.Some t l dec' -> go dec' (acc <> E.encodeUtf8 t) bss
67 PE.Other t bs' -> acc <> E.encodeUtf8 t <> bs' <> B.concat bss
68 chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
69 appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
70
71
72
73
74t_utf8_incr_pipe = do
75 Positive m <- arbitrary
76 Positive n <- arbitrary
77 txt <- genUnicode
78 let chunkSize = mod n 7 + 1
79 bytesLength = mod 20 m
80 forAll (vector bytesLength) $
81 (BL.toStrict . BP.toLazy . roundtrip . P.each . chunk chunkSize . appendBytes txt)
82 `eq`
83 appendBytes txt
84 where
85 roundtrip :: Monad m => P.Producer B.ByteString m r -> P.Producer B.ByteString m r
86 roundtrip p = do pbs <- TP.decodeUtf8 p P.>-> TP.encodeUtf8
87 pbs
59 chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b 88 chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
60 appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append 89 appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append
90
91
92
93
94