diff options
-rw-r--r-- | Pipes/Text.hs | 74 | ||||
-rw-r--r-- | Pipes/Text/Internal.hs | 264 | ||||
-rw-r--r-- | Pipes/Text/Parse.hs | 18 | ||||
-rw-r--r-- | pipes-text.cabal | 4 | ||||
-rw-r--r-- | test/Test.hs | 17 |
5 files changed, 318 insertions, 59 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs index cf493e9..99e4ed6 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs | |||
@@ -1,9 +1,12 @@ | |||
1 | {-# LANGUAGE RankNTypes, TypeFamilies, BangPatterns #-} | 1 | {-# LANGUAGE RankNTypes, TypeFamilies, BangPatterns #-} |
2 | 2 | #if __GLASGOW_HASKELL__ >= 702 | |
3 | {-# LANGUAGE Trustworthy #-} | ||
4 | #endif | ||
3 | {-| This module provides @pipes@ utilities for \"text streams\", which are | 5 | {-| This module provides @pipes@ utilities for \"text streams\", which are |
4 | streams of 'Text' chunks. The individual chunks are uniformly @strict@, but | 6 | streams of 'Text' chunks. The individual chunks are uniformly @strict@, but |
5 | a 'Producer' can be converted to and from lazy 'Text's; an 'IO.Handle' can | 7 | a 'Producer' can be converted to and from lazy 'Text's, though this is generally |
6 | be associated with a 'Producer' or 'Consumer' according as it is read or written to. | 8 | unwise. Where pipes IO replaces lazy IO, 'Producer Text m r' replaces lazy 'Text'. |
9 | An 'IO.Handle' can be associated with a 'Producer' or 'Consumer' according as it is read or written to. | ||
7 | 10 | ||
8 | To stream to or from 'IO.Handle's, one can use 'fromHandle' or 'toHandle'. For | 11 | To stream to or from 'IO.Handle's, one can use 'fromHandle' or 'toHandle'. For |
9 | example, the following program copies a document from one file to another: | 12 | example, the following program copies a document from one file to another: |
@@ -52,9 +55,9 @@ To stream from files, the following is perhaps more Prelude-like (note that it u | |||
52 | 55 | ||
53 | Note that functions in this library are designed to operate on streams that | 56 | Note that functions in this library are designed to operate on streams that |
54 | are insensitive to text boundaries. This means that they may freely split | 57 | are insensitive to text boundaries. This means that they may freely split |
55 | text into smaller texts and /discard empty texts/. However, they will | 58 | text into smaller texts, /discard empty texts/. However, apart from the |
56 | /never concatenate texts/ in order to provide strict upper bounds on memory | 59 | special case of 'concatMap', they will /never concatenate texts/ in order |
57 | usage. | 60 | to provide strict upper bounds on memory usage -- with the single exception of 'concatMap'. |
58 | -} | 61 | -} |
59 | 62 | ||
60 | module Pipes.Text ( | 63 | module Pipes.Text ( |
@@ -91,7 +94,7 @@ module Pipes.Text ( | |||
91 | -- * Folds | 94 | -- * Folds |
92 | toLazy, | 95 | toLazy, |
93 | toLazyM, | 96 | toLazyM, |
94 | fold, | 97 | foldChars, |
95 | head, | 98 | head, |
96 | last, | 99 | last, |
97 | null, | 100 | null, |
@@ -116,6 +119,7 @@ module Pipes.Text ( | |||
116 | lines, | 119 | lines, |
117 | words, | 120 | words, |
118 | decodeUtf8, | 121 | decodeUtf8, |
122 | decode, | ||
119 | -- * Transformations | 123 | -- * Transformations |
120 | intersperse, | 124 | intersperse, |
121 | 125 | ||
@@ -139,7 +143,7 @@ module Pipes.Text ( | |||
139 | ) where | 143 | ) where |
140 | 144 | ||
141 | import Control.Exception (throwIO, try) | 145 | import Control.Exception (throwIO, try) |
142 | import Control.Monad (liftM, unless) | 146 | import Control.Monad (liftM, unless, join) |
143 | import Control.Monad.Trans.State.Strict (StateT(..)) | 147 | import Control.Monad.Trans.State.Strict (StateT(..)) |
144 | import Data.Monoid ((<>)) | 148 | import Data.Monoid ((<>)) |
145 | import qualified Data.Text as T | 149 | import qualified Data.Text as T |
@@ -160,13 +164,14 @@ import Foreign.C.Error (Errno(Errno), ePIPE) | |||
160 | import qualified GHC.IO.Exception as G | 164 | import qualified GHC.IO.Exception as G |
161 | import Pipes | 165 | import Pipes |
162 | import qualified Pipes.ByteString as PB | 166 | import qualified Pipes.ByteString as PB |
163 | import qualified Pipes.ByteString.Parse as PBP | 167 | import qualified Pipes.ByteString as PBP |
164 | import qualified Pipes.Text.Internal as PE | 168 | import qualified Pipes.Text.Internal as PE |
169 | import Pipes.Text.Internal (Codec(..)) | ||
165 | import Pipes.Text.Parse ( | 170 | import Pipes.Text.Parse ( |
166 | nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) | 171 | nextChar, drawChar, unDrawChar, peekChar, isEndOfChars ) |
167 | import Pipes.Core (respond, Server') | 172 | import Pipes.Core (respond, Server') |
168 | import qualified Pipes.Parse as PP | 173 | import qualified Pipes.Parse as PP |
169 | import Pipes.Parse (input, concat, FreeT) | 174 | import Pipes.Parse ( FreeT) |
170 | import qualified Pipes.Safe.Prelude as Safe | 175 | import qualified Pipes.Safe.Prelude as Safe |
171 | import qualified Pipes.Safe as Safe | 176 | import qualified Pipes.Safe as Safe |
172 | import Pipes.Safe (MonadSafe(..), Base(..)) | 177 | import Pipes.Safe (MonadSafe(..), Base(..)) |
@@ -499,10 +504,10 @@ toLazyM = liftM TL.fromChunks . P.toListM | |||
499 | {-# INLINABLE toLazyM #-} | 504 | {-# INLINABLE toLazyM #-} |
500 | 505 | ||
501 | -- | Reduce the text stream using a strict left fold over characters | 506 | -- | Reduce the text stream using a strict left fold over characters |
502 | fold | 507 | foldChars |
503 | :: Monad m | 508 | :: Monad m |
504 | => (x -> Char -> x) -> x -> (x -> r) -> Producer Text m () -> m r | 509 | => (x -> Char -> x) -> x -> (x -> r) -> Producer Text m () -> m r |
505 | fold step begin done = P.fold (T.foldl' step) begin done | 510 | foldChars step begin done = P.fold (T.foldl' step) begin done |
506 | {-# INLINABLE fold #-} | 511 | {-# INLINABLE fold #-} |
507 | 512 | ||
508 | -- | Retrieve the first 'Char' | 513 | -- | Retrieve the first 'Char' |
@@ -879,4 +884,45 @@ unwords = intercalate (yield $ T.pack " ") | |||
879 | @Data.Text@ re-exports the 'Text' type. | 884 | @Data.Text@ re-exports the 'Text' type. |
880 | 885 | ||
881 | @Pipes.Parse@ re-exports 'input', 'concat', and 'FreeT' (the type). | 886 | @Pipes.Parse@ re-exports 'input', 'concat', and 'FreeT' (the type). |
882 | -} \ No newline at end of file | 887 | -} |
888 | |||
889 | |||
890 | |||
891 | decode :: Monad m => PE.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
892 | -- decode codec = go B.empty where | ||
893 | -- go extra p0 = | ||
894 | -- do x <- lift (next p0) | ||
895 | -- case x of Right (chunk, p) -> | ||
896 | -- do let (text, stuff) = codecDecode codec (B.append extra chunk) | ||
897 | -- yield text | ||
898 | -- case stuff of Right extra' -> go extra' p | ||
899 | -- Left (exc,bs) -> do yield text | ||
900 | -- return (do yield bs | ||
901 | -- p) | ||
902 | -- Left r -> return (do yield extra | ||
903 | -- return r) | ||
904 | |||
905 | decode d p0 = case d of | ||
906 | PE.Other txt bad -> do yield txt | ||
907 | return (do yield bad | ||
908 | p0) | ||
909 | PE.Some txt extra dec -> do yield txt | ||
910 | x <- lift (next p0) | ||
911 | case x of Left r -> return (do yield extra | ||
912 | return r) | ||
913 | Right (chunk,p1) -> decode (dec chunk) p1 | ||
914 | |||
915 | -- go !carry dec0 p = do | ||
916 | -- x <- lift (next p) | ||
917 | -- case x of Left r -> if B.null carry | ||
918 | -- then return (return r) -- all bytestrinput was consumed | ||
919 | -- else return (do yield carry -- a potentially valid fragment remains | ||
920 | -- return r) | ||
921 | -- | ||
922 | -- Right (chunk, p') -> case dec0 chunk of | ||
923 | -- PE.Some text carry2 dec -> do yield text | ||
924 | -- go carry2 dec p' | ||
925 | -- PE.Other text bs -> do yield text | ||
926 | -- return (do yield bs -- an invalid blob remains | ||
927 | -- p') | ||
928 | -- {-# INLINABLE decodeUtf8 #-} | ||
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index 7e5b044..76c2f4f 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs | |||
@@ -1,5 +1,7 @@ | |||
1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, | 1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} |
2 | UnliftedFFITypes #-} | 2 | {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} |
3 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | ||
4 | |||
3 | -- This module lifts assorted materials from Brian O'Sullivan's text package | 5 | -- This module lifts assorted materials from Brian O'Sullivan's text package |
4 | -- especially Data.Text.Encoding in order to define a pipes-appropriate | 6 | -- especially Data.Text.Encoding in order to define a pipes-appropriate |
5 | -- streamDecodeUtf8 | 7 | -- streamDecodeUtf8 |
@@ -7,13 +9,20 @@ module Pipes.Text.Internal | |||
7 | ( Decoding(..) | 9 | ( Decoding(..) |
8 | , streamDecodeUtf8 | 10 | , streamDecodeUtf8 |
9 | , decodeSomeUtf8 | 11 | , decodeSomeUtf8 |
12 | , Codec(..) | ||
13 | , TextException(..) | ||
14 | , utf8 | ||
10 | ) where | 15 | ) where |
11 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | 16 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) |
12 | import Control.Monad.ST (ST, runST) | 17 | import Control.Monad.ST (ST, runST) |
13 | import Data.Bits ((.&.)) | 18 | import Data.Bits ((.&.)) |
14 | import Data.ByteString as B | 19 | import Data.ByteString as B |
20 | import Data.ByteString (ByteString) | ||
15 | import Data.ByteString.Internal as B | 21 | import Data.ByteString.Internal as B |
16 | import qualified Data.Text as T (null) | 22 | import Data.ByteString.Char8 as B8 |
23 | import Data.Text (Text) | ||
24 | import qualified Data.Text as T | ||
25 | import qualified Data.Text.Encoding as TE | ||
17 | import Data.Text.Encoding.Error () | 26 | import Data.Text.Encoding.Error () |
18 | import Data.Text.Internal (Text, textP) | 27 | import Data.Text.Internal (Text, textP) |
19 | import Foreign.C.Types (CSize) | 28 | import Foreign.C.Types (CSize) |
@@ -24,9 +33,226 @@ import Foreign.Storable (Storable, peek, poke) | |||
24 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) | 33 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) |
25 | import GHC.Word (Word8, Word32) | 34 | import GHC.Word (Word8, Word32) |
26 | import qualified Data.Text.Array as A | 35 | import qualified Data.Text.Array as A |
27 | 36 | import Data.Word (Word8, Word16) | |
37 | import System.IO.Unsafe (unsafePerformIO) | ||
38 | import qualified Control.Exception as Exc | ||
39 | import Data.Bits ((.&.), (.|.), shiftL) | ||
40 | import Data.Typeable | ||
41 | import Control.Arrow (first) | ||
42 | import Data.Maybe (catMaybes) | ||
28 | #include "pipes_text_cbits.h" | 43 | #include "pipes_text_cbits.h" |
29 | 44 | ||
45 | |||
46 | -- | A specific character encoding. | ||
47 | -- | ||
48 | -- Since 0.3.0 | ||
49 | data Codec = Codec | ||
50 | { codecName :: Text | ||
51 | , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) | ||
52 | , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString) | ||
53 | } | ||
54 | |||
55 | instance Show Codec where | ||
56 | showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c) | ||
57 | |||
58 | -- Since 0.3.0 | ||
59 | data TextException = DecodeException Codec Word8 | ||
60 | | EncodeException Codec Char | ||
61 | | LengthExceeded Int | ||
62 | | TextException Exc.SomeException | ||
63 | deriving (Show, Typeable) | ||
64 | instance Exc.Exception TextException | ||
65 | |||
66 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) | ||
67 | -> (ByteString -> Decoding) | ||
68 | toDecoding op = loop B.empty where | ||
69 | loop extra bs0 = case op (B.append extra bs0) of | ||
70 | (txt, Right bs) -> Some txt bs (loop bs) | ||
71 | (txt, Left (_,bs)) -> Other txt bs | ||
72 | |||
73 | |||
74 | splitSlowly :: (ByteString -> Text) | ||
75 | -> ByteString | ||
76 | -> (Text, Either (TextException, ByteString) ByteString) | ||
77 | splitSlowly dec bytes = valid where | ||
78 | valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) | ||
79 | splits 0 = [(B.empty, bytes)] | ||
80 | splits n = B.splitAt n bytes : splits (n - 1) | ||
81 | decFirst (a, b) = case tryEvaluate (dec a) of | ||
82 | Left _ -> Nothing | ||
83 | Right text -> let trouble = case tryEvaluate (dec b) of | ||
84 | Left exc -> Left (TextException exc, b) | ||
85 | Right _ -> Right B.empty | ||
86 | in Just (text, trouble) | ||
87 | -- this case shouldn't occur, | ||
88 | -- since splitSlowly is only called | ||
89 | -- when parsing failed somewhere | ||
90 | |||
91 | utf8 :: Codec | ||
92 | utf8 = Codec name enc (toDecoding dec) where | ||
93 | name = T.pack "UTF-8" | ||
94 | enc text = (TE.encodeUtf8 text, Nothing) | ||
95 | dec bytes = case decodeSomeUtf8 bytes of | ||
96 | (t,b) -> (t, Right b) | ||
97 | |||
98 | -- -- Whether the given byte is a continuation byte. | ||
99 | -- isContinuation byte = byte .&. 0xC0 == 0x80 | ||
100 | -- | ||
101 | -- -- The number of continuation bytes needed by the given | ||
102 | -- -- non-continuation byte. Returns -1 for an illegal UTF-8 | ||
103 | -- -- non-continuation byte and the whole split quickly must fail so | ||
104 | -- -- as the input is passed to TE.decodeUtf8, which will issue a | ||
105 | -- -- suitable error. | ||
106 | -- required x0 | ||
107 | -- | x0 .&. 0x80 == 0x00 = 0 | ||
108 | -- | x0 .&. 0xE0 == 0xC0 = 1 | ||
109 | -- | x0 .&. 0xF0 == 0xE0 = 2 | ||
110 | -- | x0 .&. 0xF8 == 0xF0 = 3 | ||
111 | -- | otherwise = -1 | ||
112 | -- | ||
113 | -- splitQuickly bytes | ||
114 | -- | B.null l || req == -1 = Nothing | ||
115 | -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) | ||
116 | -- | otherwise = Just (TE.decodeUtf8 l', r') | ||
117 | -- where | ||
118 | -- (l, r) = B.spanEnd isContinuation bytes | ||
119 | -- req = required (B.last l) | ||
120 | -- l' = B.init l | ||
121 | -- r' = B.cons (B.last l) r | ||
122 | |||
123 | -- | | ||
124 | -- Since 0.3.0 | ||
125 | utf16_le :: Codec | ||
126 | utf16_le = Codec name enc (toDecoding dec) where | ||
127 | name = T.pack "UTF-16-LE" | ||
128 | enc text = (TE.encodeUtf16LE text, Nothing) | ||
129 | dec bytes = case splitQuickly bytes of | ||
130 | Just (text, extra) -> (text, Right extra) | ||
131 | Nothing -> splitSlowly TE.decodeUtf16LE bytes | ||
132 | |||
133 | splitQuickly bytes = maybeDecode (loop 0) where | ||
134 | maxN = B.length bytes | ||
135 | |||
136 | loop n | n == maxN = decodeAll | ||
137 | | (n + 1) == maxN = decodeTo n | ||
138 | loop n = let | ||
139 | req = utf16Required | ||
140 | (B.index bytes n) | ||
141 | (B.index bytes (n + 1)) | ||
142 | decodeMore = loop $! n + req | ||
143 | in if n + req > maxN | ||
144 | then decodeTo n | ||
145 | else decodeMore | ||
146 | |||
147 | decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) | ||
148 | decodeAll = (TE.decodeUtf16LE bytes, B.empty) | ||
149 | |||
150 | -- | | ||
151 | -- Since 0.3.0 | ||
152 | utf16_be :: Codec | ||
153 | utf16_be = Codec name enc (toDecoding dec) where | ||
154 | name = T.pack "UTF-16-BE" | ||
155 | enc text = (TE.encodeUtf16BE text, Nothing) | ||
156 | dec bytes = case splitQuickly bytes of | ||
157 | Just (text, extra) -> (text, Right extra) | ||
158 | Nothing -> splitSlowly TE.decodeUtf16BE bytes | ||
159 | |||
160 | splitQuickly bytes = maybeDecode (loop 0) where | ||
161 | maxN = B.length bytes | ||
162 | |||
163 | loop n | n == maxN = decodeAll | ||
164 | | (n + 1) == maxN = decodeTo n | ||
165 | loop n = let | ||
166 | req = utf16Required | ||
167 | (B.index bytes (n + 1)) | ||
168 | (B.index bytes n) | ||
169 | decodeMore = loop $! n + req | ||
170 | in if n + req > maxN | ||
171 | then decodeTo n | ||
172 | else decodeMore | ||
173 | |||
174 | decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) | ||
175 | decodeAll = (TE.decodeUtf16BE bytes, B.empty) | ||
176 | |||
177 | utf16Required :: Word8 -> Word8 -> Int | ||
178 | utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where | ||
179 | x :: Word16 | ||
180 | x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 | ||
181 | |||
182 | -- | | ||
183 | -- Since 0.3.0 | ||
184 | utf32_le :: Codec | ||
185 | utf32_le = Codec name enc (toDecoding dec) where | ||
186 | name = T.pack "UTF-32-LE" | ||
187 | enc text = (TE.encodeUtf32LE text, Nothing) | ||
188 | dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of | ||
189 | Just (text, extra) -> (text, Right extra) | ||
190 | Nothing -> splitSlowly TE.decodeUtf32LE bs | ||
191 | |||
192 | -- | | ||
193 | -- Since 0.3.0 | ||
194 | utf32_be :: Codec | ||
195 | utf32_be = Codec name enc (toDecoding dec) where | ||
196 | name = T.pack "UTF-32-BE" | ||
197 | enc text = (TE.encodeUtf32BE text, Nothing) | ||
198 | dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of | ||
199 | Just (text, extra) -> (text, Right extra) | ||
200 | Nothing -> splitSlowly TE.decodeUtf32BE bs | ||
201 | |||
202 | utf32SplitBytes :: (ByteString -> Text) | ||
203 | -> ByteString | ||
204 | -> Maybe (Text, ByteString) | ||
205 | utf32SplitBytes dec bytes = split where | ||
206 | split = maybeDecode (dec toDecode, extra) | ||
207 | len = B.length bytes | ||
208 | lenExtra = mod len 4 | ||
209 | |||
210 | lenToDecode = len - lenExtra | ||
211 | (toDecode, extra) = if lenExtra == 0 | ||
212 | then (bytes, B.empty) | ||
213 | else B.splitAt lenToDecode bytes | ||
214 | |||
215 | -- | | ||
216 | -- Since 0.3.0 | ||
217 | ascii :: Codec | ||
218 | ascii = Codec name enc (toDecoding dec) where | ||
219 | name = T.pack "ASCII" | ||
220 | enc text = (bytes, extra) where | ||
221 | (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text | ||
222 | bytes = B8.pack (T.unpack safe) | ||
223 | extra = if T.null unsafe | ||
224 | then Nothing | ||
225 | else Just (EncodeException ascii (T.head unsafe), unsafe) | ||
226 | |||
227 | dec bytes = (text, extra) where | ||
228 | (safe, unsafe) = B.span (<= 0x7F) bytes | ||
229 | text = T.pack (B8.unpack safe) | ||
230 | extra = if B.null unsafe | ||
231 | then Right B.empty | ||
232 | else Left (DecodeException ascii (B.head unsafe), unsafe) | ||
233 | |||
234 | -- | | ||
235 | -- Since 0.3.0 | ||
236 | iso8859_1 :: Codec | ||
237 | iso8859_1 = Codec name enc (toDecoding dec) where | ||
238 | name = T.pack "ISO-8859-1" | ||
239 | enc text = (bytes, extra) where | ||
240 | (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text | ||
241 | bytes = B8.pack (T.unpack safe) | ||
242 | extra = if T.null unsafe | ||
243 | then Nothing | ||
244 | else Just (EncodeException iso8859_1 (T.head unsafe), unsafe) | ||
245 | |||
246 | dec bytes = (T.pack (B8.unpack bytes), Right B.empty) | ||
247 | |||
248 | tryEvaluate :: a -> Either Exc.SomeException a | ||
249 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate | ||
250 | |||
251 | maybeDecode :: (a, b) -> Maybe (a, b) | ||
252 | maybeDecode (a, b) = case tryEvaluate a of | ||
253 | Left _ -> Nothing | ||
254 | Right _ -> Just (a, b) | ||
255 | |||
30 | -- | A stream oriented decoding result. | 256 | -- | A stream oriented decoding result. |
31 | data Decoding = Some Text ByteString (ByteString -> Decoding) | 257 | data Decoding = Some Text ByteString (ByteString -> Decoding) |
32 | | Other Text ByteString | 258 | | Other Text ByteString |
@@ -103,36 +329,6 @@ decodeSomeUtf8 bs@(PS fp off len) = runST $ do | |||
103 | return $! (chunkText, remaining) | 329 | return $! (chunkText, remaining) |
104 | {-# INLINE decodeSomeUtf8 #-} | 330 | {-# INLINE decodeSomeUtf8 #-} |
105 | 331 | ||
106 | -- decodeSomeUtf8 :: ByteString -> (Text, ByteString) | ||
107 | -- decodeSomeUtf8 bs@(PS fp off len) = | ||
108 | -- runST $ do marray <- A.new (len+1) | ||
109 | -- unsafeIOToST (decodeChunkToBuffer marray) | ||
110 | -- | ||
111 | -- where | ||
112 | -- decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString) | ||
113 | -- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
114 | -- with (0::CSize) $ \destOffPtr -> | ||
115 | -- with (0::CodePoint) $ \codepointPtr -> | ||
116 | -- with (0::DecoderState) $ \statePtr -> | ||
117 | -- with nullPtr $ \curPtrPtr -> | ||
118 | -- do let end = ptr `plusPtr` (off + len) | ||
119 | -- curPtr = ptr `plusPtr` off | ||
120 | -- poke curPtrPtr curPtr | ||
121 | -- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | ||
122 | -- state <- peek statePtr | ||
123 | -- lastPtr <- peek curPtrPtr | ||
124 | -- codepoint <- peek codepointPtr | ||
125 | -- n <- peek destOffPtr | ||
126 | -- chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
127 | -- return $! textP arr 0 (fromIntegral n) | ||
128 | -- let left = lastPtr `minusPtr` curPtr | ||
129 | -- remaining = B.drop left bs | ||
130 | -- return $! (chunkText, remaining) | ||
131 | -- {-# INLINE decodeChunkToBuffer #-} | ||
132 | -- {-# INLINE decodeSomeUtf8 #-} | ||
133 | |||
134 | |||
135 | |||
136 | mkText :: A.MArray s -> CSize -> IO Text | 332 | mkText :: A.MArray s -> CSize -> IO Text |
137 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | 333 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest |
138 | return $! textP arr 0 (fromIntegral n) | 334 | return $! textP arr 0 (fromIntegral n) |
diff --git a/Pipes/Text/Parse.hs b/Pipes/Text/Parse.hs index ed0afa1..9cabaa6 100644 --- a/Pipes/Text/Parse.hs +++ b/Pipes/Text/Parse.hs | |||
@@ -44,16 +44,16 @@ nextChar = go | |||
44 | {-| Draw one 'Char' from the underlying 'Producer', returning 'Left' if the | 44 | {-| Draw one 'Char' from the underlying 'Producer', returning 'Left' if the |
45 | 'Producer' is empty | 45 | 'Producer' is empty |
46 | -} | 46 | -} |
47 | drawChar :: (Monad m) => StateT (Producer Text m r) m (Either r Char) | 47 | drawChar :: (Monad m) => StateT (Producer Text m r) m (Maybe Char) |
48 | drawChar = do | 48 | drawChar = do |
49 | x <- PP.draw | 49 | x <- PP.draw |
50 | case x of | 50 | case x of |
51 | Left r -> return (Left r) | 51 | Nothing -> return Nothing |
52 | Right txt -> case (T.uncons txt) of | 52 | Just txt -> case (T.uncons txt) of |
53 | Nothing -> drawChar | 53 | Nothing -> drawChar |
54 | Just (c, txt') -> do | 54 | Just (c, txt') -> do |
55 | PP.unDraw txt' | 55 | PP.unDraw txt' |
56 | return (Right c) | 56 | return (Just c) |
57 | {-# INLINABLE drawChar #-} | 57 | {-# INLINABLE drawChar #-} |
58 | 58 | ||
59 | -- | Push back a 'Char' onto the underlying 'Producer' | 59 | -- | Push back a 'Char' onto the underlying 'Producer' |
@@ -71,12 +71,12 @@ unDrawChar c = modify (yield (T.singleton c) >>) | |||
71 | > Right c -> unDrawChar c | 71 | > Right c -> unDrawChar c |
72 | > return x | 72 | > return x |
73 | -} | 73 | -} |
74 | peekChar :: (Monad m) => StateT (Producer Text m r) m (Either r Char) | 74 | peekChar :: (Monad m) => StateT (Producer Text m r) m (Maybe Char) |
75 | peekChar = do | 75 | peekChar = do |
76 | x <- drawChar | 76 | x <- drawChar |
77 | case x of | 77 | case x of |
78 | Left _ -> return () | 78 | Nothing -> return () |
79 | Right c -> unDrawChar c | 79 | Just c -> unDrawChar c |
80 | return x | 80 | return x |
81 | {-# INLINABLE peekChar #-} | 81 | {-# INLINABLE peekChar #-} |
82 | 82 | ||
@@ -91,8 +91,8 @@ isEndOfChars :: (Monad m) => StateT (Producer Text m r) m Bool | |||
91 | isEndOfChars = do | 91 | isEndOfChars = do |
92 | x <- peekChar | 92 | x <- peekChar |
93 | return (case x of | 93 | return (case x of |
94 | Left _ -> True | 94 | Nothing -> True |
95 | Right _ -> False ) | 95 | Just _-> False ) |
96 | {-# INLINABLE isEndOfChars #-} | 96 | {-# INLINABLE isEndOfChars #-} |
97 | 97 | ||
98 | {-| @(take n)@ only allows @n@ characters to pass | 98 | {-| @(take n)@ only allows @n@ characters to pass |
diff --git a/pipes-text.cabal b/pipes-text.cabal index b4388be..4e77d1b 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal | |||
@@ -20,9 +20,9 @@ library | |||
20 | build-depends: base >= 4 && < 5 , | 20 | build-depends: base >= 4 && < 5 , |
21 | transformers >= 0.2.0.0 && < 0.4, | 21 | transformers >= 0.2.0.0 && < 0.4, |
22 | pipes >=4.0 && < 4.2, | 22 | pipes >=4.0 && < 4.2, |
23 | pipes-parse >=2.0 && < 2.2, | 23 | pipes-parse >=2.0 && < 3.1, |
24 | pipes-safe, | 24 | pipes-safe, |
25 | pipes-bytestring >= 1.0 && < 1.2, | 25 | pipes-bytestring >= 1.0 && < 2.1, |
26 | transformers >= 0.3 && < 0.4, | 26 | transformers >= 0.3 && < 0.4, |
27 | text >=0.11 && < 0.12, | 27 | text >=0.11 && < 0.12, |
28 | bytestring >=0.10 && < 0.11, | 28 | bytestring >=0.10 && < 0.11, |
diff --git a/test/Test.hs b/test/Test.hs index 373bafb..7832f76 100644 --- a/test/Test.hs +++ b/test/Test.hs | |||
@@ -31,6 +31,7 @@ tests = testGroup "stream_decode" [ | |||
31 | -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid, | 31 | -- testProperty "t_utf8_incr_valid" t_utf8_incr_valid, |
32 | testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed , | 32 | testProperty "t_utf8_incr_mixed" t_utf8_incr_mixed , |
33 | testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe, | 33 | testProperty "t_utf8_incr_pipe" t_utf8_incr_pipe, |
34 | testProperty "t_utf8_incr_decoding" t_utf8_incr_decoding, | ||
34 | testProperty "t_utf8_dec_some" t_utf8_dec_some] | 35 | testProperty "t_utf8_dec_some" t_utf8_dec_some] |
35 | 36 | ||
36 | t_utf8_incr_valid = do | 37 | t_utf8_incr_valid = do |
@@ -83,6 +84,22 @@ t_utf8_incr_pipe = do | |||
83 | appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append | 84 | appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append |
84 | 85 | ||
85 | -- | 86 | -- |
87 | t_utf8_incr_decoding = do | ||
88 | Positive m <- arbitrary | ||
89 | Positive n <- arbitrary | ||
90 | txt <- genUnicode | ||
91 | let chunkSize = mod n 7 + 1 | ||
92 | bytesLength = mod 10 m | ||
93 | forAll (vector bytesLength) $ | ||
94 | (BL.toStrict . BP.toLazy . roundtrip . P.each . chunk chunkSize . appendBytes txt) | ||
95 | `eq` | ||
96 | appendBytes txt | ||
97 | where | ||
98 | roundtrip :: Monad m => P.Producer B.ByteString m r -> P.Producer B.ByteString m r | ||
99 | roundtrip p = join (TP.decode utf8_start p P.>-> TP.encodeUtf8) | ||
100 | chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b | ||
101 | appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts ; (<>) = B.append | ||
102 | utf8_start = PE.Some T.empty B.empty (PE.codecDecode PE.utf8) | ||
86 | t_utf8_dec_some = do | 103 | t_utf8_dec_some = do |
87 | Positive m <- arbitrary | 104 | Positive m <- arbitrary |
88 | txt <- genUnicode | 105 | txt <- genUnicode |