aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authormichaelt <what_is_it_to_do_anything@yahoo.com>2014-02-15 10:01:48 -0500
committermichaelt <what_is_it_to_do_anything@yahoo.com>2014-02-15 10:01:48 -0500
commitbbdfd3056da4992e18d3983fd5992bee23af93af (patch)
tree5765f3f224708890cf50b400c675c23e759595a2
parentfd27d0c43e2995437cf1f3e2d6f292597371ebb0 (diff)
downloadtext-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.gz
text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.zst
text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.zip
use new text-stream-decoding
-rw-r--r--Pipes/Text.hs269
-rw-r--r--Pipes/Text/Encoding.hs205
-rw-r--r--Pipes/Text/IO.hs96
-rw-r--r--Pipes/Text/Internal.hs7
-rw-r--r--Pipes/Text/Internal/Codec.hs216
-rw-r--r--Pipes/Text/Internal/Decoding.hs154
-rw-r--r--cbits/cbits.c168
-rw-r--r--include/pipes_text_cbits.h11
-rw-r--r--pipes-text.cabal39
9 files changed, 357 insertions, 808 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs
index 4b2d2b0..8221c01 100644
--- a/Pipes/Text.hs
+++ b/Pipes/Text.hs
@@ -10,7 +10,8 @@
10 example, the following program copies a document from one file to another: 10 example, the following program copies a document from one file to another:
11 11
12> import Pipes 12> import Pipes
13> import qualified Data.Text.Pipes as Text 13> import qualified Pipes.Text as Text
14> import qualified Pipes.Text.IO as Text
14> import System.IO 15> import System.IO
15> 16>
16> main = 17> main =
@@ -21,7 +22,8 @@
21To stream from files, the following is perhaps more Prelude-like (note that it uses Pipes.Safe): 22To stream from files, the following is perhaps more Prelude-like (note that it uses Pipes.Safe):
22 23
23> import Pipes 24> import Pipes
24> import qualified Data.Text.Pipes as Text 25> import qualified Pipes.Text as Text
26> import qualified Pipes.Text.IO as Text
25> import Pipes.Safe 27> import Pipes.Safe
26> 28>
27> main = runSafeT $ runEffect $ Text.readFile "inFile.txt" >-> Text.writeFile "outFile.txt" 29> main = runSafeT $ runEffect $ Text.readFile "inFile.txt" >-> Text.writeFile "outFile.txt"
@@ -61,14 +63,14 @@ To stream from files, the following is perhaps more Prelude-like (note that it u
61module Pipes.Text ( 63module Pipes.Text (
62 -- * Producers 64 -- * Producers
63 fromLazy 65 fromLazy
64 , stdin 66 -- , stdin
65 , fromHandle 67 -- , fromHandle
66 , readFile 68 -- , readFile
67 69
68 -- * Consumers 70 -- * Consumers
69 , stdout 71 -- , stdout
70 , toHandle 72 -- , toHandle
71 , writeFile 73 -- , writeFile
72 74
73 -- * Pipes 75 -- * Pipes
74 , map 76 , map
@@ -79,7 +81,7 @@ module Pipes.Text (
79 , dropWhile 81 , dropWhile
80 , filter 82 , filter
81 , scan 83 , scan
82 , encodeUtf8 84-- , encodeUtf8
83 , pack 85 , pack
84 , unpack 86 , unpack
85 , toCaseFold 87 , toCaseFold
@@ -120,22 +122,22 @@ module Pipes.Text (
120 , word 122 , word
121 , line 123 , line
122 124
123 -- * Decoding Lenses 125 -- -- * Decoding Lenses
124 , decodeUtf8 126 -- , decodeUtf8
125 , codec 127 -- , codec
126 128 --
127 -- * Codecs 129 -- -- * Codecs
128 , utf8 130 -- , utf8
129 , utf16_le 131 -- , utf16_le
130 , utf16_be 132 -- , utf16_be
131 , utf32_le 133 -- , utf32_le
132 , utf32_be 134 -- , utf32_be
133 135 --
134 -- * Other Decoding/Encoding Functions 136 -- -- * Other Decoding/Encoding Functions
135 , decodeIso8859_1 137 -- , decodeIso8859_1
136 , decodeAscii 138 -- , decodeAscii
137 , encodeIso8859_1 139 -- , encodeIso8859_1
138 , encodeAscii 140 -- , encodeAscii
139 141
140 -- * FreeT Splitters 142 -- * FreeT Splitters
141 , chunksOf 143 , chunksOf
@@ -157,11 +159,9 @@ module Pipes.Text (
157 159
158 -- * Re-exports 160 -- * Re-exports
159 -- $reexports 161 -- $reexports
160 , Decoding(..) 162 -- , DecodeResult(..)
161 , streamDecodeUtf8 163 -- , Codec
162 , decodeSomeUtf8 164 -- , TextException(..)
163 , Codec(..)
164 , TextException(..)
165 , module Data.ByteString 165 , module Data.ByteString
166 , module Data.Text 166 , module Data.Text
167 , module Data.Profunctor 167 , module Data.Profunctor
@@ -170,7 +170,6 @@ module Pipes.Text (
170 , module Pipes.Group 170 , module Pipes.Group
171 ) where 171 ) where
172 172
173import Control.Exception (throwIO, try)
174import Control.Applicative ((<*)) 173import Control.Applicative ((<*))
175import Control.Monad (liftM, unless, join) 174import Control.Monad (liftM, unless, join)
176import Control.Monad.Trans.State.Strict (StateT(..), modify) 175import Control.Monad.Trans.State.Strict (StateT(..), modify)
@@ -193,24 +192,20 @@ import Data.Functor.Identity (Identity)
193import Data.Profunctor (Profunctor) 192import Data.Profunctor (Profunctor)
194import qualified Data.Profunctor 193import qualified Data.Profunctor
195import qualified Data.List as List 194import qualified Data.List as List
196import Foreign.C.Error (Errno(Errno), ePIPE)
197import qualified GHC.IO.Exception as G
198import Pipes 195import Pipes
199import qualified Pipes.ByteString as PB 196import qualified Pipes.ByteString as PB
200import qualified Pipes.Text.Internal as PI 197-- import Pipes.Text.Decoding
201import Pipes.Text.Internal
202import Pipes.Core (respond, Server') 198import Pipes.Core (respond, Server')
203import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) 199import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..))
204import qualified Pipes.Group as PG 200import qualified Pipes.Group as PG
205import qualified Pipes.Parse as PP 201import qualified Pipes.Parse as PP
206import Pipes.Parse (Parser) 202import Pipes.Parse (Parser)
207import qualified Pipes.Safe.Prelude as Safe 203
208import qualified Pipes.Safe as Safe
209import Pipes.Safe (MonadSafe(..), Base(..))
210import qualified Pipes.Prelude as P 204import qualified Pipes.Prelude as P
211import qualified System.IO as IO 205import qualified System.IO as IO
212import Data.Char (isSpace) 206import Data.Char (isSpace)
213import Data.Word (Word8) 207import Data.Word (Word8)
208import Data.Text.StreamDecoding
214 209
215import Prelude hiding ( 210import Prelude hiding (
216 all, 211 all,
@@ -246,78 +241,6 @@ fromLazy :: (Monad m) => TL.Text -> Producer' Text m ()
246fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) 241fromLazy = foldrChunks (\e a -> yield e >> a) (return ())
247{-# INLINE fromLazy #-} 242{-# INLINE fromLazy #-}
248 243
249-- | Stream text from 'stdin'
250stdin :: MonadIO m => Producer Text m ()
251stdin = fromHandle IO.stdin
252{-# INLINE stdin #-}
253
254{-| Convert a 'IO.Handle' into a text stream using a text size
255 determined by the good sense of the text library; note that this
256 is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@
257 but uses the system encoding and has other `Data.Text.IO` features
258-}
259
260fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()
261fromHandle h = go where
262 go = do txt <- liftIO (T.hGetChunk h)
263 unless (T.null txt) ( do yield txt
264 go )
265{-# INLINABLE fromHandle#-}
266
267
268{-| Stream text from a file in the simple fashion of @Data.Text.IO@
269
270>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
271MAIN = PUTSTRLN "HELLO WORLD"
272-}
273
274readFile :: MonadSafe m => FilePath -> Producer Text m ()
275readFile file = Safe.withFile file IO.ReadMode fromHandle
276{-# INLINE readFile #-}
277
278
279{-| Stream text to 'stdout'
280
281 Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe.
282
283 Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@
284 instead of @(source >-> stdout)@ .
285-}
286stdout :: MonadIO m => Consumer' Text m ()
287stdout = go
288 where
289 go = do
290 txt <- await
291 x <- liftIO $ try (T.putStr txt)
292 case x of
293 Left (G.IOError { G.ioe_type = G.ResourceVanished
294 , G.ioe_errno = Just ioe })
295 | Errno ioe == ePIPE
296 -> return ()
297 Left e -> liftIO (throwIO e)
298 Right () -> go
299{-# INLINABLE stdout #-}
300
301
302{-| Convert a text stream into a 'Handle'
303
304 Note: again, for best performance, where possible use
305 @(for source (liftIO . hPutStr handle))@ instead of @(source >-> toHandle handle)@.
306-}
307toHandle :: MonadIO m => IO.Handle -> Consumer' Text m r
308toHandle h = for cat (liftIO . T.hPutStr h)
309{-# INLINABLE toHandle #-}
310
311{-# RULES "p >-> toHandle h" forall p h .
312 p >-> toHandle h = for p (\txt -> liftIO (T.hPutStr h txt))
313 #-}
314
315
316-- | Stream text into a file. Uses @pipes-safe@.
317writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
318writeFile file = Safe.withFile file IO.WriteMode toHandle
319{-# INLINE writeFile #-}
320
321 244
322type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) 245type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a)
323 246
@@ -690,28 +613,6 @@ isEndOfChars = do
690{-# INLINABLE isEndOfChars #-} 613{-# INLINABLE isEndOfChars #-}
691 614
692 615
693{- | An improper lens into a stream of 'ByteString' expected to be UTF-8 encoded; the associated
694 stream of Text ends by returning a stream of ByteStrings beginning at the point of failure.
695 -}
696
697decodeUtf8 :: Monad m => Lens' (Producer ByteString m r)
698 (Producer Text m (Producer ByteString m r))
699decodeUtf8 k p0 = fmap (\p -> join (for p (yield . TE.encodeUtf8)))
700 (k (go B.empty PI.streamDecodeUtf8 p0)) where
701 go !carry dec0 p = do
702 x <- lift (next p)
703 case x of Left r -> return (if B.null carry
704 then return r -- all bytestring input was consumed
705 else (do yield carry -- a potentially valid fragment remains
706 return r))
707
708 Right (chunk, p') -> case dec0 chunk of
709 PI.Some text carry2 dec -> do yield text
710 go carry2 dec p'
711 PI.Other text bs -> do yield text
712 return (do yield bs -- an invalid blob remains
713 p')
714{-# INLINABLE decodeUtf8 #-}
715 616
716 617
717-- | Splits a 'Producer' after the given number of characters 618-- | Splits a 'Producer' after the given number of characters
@@ -1057,106 +958,4 @@ unwords = intercalate (yield $ T.singleton ' ')
1057 @Pipes.Parse@ re-exports 'input', 'concat', 'FreeT' (the type) and the 'Parse' synonym. 958 @Pipes.Parse@ re-exports 'input', 'concat', 'FreeT' (the type) and the 'Parse' synonym.
1058-} 959-}
1059 960
1060{- | Use a 'Codec' as a pipes-style 'Lens' into a byte stream; the available 'Codec' s are 961
1061 'utf8', 'utf16_le', 'utf16_be', 'utf32_le', 'utf32_be' . The 'Codec' concept and the
1062 individual 'Codec' definitions follow the enumerator and conduit libraries.
1063
1064 Utf8 is handled differently in this library -- without the use of 'unsafePerformIO' &co
1065 to catch 'Text' exceptions; but the same 'mypipe ^. codec utf8' interface can be used.
1066 'mypipe ^. decodeUtf8' should be the same, but has a somewhat more direct and thus perhaps
1067 better implementation.
1068
1069 -}
1070codec :: Monad m => Codec -> Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r))
1071codec (Codec _ enc dec) k p0 = fmap (\p -> join (for p (yield . fst . enc)))
1072 (k (decoder (dec B.empty) p0) ) where
1073 decoder :: Monad m => PI.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
1074 decoder !d p0 = case d of
1075 PI.Other txt bad -> do yield txt
1076 return (do yield bad
1077 p0)
1078 PI.Some txt extra dec -> do yield txt
1079 x <- lift (next p0)
1080 case x of Left r -> return (do yield extra
1081 return r)
1082 Right (chunk,p1) -> decoder (dec chunk) p1
1083
1084{- | ascii and latin encodings only represent a small fragment of 'Text'; thus we cannot
1085 use the pipes 'Lens' style to work with them. Rather we simply define functions
1086 each way.
1087
1088 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream,
1089 returning the rest of the 'Text' at the first non-ascii 'Char'
1090-}
1091encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
1092encodeAscii = go where
1093 go p = do echunk <- lift (next p)
1094 case echunk of
1095 Left r -> return (return r)
1096 Right (chunk, p') ->
1097 if T.null chunk
1098 then go p'
1099 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
1100 in do yield (B8.pack (T.unpack safe))
1101 if T.null unsafe
1102 then go p'
1103 else return $ do yield unsafe
1104 p'
1105{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
1106 returning the rest of the 'Text' upon hitting any non-latin 'Char'
1107 -}
1108encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
1109encodeIso8859_1 = go where
1110 go p = do etxt <- lift (next p)
1111 case etxt of
1112 Left r -> return (return r)
1113 Right (txt, p') ->
1114 if T.null txt
1115 then go p'
1116 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
1117 in do yield (B8.pack (T.unpack safe))
1118 if T.null unsafe
1119 then go p'
1120 else return $ do yield unsafe
1121 p'
1122
1123{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
1124 unused 'ByteString' upon hitting an un-ascii byte.
1125 -}
1126decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
1127decodeAscii = go where
1128 go p = do echunk <- lift (next p)
1129 case echunk of
1130 Left r -> return (return r)
1131 Right (chunk, p') ->
1132 if B.null chunk
1133 then go p'
1134 else let (safe, unsafe) = B.span (<= 0x7F) chunk
1135 in do yield (T.pack (B8.unpack safe))
1136 if B.null unsafe
1137 then go p'
1138 else return $ do yield unsafe
1139 p'
1140
1141{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
1142 unused 'ByteString' upon hitting the rare un-latinizable byte.
1143 -}
1144decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
1145decodeIso8859_1 = go where
1146 go p = do echunk <- lift (next p)
1147 case echunk of
1148 Left r -> return (return r)
1149 Right (chunk, p') ->
1150 if B.null chunk
1151 then go p'
1152 else let (safe, unsafe) = B.span (<= 0xFF) chunk
1153 in do yield (T.pack (B8.unpack safe))
1154 if B.null unsafe
1155 then go p'
1156 else return $ do yield unsafe
1157 p'
1158
1159
1160
1161
1162
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs
new file mode 100644
index 0000000..2bb5807
--- /dev/null
+++ b/Pipes/Text/Encoding.hs
@@ -0,0 +1,205 @@
1
2{-# LANGUAGE RankNTypes, BangPatterns #-}
3-- |
4-- Copyright: 2014 Michael Thompson
5--
6-- This module uses the stream decoding functions from the text-stream-decoding package
7-- to define pipes decoding functions and lenses.
8
9module Pipes.Text.Encoding
10 ( DecodeResult (..)
11 , Codec
12 , decodeUtf8
13 , decodeUtf8Pure
14 , decodeUtf16LE
15 , decodeUtf16BE
16 , decodeUtf32LE
17 , decodeUtf32BE
18 , utf8
19 , utf8Pure
20 , utf16LE
21 , utf16BE
22 , utf32LE
23 , utf32BE
24 , encodeAscii
25 , decodeAscii
26 , encodeIso8859_1
27 , decodeIso8859_1
28 )
29 where
30
31import Data.Char (ord)
32import Data.ByteString as B
33import Data.ByteString (ByteString)
34import Data.ByteString.Internal as B
35import Data.ByteString.Char8 as B8
36import Data.Text (Text)
37import qualified Data.Text as T
38import qualified Data.Text.Encoding as TE
39import Data.Text.StreamDecoding
40import GHC.Word (Word8, Word32)
41import Data.Word (Word8, Word16)
42import Control.Monad
43import Pipes
44import Pipes.Core
45
46
47
48{- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text.
49 They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc.
50 The stream of text they 'see' in a bytestream ends by returning the original byte stream
51 beginning at the point of failure, or the empty bytestream with its return value.
52 -}
53type Codec = forall f m r . (Functor f , Monad m ) =>
54 (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r)))
55 -> Producer ByteString m r -> f (Producer ByteString m r )
56
57decodeStream :: Monad m
58 => (B.ByteString -> DecodeResult)
59 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
60decodeStream = loop where
61 loop dec0 p =
62 do x <- lift (next p)
63 case x of Left r -> return (return r)
64 Right (chunk, p') -> case dec0 chunk of
65 DecodeResultSuccess text dec -> do yield text
66 loop dec p'
67 DecodeResultFailure text bs -> do yield text
68 return (do yield bs
69 p')
70{-# INLINABLE decodeStream#-}
71
72decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
73decodeUtf8 = decodeStream streamUtf8
74{-# INLINE decodeUtf8 #-}
75
76decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
77decodeUtf8Pure = decodeStream streamUtf8Pure
78{-# INLINE decodeUtf8Pure #-}
79
80decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
81decodeUtf16LE = decodeStream streamUtf16LE
82{-# INLINE decodeUtf16LE #-}
83
84decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
85decodeUtf16BE = decodeStream streamUtf16BE
86{-# INLINE decodeUtf16BE #-}
87
88decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
89decodeUtf32LE = decodeStream streamUtf32LE
90{-# INLINE decodeUtf32LE #-}
91
92decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
93decodeUtf32BE = decodeStream streamUtf32BE
94{-# INLINE decodeUtf32BE #-}
95
96mkCodec :: (forall r m . Monad m =>
97 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
98 -> (Text -> ByteString)
99 -> Codec
100mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
101
102
103{- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated
104 text stream ends by returning the original bytestream beginning at the point of failure,
105 or the empty bytestring for a well-encoded text.
106 -}
107
108utf8 :: Codec
109utf8 = mkCodec decodeUtf8 TE.encodeUtf8
110
111utf8Pure :: Codec
112utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
113
114utf16LE :: Codec
115utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
116
117utf16BE :: Codec
118utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
119
120utf32LE :: Codec
121utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
122
123utf32BE :: Codec
124utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
125
126
127{- | ascii and latin encodings only use a small number of the characters 'Text'
128 recognizes; thus we cannot use the pipes 'Lens' style to work with them.
129 Rather we simply define functions each way.
130
131 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream,
132 returning the rest of the 'Text' at the first non-ascii 'Char'
133-}
134
135encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
136encodeAscii = go where
137 go p = do e <- lift (next p)
138 case e of
139 Left r -> return (return r)
140 Right (chunk, p') ->
141 if T.null chunk
142 then go p'
143 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
144 in do yield (B8.pack (T.unpack safe))
145 if T.null unsafe
146 then go p'
147 else return $ do yield unsafe
148 p'
149
150{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
151 returning the rest of the 'Text' upon hitting any non-latin 'Char'
152 -}
153encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
154encodeIso8859_1 = go where
155 go p = do e <- lift (next p)
156 case e of
157 Left r -> return (return r)
158 Right (txt, p') ->
159 if T.null txt
160 then go p'
161 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
162 in do yield (B8.pack (T.unpack safe))
163 if T.null unsafe
164 then go p'
165 else return $ do yield unsafe
166 p'
167
168{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
169 unused 'ByteString' upon hitting an un-ascii byte.
170 -}
171decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
172decodeAscii = go where
173 go p = do e <- lift (next p)
174 case e of
175 Left r -> return (return r)
176 Right (chunk, p') ->
177 if B.null chunk
178 then go p'
179 else let (safe, unsafe) = B.span (<= 0x7F) chunk
180 in do yield (T.pack (B8.unpack safe))
181 if B.null unsafe
182 then go p'
183 else return (do yield unsafe
184 p')
185
186{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
187 unused 'ByteString' upon hitting the rare un-latinizable byte.
188 -}
189decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
190decodeIso8859_1 = go where
191 go p = do e <- lift (next p)
192 case e of
193 Left r -> return (return r)
194 Right (chunk, p') ->
195 if B.null chunk
196 then go p'
197 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
198 yield (T.pack (B8.unpack safe))
199 if B.null unsafe
200 then go p'
201 else return (do yield unsafe
202 p')
203
204
205
diff --git a/Pipes/Text/IO.hs b/Pipes/Text/IO.hs
new file mode 100644
index 0000000..3c9ac98
--- /dev/null
+++ b/Pipes/Text/IO.hs
@@ -0,0 +1,96 @@
1{-#LANGUAGE RankNTypes#-}
2
3module Pipes.Text.IO
4 ( stdin
5 , stdout
6 , fromHandle
7 , toHandle
8 , readFile
9 , writeFile
10 ) where
11
12import qualified System.IO as IO
13import Control.Exception (throwIO, try)
14import Foreign.C.Error (Errno(Errno), ePIPE)
15import qualified GHC.IO.Exception as G
16import Data.Text (Text)
17import qualified Data.Text as T
18import qualified Data.Text.IO as T
19import Pipes
20import qualified Pipes.Safe.Prelude as Safe
21import qualified Pipes.Safe as Safe
22import Pipes.Safe (MonadSafe(..), Base(..))
23import Prelude hiding (readFile, writeFile)
24
25-- | Stream text from 'stdin'
26stdin :: MonadIO m => Producer Text m ()
27stdin = fromHandle IO.stdin
28{-# INLINE stdin #-}
29
30{-| Convert a 'IO.Handle' into a text stream using a text size
31 determined by the good sense of the text library; note that this
32 is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@
33 but uses the system encoding and has other `Data.Text.IO` features
34-}
35
36fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()
37fromHandle h = go where
38 go = do txt <- liftIO (T.hGetChunk h)
39 if T.null txt then return ()
40 else do yield txt
41 go
42{-# INLINABLE fromHandle#-}
43
44
45{-| Stream text from a file in the simple fashion of @Data.Text.IO@
46
47>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
48MAIN = PUTSTRLN "HELLO WORLD"
49-}
50
51readFile :: MonadSafe m => FilePath -> Producer Text m ()
52readFile file = Safe.withFile file IO.ReadMode fromHandle
53{-# INLINE readFile #-}
54
55
56{-| Stream text to 'stdout'
57
58 Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe.
59
60 Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@
61 instead of @(source >-> stdout)@ .
62-}
63stdout :: MonadIO m => Consumer' Text m ()
64stdout = go
65 where
66 go = do
67 txt <- await
68 x <- liftIO $ try (T.putStr txt)
69 case x of
70 Left (G.IOError { G.ioe_type = G.ResourceVanished
71 , G.ioe_errno = Just ioe })
72 | Errno ioe == ePIPE
73 -> return ()
74 Left e -> liftIO (throwIO e)
75 Right () -> go
76{-# INLINABLE stdout #-}
77
78
79{-| Convert a text stream into a 'Handle'
80
81 Note: again, for best performance, where possible use
82 @(for source (liftIO . hPutStr handle))@ instead of @(source >-> toHandle handle)@.
83-}
84toHandle :: MonadIO m => IO.Handle -> Consumer' Text m r
85toHandle h = for cat (liftIO . T.hPutStr h)
86{-# INLINABLE toHandle #-}
87
88{-# RULES "p >-> toHandle h" forall p h .
89 p >-> toHandle h = for p (\txt -> liftIO (T.hPutStr h txt))
90 #-}
91
92
93-- | Stream text into a file. Uses @pipes-safe@.
94writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
95writeFile file = Safe.withFile file IO.WriteMode toHandle
96{-# INLINE writeFile #-}
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
deleted file mode 100644
index 582ef14..0000000
--- a/Pipes/Text/Internal.hs
+++ /dev/null
@@ -1,7 +0,0 @@
1module Pipes.Text.Internal
2 (module Pipes.Text.Internal.Codec
3 , module Pipes.Text.Internal.Decoding
4 ) where
5
6import Pipes.Text.Internal.Codec
7import Pipes.Text.Internal.Decoding \ No newline at end of file
diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs
deleted file mode 100644
index 075a152..0000000
--- a/Pipes/Text/Internal/Codec.hs
+++ /dev/null
@@ -1,216 +0,0 @@
1
2{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
3-- |
4-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
5-- License: MIT
6-- This Parts of this code were taken from enumerator and conduits, and adapted for pipes
7
8-- This module follows the model of the enumerator and conduits libraries, and defines
9-- 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and
10-- iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so
11-- special functions appear in @Pipes.Text@
12
13
14module Pipes.Text.Internal.Codec
15 ( Codec(..)
16 , TextException(..)
17 , utf8
18 , utf16_le
19 , utf16_be
20 , utf32_le
21 , utf32_be
22 ) where
23
24import Data.Bits ((.&.))
25import Data.Char (ord)
26import Data.ByteString as B
27import Data.ByteString (ByteString)
28import Data.ByteString.Internal as B
29import Data.ByteString.Char8 as B8
30import Data.Text (Text)
31import qualified Data.Text as T
32import qualified Data.Text.Encoding as TE
33import Data.Text.Encoding.Error ()
34import GHC.Word (Word8, Word32)
35import qualified Data.Text.Array as A
36import Data.Word (Word8, Word16)
37import System.IO.Unsafe (unsafePerformIO)
38import qualified Control.Exception as Exc
39import Data.Bits ((.&.), (.|.), shiftL)
40import Data.Typeable
41import Control.Arrow (first)
42import Data.Maybe (catMaybes)
43import Pipes.Text.Internal.Decoding
44import Pipes
45-- | A specific character encoding.
46
47data Codec = Codec
48 { codecName :: Text
49 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
50 , codecDecode :: ByteString -> Decoding
51 }
52
53instance Show Codec where
54 showsPrec d c = showParen (d > 10) $
55 showString "Codec " . shows (codecName c)
56
57data TextException = DecodeException Codec Word8
58 | EncodeException Codec Char
59 | LengthExceeded Int
60 | TextException Exc.SomeException
61 deriving (Show, Typeable)
62instance Exc.Exception TextException
63
64
65toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
66 -> (ByteString -> Decoding)
67toDecoding op = loop B.empty where
68 loop !extra bs0 = case op (B.append extra bs0) of
69 (txt, Right bs) -> Some txt bs (loop bs)
70 (txt, Left (_,bs)) -> Other txt bs
71-- To do: toDecoding should be inlined in each of the 'Codec' definitions
72-- or else Codec changed to the conduit/enumerator definition. We have
73-- altered it to use 'streamDecodeUtf8'
74
75splitSlowly :: (ByteString -> Text)
76 -> ByteString
77 -> (Text, Either (TextException, ByteString) ByteString)
78splitSlowly dec bytes = valid where
79 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
80 splits 0 = [(B.empty, bytes)]
81 splits n = B.splitAt n bytes : splits (n - 1)
82 decFirst (a, b) = case tryEvaluate (dec a) of
83 Left _ -> Nothing
84 Right text -> let trouble = case tryEvaluate (dec b) of
85 Left exc -> Left (TextException exc, b)
86 Right _ -> Right B.empty
87 in Just (text, trouble) -- this case shouldn't occur,
88 -- since splitSlowly is only called
89 -- when parsing failed somewhere
90
91utf8 :: Codec
92utf8 = Codec name enc (toDecoding dec) where
93 name = T.pack "UTF-8"
94 enc text = (TE.encodeUtf8 text, Nothing)
95 dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
96
97-- -- Whether the given byte is a continuation byte.
98-- isContinuation byte = byte .&. 0xC0 == 0x80
99--
100-- -- The number of continuation bytes needed by the given
101-- -- non-continuation byte. Returns -1 for an illegal UTF-8
102-- -- non-continuation byte and the whole split quickly must fail so
103-- -- as the input is passed to TE.decodeUtf8, which will issue a
104-- -- suitable error.
105-- required x0
106-- | x0 .&. 0x80 == 0x00 = 0
107-- | x0 .&. 0xE0 == 0xC0 = 1
108-- | x0 .&. 0xF0 == 0xE0 = 2
109-- | x0 .&. 0xF8 == 0xF0 = 3
110-- | otherwise = -1
111--
112-- splitQuickly bytes
113-- | B.null l || req == -1 = Nothing
114-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
115-- | otherwise = Just (TE.decodeUtf8 l', r')
116-- where
117-- (l, r) = B.spanEnd isContinuation bytes
118-- req = required (B.last l)
119-- l' = B.init l
120-- r' = B.cons (B.last l) r
121
122
123utf16_le :: Codec
124utf16_le = Codec name enc (toDecoding dec) where
125 name = T.pack "UTF-16-LE"
126 enc text = (TE.encodeUtf16LE text, Nothing)
127 dec bytes = case splitQuickly bytes of
128 Just (text, extra) -> (text, Right extra)
129 Nothing -> splitSlowly TE.decodeUtf16LE bytes
130
131 splitQuickly bytes = maybeDecode (loop 0) where
132 maxN = B.length bytes
133
134 loop n | n == maxN = decodeAll
135 | (n + 1) == maxN = decodeTo n
136 loop n = let
137 req = utf16Required
138 (B.index bytes n)
139 (B.index bytes (n + 1))
140 decodeMore = loop $! n + req
141 in if n + req > maxN
142 then decodeTo n
143 else decodeMore
144
145 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
146 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
147
148utf16_be :: Codec
149utf16_be = Codec name enc (toDecoding dec) where
150 name = T.pack "UTF-16-BE"
151 enc text = (TE.encodeUtf16BE text, Nothing)
152 dec bytes = case splitQuickly bytes of
153 Just (text, extra) -> (text, Right extra)
154 Nothing -> splitSlowly TE.decodeUtf16BE bytes
155
156 splitQuickly bytes = maybeDecode (loop 0) where
157 maxN = B.length bytes
158
159 loop n | n == maxN = decodeAll
160 | (n + 1) == maxN = decodeTo n
161 loop n = let
162 req = utf16Required
163 (B.index bytes (n + 1))
164 (B.index bytes n)
165 decodeMore = loop $! n + req
166 in if n + req > maxN
167 then decodeTo n
168 else decodeMore
169
170 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
171 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
172
173utf16Required :: Word8 -> Word8 -> Int
174utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
175 x :: Word16
176 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
177
178
179utf32_le :: Codec
180utf32_le = Codec name enc (toDecoding dec) where
181 name = T.pack "UTF-32-LE"
182 enc text = (TE.encodeUtf32LE text, Nothing)
183 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
184 Just (text, extra) -> (text, Right extra)
185 Nothing -> splitSlowly TE.decodeUtf32LE bs
186
187
188utf32_be :: Codec
189utf32_be = Codec name enc (toDecoding dec) where
190 name = T.pack "UTF-32-BE"
191 enc text = (TE.encodeUtf32BE text, Nothing)
192 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
193 Just (text, extra) -> (text, Right extra)
194 Nothing -> splitSlowly TE.decodeUtf32BE bs
195
196utf32SplitBytes :: (ByteString -> Text)
197 -> ByteString
198 -> Maybe (Text, ByteString)
199utf32SplitBytes dec bytes = split where
200 split = maybeDecode (dec toDecode, extra)
201 len = B.length bytes
202 lenExtra = mod len 4
203
204 lenToDecode = len - lenExtra
205 (toDecode, extra) = if lenExtra == 0
206 then (bytes, B.empty)
207 else B.splitAt lenToDecode bytes
208
209
210tryEvaluate :: a -> Either Exc.SomeException a
211tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
212
213maybeDecode :: (a, b) -> Maybe (a, b)
214maybeDecode (a, b) = case tryEvaluate a of
215 Left _ -> Nothing
216 Right _ -> Just (a, b)
diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs
deleted file mode 100644
index b5d928a..0000000
--- a/Pipes/Text/Internal/Decoding.hs
+++ /dev/null
@@ -1,154 +0,0 @@
1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
3{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
4
5-- This module lifts assorted materials from Brian O'Sullivan's text package
6-- especially @Data.Text.Encoding@ in order to define a pipes-appropriate
7-- 'streamDecodeUtf8'
8
9module Pipes.Text.Internal.Decoding
10 ( Decoding(..)
11 , streamDecodeUtf8
12 , decodeSomeUtf8
13 ) where
14import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
15import Control.Monad.ST (ST, runST)
16import Data.Bits ((.&.))
17import Data.ByteString as B
18import Data.ByteString (ByteString)
19import Data.ByteString.Internal as B
20import Data.ByteString.Char8 as B8
21import Data.Text (Text)
22import qualified Data.Text as T
23import qualified Data.Text.Encoding as TE
24import Data.Text.Encoding.Error ()
25import Data.Text.Internal (Text, textP)
26import Foreign.C.Types (CSize)
27import Foreign.ForeignPtr (withForeignPtr)
28import Foreign.Marshal.Utils (with)
29import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
30import Foreign.Storable (Storable, peek, poke)
31import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
32import GHC.Word (Word8, Word32)
33import qualified Data.Text.Array as A
34import Data.Word (Word8, Word16)
35import System.IO.Unsafe (unsafePerformIO)
36import qualified Control.Exception as Exc
37import Data.Bits ((.&.), (.|.), shiftL)
38import Data.Typeable
39import Control.Arrow (first)
40import Data.Maybe (catMaybes)
41#include "pipes_text_cbits.h"
42
43
44
45-- A stream oriented decoding result. Distinct from the similar type in Data.Text.Encoding
46
47data Decoding = Some Text ByteString (ByteString -> Decoding)
48 -- Text, continuation and any undecoded fragment.
49 | Other Text ByteString
50 -- Text followed by an undecodable ByteString
51
52instance Show Decoding where
53 showsPrec d (Some t bs _) = showParen (d > prec) $
54 showString "Some " . showsPrec prec' t .
55 showChar ' ' . showsPrec prec' bs .
56 showString " _"
57 where prec = 10; prec' = prec + 1
58 showsPrec d (Other t bs) = showParen (d > prec) $
59 showString "Other " . showsPrec prec' t .
60 showChar ' ' . showsPrec prec' bs .
61 showString " _"
62 where prec = 10; prec' = prec + 1
63
64newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
65newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
66
67-- Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'.
68streamDecodeUtf8 :: ByteString -> Decoding
69streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
70 where
71 decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
72 decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
73 runST $ do marray <- A.new (len+1)
74 unsafeIOToST (decodeChunkToBuffer marray)
75 where
76 decodeChunkToBuffer :: A.MArray s -> IO Decoding
77 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
78 with (0::CSize) $ \destOffPtr ->
79 with codepoint0 $ \codepointPtr ->
80 with state0 $ \statePtr ->
81 with nullPtr $ \curPtrPtr ->
82 do let end = ptr `plusPtr` (off + len)
83 curPtr = ptr `plusPtr` off
84 poke curPtrPtr curPtr
85 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
86 state <- peek statePtr
87 lastPtr <- peek curPtrPtr
88 codepoint <- peek codepointPtr
89 n <- peek destOffPtr
90 chunkText <- mkText dest n
91 let left = lastPtr `minusPtr` curPtr
92 remaining = B.drop left bs
93 accum = if T.null chunkText then B.append old remaining else remaining
94 return $! case state of
95 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
96 _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
97 {-# INLINE decodeChunkToBuffer #-}
98 {-# INLINE decodeChunkUtf8 #-}
99{-# INLINE streamDecodeUtf8 #-}
100
101-- Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble
102decodeSomeUtf8 :: ByteString -> (Text, ByteString)
103decodeSomeUtf8 bs@(PS fp off len) = runST $ do
104 dest <- A.new (len+1)
105 unsafeIOToST $
106 withForeignPtr fp $ \ptr ->
107 with (0::CSize) $ \destOffPtr ->
108 with (0::CodePoint) $ \codepointPtr ->
109 with (0::DecoderState) $ \statePtr ->
110 with nullPtr $ \curPtrPtr ->
111 do let end = ptr `plusPtr` (off + len)
112 curPtr = ptr `plusPtr` off
113 poke curPtrPtr curPtr
114 c_decode_utf8_with_state (A.maBA dest) destOffPtr
115 curPtrPtr end codepointPtr statePtr
116 state <- peek statePtr
117 lastPtr <- peek curPtrPtr
118 codepoint <- peek codepointPtr
119 n <- peek destOffPtr
120 chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
121 return $! textP arr 0 (fromIntegral n)
122 let left = lastPtr `minusPtr` curPtr
123 remaining = B.drop left bs
124 return $! (chunkText, remaining)
125{-# INLINE decodeSomeUtf8 #-}
126
127mkText :: A.MArray s -> CSize -> IO Text
128mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
129 return $! textP arr 0 (fromIntegral n)
130{-# INLINE mkText #-}
131
132ord :: Char -> Int
133ord (C# c#) = I# (ord# c#)
134{-# INLINE ord #-}
135
136unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
137unsafeWrite marr i c
138 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
139 return 1
140 | otherwise = do A.unsafeWrite marr i lo
141 A.unsafeWrite marr (i+1) hi
142 return 2
143 where n = ord c
144 m = n - 0x10000
145 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
146 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
147 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
148 {-# INLINE shiftR #-}
149{-# INLINE unsafeWrite #-}
150
151foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
152 :: MutableByteArray# s -> Ptr CSize
153 -> Ptr (Ptr Word8) -> Ptr Word8
154 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file
diff --git a/cbits/cbits.c b/cbits/cbits.c
deleted file mode 100644
index c11645b..0000000
--- a/cbits/cbits.c
+++ /dev/null
@@ -1,168 +0,0 @@
1/*
2 * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
3 *
4 * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
5 *
6 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
7 */
8
9#include <string.h>
10#include <stdint.h>
11#include <stdio.h>
12#include "pipes_text_cbits.h"
13
14
15
16#define UTF8_ACCEPT 0
17#define UTF8_REJECT 12
18
19static const uint8_t utf8d[] = {
20 /*
21 * The first part of the table maps bytes to character classes that
22 * to reduce the size of the transition table and create bitmasks.
23 */
24 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
25 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
26 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
27 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
28 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
29 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
30 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
31 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
32
33 /*
34 * The second part is a transition table that maps a combination of
35 * a state of the automaton and a character class to a state.
36 */
37 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
38 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
39 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
40 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
41 12,36,12,12,12,12,12,12,12,12,12,12,
42};
43
44static inline uint32_t
45decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
46 uint32_t type = utf8d[byte];
47
48 *codep = (*state != UTF8_ACCEPT) ?
49 (byte & 0x3fu) | (*codep << 6) :
50 (0xff >> type) & (byte);
51
52 return *state = utf8d[256 + *state + type];
53}
54
55/*
56 * A best-effort decoder. Runs until it hits either end of input or
57 * the start of an invalid byte sequence.
58 *
59 * At exit, we update *destoff with the next offset to write to, *src
60 * with the next source location past the last one successfully
61 * decoded, and return the next source location to read from.
62 *
63 * Moreover, we expose the internal decoder state (state0 and
64 * codepoint0), allowing one to restart the decoder after it
65 * terminates (say, due to a partial codepoint).
66 *
67 * In particular, there are a few possible outcomes,
68 *
69 * 1) We decoded the buffer entirely:
70 * In this case we return srcend
71 * state0 == UTF8_ACCEPT
72 *
73 * 2) We met an invalid encoding
74 * In this case we return the address of the first invalid byte
75 * state0 == UTF8_REJECT
76 *
77 * 3) We reached the end of the buffer while decoding a codepoint
78 * In this case we return a pointer to the first byte of the partial codepoint
79 * state0 != UTF8_ACCEPT, UTF8_REJECT
80 *
81 */
82
83 #if defined(__GNUC__) || defined(__clang__)
84 static inline uint8_t const *
85 _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
86 const uint8_t const **src, const uint8_t const *srcend,
87 uint32_t *codepoint0, uint32_t *state0)
88 __attribute((always_inline));
89 #endif
90
91static inline uint8_t const *
92_hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
93 const uint8_t const **src, const uint8_t const *srcend,
94 uint32_t *codepoint0, uint32_t *state0)
95{
96 uint16_t *d = dest + *destoff;
97 const uint8_t *s = *src, *last = *src;
98 uint32_t state = *state0;
99 uint32_t codepoint = *codepoint0;
100
101 while (s < srcend) {
102#if defined(__i386__) || defined(__x86_64__)
103 /*
104 * This code will only work on a little-endian system that
105 * supports unaligned loads.
106 *
107 * It gives a substantial speed win on data that is purely or
108 * partly ASCII (e.g. HTML), at only a slight cost on purely
109 * non-ASCII text.
110 */
111
112 if (state == UTF8_ACCEPT) {
113 while (s < srcend - 4) {
114 codepoint = *((uint32_t *) s);
115 if ((codepoint & 0x80808080) != 0)
116 break;
117 s += 4;
118
119 /*
120 * Tried 32-bit stores here, but the extra bit-twiddling
121 * slowed the code down.
122 */
123
124 *d++ = (uint16_t) (codepoint & 0xff);
125 *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
126 *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
127 *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
128 }
129 last = s;
130 }
131#endif
132
133 if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
134 if (state != UTF8_REJECT)
135 continue;
136 break;
137 }
138
139 if (codepoint <= 0xffff)
140 *d++ = (uint16_t) codepoint;
141 else {
142 *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
143 *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
144 }
145 last = s;
146 }
147
148 *destoff = d - dest;
149 *codepoint0 = codepoint;
150 *state0 = state;
151 *src = last;
152
153 return s;
154}
155
156uint8_t const *
157_hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
158 const uint8_t const **src,
159 const uint8_t const *srcend,
160 uint32_t *codepoint0, uint32_t *state0)
161{
162 uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend,
163 codepoint0, state0);
164 if (*state0 == UTF8_REJECT)
165 ret -=1;
166 return ret;
167}
168
diff --git a/include/pipes_text_cbits.h b/include/pipes_text_cbits.h
deleted file mode 100644
index b9ab670..0000000
--- a/include/pipes_text_cbits.h
+++ /dev/null
@@ -1,11 +0,0 @@
1/*
2 * Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>.
3 */
4
5#ifndef _pipes_text_cbits_h
6#define _pipes_text_cbits_h
7
8#define UTF8_ACCEPT 0
9#define UTF8_REJECT 12
10
11#endif
diff --git a/pipes-text.cabal b/pipes-text.cabal
index a1d57bb..017d41c 100644
--- a/pipes-text.cabal
+++ b/pipes-text.cabal
@@ -1,5 +1,5 @@
1name: pipes-text 1name: pipes-text
2version: 0.0.0.6 2version: 0.0.0.7
3synopsis: Text pipes. 3synopsis: Text pipes.
4description: Many of the pipes and other operations defined here mirror those in 4description: Many of the pipes and other operations defined here mirror those in
5 the `pipes-bytestring` library. Folds like `length` and grouping 5 the `pipes-bytestring` library. Folds like `length` and grouping
@@ -36,23 +36,28 @@ source-repository head
36 type: git 36 type: git
37 location: https://github.com/michaelt/text-pipes 37 location: https://github.com/michaelt/text-pipes
38 38
39flag noio
40 default: False
41 Description: Use a version of text earlier than 0.11.3
39 42
40library 43library
41 c-sources: cbits/cbits.c 44 exposed-modules: Pipes.Text, Pipes.Text.Encoding
42 include-dirs: include 45 build-depends: base >= 4 && < 5 ,
43 exposed-modules: Pipes.Text, Pipes.Text.Internal 46 bytestring >= 0.9 ,
44 other-modules: Pipes.Text.Internal.Decoding, Pipes.Text.Internal.Codec 47 text >=0.11.3 && < 1.2,
45 other-extensions: RankNTypes 48 text-stream-decode >= 0.1 && < 0.2,
46 build-depends: base >= 4 && < 5 , 49 profunctors >= 3.1.1 && < 4.1,
47 bytestring >=0.10 && < 0.11, 50 pipes >=4.0 && < 4.2,
48 text >=0.11.3 && < 1.2, 51 pipes-group >= 1.0.0 && < 1.1,
49 profunctors >= 3.1.1 && < 4.1 , 52 pipes-parse >=2.0 && < 3.1,
50 pipes >=4.0 && < 4.2,
51 pipes-group >= 1.0.0 && < 1.1 ,
52 pipes-parse >=2.0 && < 3.1,
53 pipes-safe, 53 pipes-safe,
54 pipes-bytestring >= 1.0 && < 2.1, 54 pipes-bytestring >= 1.0 && < 2.1,
55 transformers >= 0.2.0.0 && < 0.4 55 transformers >= 0.2.0.0 && < 0.4
56 -- hs-source-dirs: 56 other-extensions: RankNTypes
57 default-language: Haskell2010 57 default-language: Haskell2010
58 ghc-options: -O2 58 ghc-options: -O2
59
60 if !flag(noio)
61 exposed-modules: Pipes.Text.IO
62 build-depends: text >=0.11.3 && < 1.2
63