diff options
author | michaelt <what_is_it_to_do_anything@yahoo.com> | 2014-02-15 10:01:48 -0500 |
---|---|---|
committer | michaelt <what_is_it_to_do_anything@yahoo.com> | 2014-02-15 10:01:48 -0500 |
commit | bbdfd3056da4992e18d3983fd5992bee23af93af (patch) | |
tree | 5765f3f224708890cf50b400c675c23e759595a2 | |
parent | fd27d0c43e2995437cf1f3e2d6f292597371ebb0 (diff) | |
download | text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.gz text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.zst text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.zip |
use new text-stream-decoding
-rw-r--r-- | Pipes/Text.hs | 269 | ||||
-rw-r--r-- | Pipes/Text/Encoding.hs | 205 | ||||
-rw-r--r-- | Pipes/Text/IO.hs | 96 | ||||
-rw-r--r-- | Pipes/Text/Internal.hs | 7 | ||||
-rw-r--r-- | Pipes/Text/Internal/Codec.hs | 216 | ||||
-rw-r--r-- | Pipes/Text/Internal/Decoding.hs | 154 | ||||
-rw-r--r-- | cbits/cbits.c | 168 | ||||
-rw-r--r-- | include/pipes_text_cbits.h | 11 | ||||
-rw-r--r-- | pipes-text.cabal | 39 |
9 files changed, 357 insertions, 808 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 4b2d2b0..8221c01 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs | |||
@@ -10,7 +10,8 @@ | |||
10 | example, the following program copies a document from one file to another: | 10 | example, the following program copies a document from one file to another: |
11 | 11 | ||
12 | > import Pipes | 12 | > import Pipes |
13 | > import qualified Data.Text.Pipes as Text | 13 | > import qualified Pipes.Text as Text |
14 | > import qualified Pipes.Text.IO as Text | ||
14 | > import System.IO | 15 | > import System.IO |
15 | > | 16 | > |
16 | > main = | 17 | > main = |
@@ -21,7 +22,8 @@ | |||
21 | To stream from files, the following is perhaps more Prelude-like (note that it uses Pipes.Safe): | 22 | To stream from files, the following is perhaps more Prelude-like (note that it uses Pipes.Safe): |
22 | 23 | ||
23 | > import Pipes | 24 | > import Pipes |
24 | > import qualified Data.Text.Pipes as Text | 25 | > import qualified Pipes.Text as Text |
26 | > import qualified Pipes.Text.IO as Text | ||
25 | > import Pipes.Safe | 27 | > import Pipes.Safe |
26 | > | 28 | > |
27 | > main = runSafeT $ runEffect $ Text.readFile "inFile.txt" >-> Text.writeFile "outFile.txt" | 29 | > main = runSafeT $ runEffect $ Text.readFile "inFile.txt" >-> Text.writeFile "outFile.txt" |
@@ -61,14 +63,14 @@ To stream from files, the following is perhaps more Prelude-like (note that it u | |||
61 | module Pipes.Text ( | 63 | module Pipes.Text ( |
62 | -- * Producers | 64 | -- * Producers |
63 | fromLazy | 65 | fromLazy |
64 | , stdin | 66 | -- , stdin |
65 | , fromHandle | 67 | -- , fromHandle |
66 | , readFile | 68 | -- , readFile |
67 | 69 | ||
68 | -- * Consumers | 70 | -- * Consumers |
69 | , stdout | 71 | -- , stdout |
70 | , toHandle | 72 | -- , toHandle |
71 | , writeFile | 73 | -- , writeFile |
72 | 74 | ||
73 | -- * Pipes | 75 | -- * Pipes |
74 | , map | 76 | , map |
@@ -79,7 +81,7 @@ module Pipes.Text ( | |||
79 | , dropWhile | 81 | , dropWhile |
80 | , filter | 82 | , filter |
81 | , scan | 83 | , scan |
82 | , encodeUtf8 | 84 | -- , encodeUtf8 |
83 | , pack | 85 | , pack |
84 | , unpack | 86 | , unpack |
85 | , toCaseFold | 87 | , toCaseFold |
@@ -120,22 +122,22 @@ module Pipes.Text ( | |||
120 | , word | 122 | , word |
121 | , line | 123 | , line |
122 | 124 | ||
123 | -- * Decoding Lenses | 125 | -- -- * Decoding Lenses |
124 | , decodeUtf8 | 126 | -- , decodeUtf8 |
125 | , codec | 127 | -- , codec |
126 | 128 | -- | |
127 | -- * Codecs | 129 | -- -- * Codecs |
128 | , utf8 | 130 | -- , utf8 |
129 | , utf16_le | 131 | -- , utf16_le |
130 | , utf16_be | 132 | -- , utf16_be |
131 | , utf32_le | 133 | -- , utf32_le |
132 | , utf32_be | 134 | -- , utf32_be |
133 | 135 | -- | |
134 | -- * Other Decoding/Encoding Functions | 136 | -- -- * Other Decoding/Encoding Functions |
135 | , decodeIso8859_1 | 137 | -- , decodeIso8859_1 |
136 | , decodeAscii | 138 | -- , decodeAscii |
137 | , encodeIso8859_1 | 139 | -- , encodeIso8859_1 |
138 | , encodeAscii | 140 | -- , encodeAscii |
139 | 141 | ||
140 | -- * FreeT Splitters | 142 | -- * FreeT Splitters |
141 | , chunksOf | 143 | , chunksOf |
@@ -157,11 +159,9 @@ module Pipes.Text ( | |||
157 | 159 | ||
158 | -- * Re-exports | 160 | -- * Re-exports |
159 | -- $reexports | 161 | -- $reexports |
160 | , Decoding(..) | 162 | -- , DecodeResult(..) |
161 | , streamDecodeUtf8 | 163 | -- , Codec |
162 | , decodeSomeUtf8 | 164 | -- , TextException(..) |
163 | , Codec(..) | ||
164 | , TextException(..) | ||
165 | , module Data.ByteString | 165 | , module Data.ByteString |
166 | , module Data.Text | 166 | , module Data.Text |
167 | , module Data.Profunctor | 167 | , module Data.Profunctor |
@@ -170,7 +170,6 @@ module Pipes.Text ( | |||
170 | , module Pipes.Group | 170 | , module Pipes.Group |
171 | ) where | 171 | ) where |
172 | 172 | ||
173 | import Control.Exception (throwIO, try) | ||
174 | import Control.Applicative ((<*)) | 173 | import Control.Applicative ((<*)) |
175 | import Control.Monad (liftM, unless, join) | 174 | import Control.Monad (liftM, unless, join) |
176 | import Control.Monad.Trans.State.Strict (StateT(..), modify) | 175 | import Control.Monad.Trans.State.Strict (StateT(..), modify) |
@@ -193,24 +192,20 @@ import Data.Functor.Identity (Identity) | |||
193 | import Data.Profunctor (Profunctor) | 192 | import Data.Profunctor (Profunctor) |
194 | import qualified Data.Profunctor | 193 | import qualified Data.Profunctor |
195 | import qualified Data.List as List | 194 | import qualified Data.List as List |
196 | import Foreign.C.Error (Errno(Errno), ePIPE) | ||
197 | import qualified GHC.IO.Exception as G | ||
198 | import Pipes | 195 | import Pipes |
199 | import qualified Pipes.ByteString as PB | 196 | import qualified Pipes.ByteString as PB |
200 | import qualified Pipes.Text.Internal as PI | 197 | -- import Pipes.Text.Decoding |
201 | import Pipes.Text.Internal | ||
202 | import Pipes.Core (respond, Server') | 198 | import Pipes.Core (respond, Server') |
203 | import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) | 199 | import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) |
204 | import qualified Pipes.Group as PG | 200 | import qualified Pipes.Group as PG |
205 | import qualified Pipes.Parse as PP | 201 | import qualified Pipes.Parse as PP |
206 | import Pipes.Parse (Parser) | 202 | import Pipes.Parse (Parser) |
207 | import qualified Pipes.Safe.Prelude as Safe | 203 | |
208 | import qualified Pipes.Safe as Safe | ||
209 | import Pipes.Safe (MonadSafe(..), Base(..)) | ||
210 | import qualified Pipes.Prelude as P | 204 | import qualified Pipes.Prelude as P |
211 | import qualified System.IO as IO | 205 | import qualified System.IO as IO |
212 | import Data.Char (isSpace) | 206 | import Data.Char (isSpace) |
213 | import Data.Word (Word8) | 207 | import Data.Word (Word8) |
208 | import Data.Text.StreamDecoding | ||
214 | 209 | ||
215 | import Prelude hiding ( | 210 | import Prelude hiding ( |
216 | all, | 211 | all, |
@@ -246,78 +241,6 @@ fromLazy :: (Monad m) => TL.Text -> Producer' Text m () | |||
246 | fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) | 241 | fromLazy = foldrChunks (\e a -> yield e >> a) (return ()) |
247 | {-# INLINE fromLazy #-} | 242 | {-# INLINE fromLazy #-} |
248 | 243 | ||
249 | -- | Stream text from 'stdin' | ||
250 | stdin :: MonadIO m => Producer Text m () | ||
251 | stdin = fromHandle IO.stdin | ||
252 | {-# INLINE stdin #-} | ||
253 | |||
254 | {-| Convert a 'IO.Handle' into a text stream using a text size | ||
255 | determined by the good sense of the text library; note that this | ||
256 | is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@ | ||
257 | but uses the system encoding and has other `Data.Text.IO` features | ||
258 | -} | ||
259 | |||
260 | fromHandle :: MonadIO m => IO.Handle -> Producer Text m () | ||
261 | fromHandle h = go where | ||
262 | go = do txt <- liftIO (T.hGetChunk h) | ||
263 | unless (T.null txt) ( do yield txt | ||
264 | go ) | ||
265 | {-# INLINABLE fromHandle#-} | ||
266 | |||
267 | |||
268 | {-| Stream text from a file in the simple fashion of @Data.Text.IO@ | ||
269 | |||
270 | >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout | ||
271 | MAIN = PUTSTRLN "HELLO WORLD" | ||
272 | -} | ||
273 | |||
274 | readFile :: MonadSafe m => FilePath -> Producer Text m () | ||
275 | readFile file = Safe.withFile file IO.ReadMode fromHandle | ||
276 | {-# INLINE readFile #-} | ||
277 | |||
278 | |||
279 | {-| Stream text to 'stdout' | ||
280 | |||
281 | Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe. | ||
282 | |||
283 | Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@ | ||
284 | instead of @(source >-> stdout)@ . | ||
285 | -} | ||
286 | stdout :: MonadIO m => Consumer' Text m () | ||
287 | stdout = go | ||
288 | where | ||
289 | go = do | ||
290 | txt <- await | ||
291 | x <- liftIO $ try (T.putStr txt) | ||
292 | case x of | ||
293 | Left (G.IOError { G.ioe_type = G.ResourceVanished | ||
294 | , G.ioe_errno = Just ioe }) | ||
295 | | Errno ioe == ePIPE | ||
296 | -> return () | ||
297 | Left e -> liftIO (throwIO e) | ||
298 | Right () -> go | ||
299 | {-# INLINABLE stdout #-} | ||
300 | |||
301 | |||
302 | {-| Convert a text stream into a 'Handle' | ||
303 | |||
304 | Note: again, for best performance, where possible use | ||
305 | @(for source (liftIO . hPutStr handle))@ instead of @(source >-> toHandle handle)@. | ||
306 | -} | ||
307 | toHandle :: MonadIO m => IO.Handle -> Consumer' Text m r | ||
308 | toHandle h = for cat (liftIO . T.hPutStr h) | ||
309 | {-# INLINABLE toHandle #-} | ||
310 | |||
311 | {-# RULES "p >-> toHandle h" forall p h . | ||
312 | p >-> toHandle h = for p (\txt -> liftIO (T.hPutStr h txt)) | ||
313 | #-} | ||
314 | |||
315 | |||
316 | -- | Stream text into a file. Uses @pipes-safe@. | ||
317 | writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m () | ||
318 | writeFile file = Safe.withFile file IO.WriteMode toHandle | ||
319 | {-# INLINE writeFile #-} | ||
320 | |||
321 | 244 | ||
322 | type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) | 245 | type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) |
323 | 246 | ||
@@ -690,28 +613,6 @@ isEndOfChars = do | |||
690 | {-# INLINABLE isEndOfChars #-} | 613 | {-# INLINABLE isEndOfChars #-} |
691 | 614 | ||
692 | 615 | ||
693 | {- | An improper lens into a stream of 'ByteString' expected to be UTF-8 encoded; the associated | ||
694 | stream of Text ends by returning a stream of ByteStrings beginning at the point of failure. | ||
695 | -} | ||
696 | |||
697 | decodeUtf8 :: Monad m => Lens' (Producer ByteString m r) | ||
698 | (Producer Text m (Producer ByteString m r)) | ||
699 | decodeUtf8 k p0 = fmap (\p -> join (for p (yield . TE.encodeUtf8))) | ||
700 | (k (go B.empty PI.streamDecodeUtf8 p0)) where | ||
701 | go !carry dec0 p = do | ||
702 | x <- lift (next p) | ||
703 | case x of Left r -> return (if B.null carry | ||
704 | then return r -- all bytestring input was consumed | ||
705 | else (do yield carry -- a potentially valid fragment remains | ||
706 | return r)) | ||
707 | |||
708 | Right (chunk, p') -> case dec0 chunk of | ||
709 | PI.Some text carry2 dec -> do yield text | ||
710 | go carry2 dec p' | ||
711 | PI.Other text bs -> do yield text | ||
712 | return (do yield bs -- an invalid blob remains | ||
713 | p') | ||
714 | {-# INLINABLE decodeUtf8 #-} | ||
715 | 616 | ||
716 | 617 | ||
717 | -- | Splits a 'Producer' after the given number of characters | 618 | -- | Splits a 'Producer' after the given number of characters |
@@ -1057,106 +958,4 @@ unwords = intercalate (yield $ T.singleton ' ') | |||
1057 | @Pipes.Parse@ re-exports 'input', 'concat', 'FreeT' (the type) and the 'Parse' synonym. | 958 | @Pipes.Parse@ re-exports 'input', 'concat', 'FreeT' (the type) and the 'Parse' synonym. |
1058 | -} | 959 | -} |
1059 | 960 | ||
1060 | {- | Use a 'Codec' as a pipes-style 'Lens' into a byte stream; the available 'Codec' s are | 961 | |
1061 | 'utf8', 'utf16_le', 'utf16_be', 'utf32_le', 'utf32_be' . The 'Codec' concept and the | ||
1062 | individual 'Codec' definitions follow the enumerator and conduit libraries. | ||
1063 | |||
1064 | Utf8 is handled differently in this library -- without the use of 'unsafePerformIO' &co | ||
1065 | to catch 'Text' exceptions; but the same 'mypipe ^. codec utf8' interface can be used. | ||
1066 | 'mypipe ^. decodeUtf8' should be the same, but has a somewhat more direct and thus perhaps | ||
1067 | better implementation. | ||
1068 | |||
1069 | -} | ||
1070 | codec :: Monad m => Codec -> Lens' (Producer ByteString m r) (Producer Text m (Producer ByteString m r)) | ||
1071 | codec (Codec _ enc dec) k p0 = fmap (\p -> join (for p (yield . fst . enc))) | ||
1072 | (k (decoder (dec B.empty) p0) ) where | ||
1073 | decoder :: Monad m => PI.Decoding -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
1074 | decoder !d p0 = case d of | ||
1075 | PI.Other txt bad -> do yield txt | ||
1076 | return (do yield bad | ||
1077 | p0) | ||
1078 | PI.Some txt extra dec -> do yield txt | ||
1079 | x <- lift (next p0) | ||
1080 | case x of Left r -> return (do yield extra | ||
1081 | return r) | ||
1082 | Right (chunk,p1) -> decoder (dec chunk) p1 | ||
1083 | |||
1084 | {- | ascii and latin encodings only represent a small fragment of 'Text'; thus we cannot | ||
1085 | use the pipes 'Lens' style to work with them. Rather we simply define functions | ||
1086 | each way. | ||
1087 | |||
1088 | 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream, | ||
1089 | returning the rest of the 'Text' at the first non-ascii 'Char' | ||
1090 | -} | ||
1091 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | ||
1092 | encodeAscii = go where | ||
1093 | go p = do echunk <- lift (next p) | ||
1094 | case echunk of | ||
1095 | Left r -> return (return r) | ||
1096 | Right (chunk, p') -> | ||
1097 | if T.null chunk | ||
1098 | then go p' | ||
1099 | else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk | ||
1100 | in do yield (B8.pack (T.unpack safe)) | ||
1101 | if T.null unsafe | ||
1102 | then go p' | ||
1103 | else return $ do yield unsafe | ||
1104 | p' | ||
1105 | {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream, | ||
1106 | returning the rest of the 'Text' upon hitting any non-latin 'Char' | ||
1107 | -} | ||
1108 | encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | ||
1109 | encodeIso8859_1 = go where | ||
1110 | go p = do etxt <- lift (next p) | ||
1111 | case etxt of | ||
1112 | Left r -> return (return r) | ||
1113 | Right (txt, p') -> | ||
1114 | if T.null txt | ||
1115 | then go p' | ||
1116 | else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt | ||
1117 | in do yield (B8.pack (T.unpack safe)) | ||
1118 | if T.null unsafe | ||
1119 | then go p' | ||
1120 | else return $ do yield unsafe | ||
1121 | p' | ||
1122 | |||
1123 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | ||
1124 | unused 'ByteString' upon hitting an un-ascii byte. | ||
1125 | -} | ||
1126 | decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
1127 | decodeAscii = go where | ||
1128 | go p = do echunk <- lift (next p) | ||
1129 | case echunk of | ||
1130 | Left r -> return (return r) | ||
1131 | Right (chunk, p') -> | ||
1132 | if B.null chunk | ||
1133 | then go p' | ||
1134 | else let (safe, unsafe) = B.span (<= 0x7F) chunk | ||
1135 | in do yield (T.pack (B8.unpack safe)) | ||
1136 | if B.null unsafe | ||
1137 | then go p' | ||
1138 | else return $ do yield unsafe | ||
1139 | p' | ||
1140 | |||
1141 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | ||
1142 | unused 'ByteString' upon hitting the rare un-latinizable byte. | ||
1143 | -} | ||
1144 | decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
1145 | decodeIso8859_1 = go where | ||
1146 | go p = do echunk <- lift (next p) | ||
1147 | case echunk of | ||
1148 | Left r -> return (return r) | ||
1149 | Right (chunk, p') -> | ||
1150 | if B.null chunk | ||
1151 | then go p' | ||
1152 | else let (safe, unsafe) = B.span (<= 0xFF) chunk | ||
1153 | in do yield (T.pack (B8.unpack safe)) | ||
1154 | if B.null unsafe | ||
1155 | then go p' | ||
1156 | else return $ do yield unsafe | ||
1157 | p' | ||
1158 | |||
1159 | |||
1160 | |||
1161 | |||
1162 | |||
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs new file mode 100644 index 0000000..2bb5807 --- /dev/null +++ b/Pipes/Text/Encoding.hs | |||
@@ -0,0 +1,205 @@ | |||
1 | |||
2 | {-# LANGUAGE RankNTypes, BangPatterns #-} | ||
3 | -- | | ||
4 | -- Copyright: 2014 Michael Thompson | ||
5 | -- | ||
6 | -- This module uses the stream decoding functions from the text-stream-decoding package | ||
7 | -- to define pipes decoding functions and lenses. | ||
8 | |||
9 | module Pipes.Text.Encoding | ||
10 | ( DecodeResult (..) | ||
11 | , Codec | ||
12 | , decodeUtf8 | ||
13 | , decodeUtf8Pure | ||
14 | , decodeUtf16LE | ||
15 | , decodeUtf16BE | ||
16 | , decodeUtf32LE | ||
17 | , decodeUtf32BE | ||
18 | , utf8 | ||
19 | , utf8Pure | ||
20 | , utf16LE | ||
21 | , utf16BE | ||
22 | , utf32LE | ||
23 | , utf32BE | ||
24 | , encodeAscii | ||
25 | , decodeAscii | ||
26 | , encodeIso8859_1 | ||
27 | , decodeIso8859_1 | ||
28 | ) | ||
29 | where | ||
30 | |||
31 | import Data.Char (ord) | ||
32 | import Data.ByteString as B | ||
33 | import Data.ByteString (ByteString) | ||
34 | import Data.ByteString.Internal as B | ||
35 | import Data.ByteString.Char8 as B8 | ||
36 | import Data.Text (Text) | ||
37 | import qualified Data.Text as T | ||
38 | import qualified Data.Text.Encoding as TE | ||
39 | import Data.Text.StreamDecoding | ||
40 | import GHC.Word (Word8, Word32) | ||
41 | import Data.Word (Word8, Word16) | ||
42 | import Control.Monad | ||
43 | import Pipes | ||
44 | import Pipes.Core | ||
45 | |||
46 | |||
47 | |||
48 | {- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text. | ||
49 | They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc. | ||
50 | The stream of text they 'see' in a bytestream ends by returning the original byte stream | ||
51 | beginning at the point of failure, or the empty bytestream with its return value. | ||
52 | -} | ||
53 | type Codec = forall f m r . (Functor f , Monad m ) => | ||
54 | (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r))) | ||
55 | -> Producer ByteString m r -> f (Producer ByteString m r ) | ||
56 | |||
57 | decodeStream :: Monad m | ||
58 | => (B.ByteString -> DecodeResult) | ||
59 | -> Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
60 | decodeStream = loop where | ||
61 | loop dec0 p = | ||
62 | do x <- lift (next p) | ||
63 | case x of Left r -> return (return r) | ||
64 | Right (chunk, p') -> case dec0 chunk of | ||
65 | DecodeResultSuccess text dec -> do yield text | ||
66 | loop dec p' | ||
67 | DecodeResultFailure text bs -> do yield text | ||
68 | return (do yield bs | ||
69 | p') | ||
70 | {-# INLINABLE decodeStream#-} | ||
71 | |||
72 | decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
73 | decodeUtf8 = decodeStream streamUtf8 | ||
74 | {-# INLINE decodeUtf8 #-} | ||
75 | |||
76 | decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
77 | decodeUtf8Pure = decodeStream streamUtf8Pure | ||
78 | {-# INLINE decodeUtf8Pure #-} | ||
79 | |||
80 | decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
81 | decodeUtf16LE = decodeStream streamUtf16LE | ||
82 | {-# INLINE decodeUtf16LE #-} | ||
83 | |||
84 | decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
85 | decodeUtf16BE = decodeStream streamUtf16BE | ||
86 | {-# INLINE decodeUtf16BE #-} | ||
87 | |||
88 | decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
89 | decodeUtf32LE = decodeStream streamUtf32LE | ||
90 | {-# INLINE decodeUtf32LE #-} | ||
91 | |||
92 | decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
93 | decodeUtf32BE = decodeStream streamUtf32BE | ||
94 | {-# INLINE decodeUtf32BE #-} | ||
95 | |||
96 | mkCodec :: (forall r m . Monad m => | ||
97 | Producer ByteString m r -> Producer Text m (Producer ByteString m r )) | ||
98 | -> (Text -> ByteString) | ||
99 | -> Codec | ||
100 | mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0)) | ||
101 | |||
102 | |||
103 | {- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated | ||
104 | text stream ends by returning the original bytestream beginning at the point of failure, | ||
105 | or the empty bytestring for a well-encoded text. | ||
106 | -} | ||
107 | |||
108 | utf8 :: Codec | ||
109 | utf8 = mkCodec decodeUtf8 TE.encodeUtf8 | ||
110 | |||
111 | utf8Pure :: Codec | ||
112 | utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8 | ||
113 | |||
114 | utf16LE :: Codec | ||
115 | utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE | ||
116 | |||
117 | utf16BE :: Codec | ||
118 | utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE | ||
119 | |||
120 | utf32LE :: Codec | ||
121 | utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE | ||
122 | |||
123 | utf32BE :: Codec | ||
124 | utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE | ||
125 | |||
126 | |||
127 | {- | ascii and latin encodings only use a small number of the characters 'Text' | ||
128 | recognizes; thus we cannot use the pipes 'Lens' style to work with them. | ||
129 | Rather we simply define functions each way. | ||
130 | |||
131 | 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream, | ||
132 | returning the rest of the 'Text' at the first non-ascii 'Char' | ||
133 | -} | ||
134 | |||
135 | encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | ||
136 | encodeAscii = go where | ||
137 | go p = do e <- lift (next p) | ||
138 | case e of | ||
139 | Left r -> return (return r) | ||
140 | Right (chunk, p') -> | ||
141 | if T.null chunk | ||
142 | then go p' | ||
143 | else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk | ||
144 | in do yield (B8.pack (T.unpack safe)) | ||
145 | if T.null unsafe | ||
146 | then go p' | ||
147 | else return $ do yield unsafe | ||
148 | p' | ||
149 | |||
150 | {- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream, | ||
151 | returning the rest of the 'Text' upon hitting any non-latin 'Char' | ||
152 | -} | ||
153 | encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r) | ||
154 | encodeIso8859_1 = go where | ||
155 | go p = do e <- lift (next p) | ||
156 | case e of | ||
157 | Left r -> return (return r) | ||
158 | Right (txt, p') -> | ||
159 | if T.null txt | ||
160 | then go p' | ||
161 | else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt | ||
162 | in do yield (B8.pack (T.unpack safe)) | ||
163 | if T.null unsafe | ||
164 | then go p' | ||
165 | else return $ do yield unsafe | ||
166 | p' | ||
167 | |||
168 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | ||
169 | unused 'ByteString' upon hitting an un-ascii byte. | ||
170 | -} | ||
171 | decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
172 | decodeAscii = go where | ||
173 | go p = do e <- lift (next p) | ||
174 | case e of | ||
175 | Left r -> return (return r) | ||
176 | Right (chunk, p') -> | ||
177 | if B.null chunk | ||
178 | then go p' | ||
179 | else let (safe, unsafe) = B.span (<= 0x7F) chunk | ||
180 | in do yield (T.pack (B8.unpack safe)) | ||
181 | if B.null unsafe | ||
182 | then go p' | ||
183 | else return (do yield unsafe | ||
184 | p') | ||
185 | |||
186 | {- | Reduce a byte stream to a corresponding stream of ascii chars, returning the | ||
187 | unused 'ByteString' upon hitting the rare un-latinizable byte. | ||
188 | -} | ||
189 | decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r) | ||
190 | decodeIso8859_1 = go where | ||
191 | go p = do e <- lift (next p) | ||
192 | case e of | ||
193 | Left r -> return (return r) | ||
194 | Right (chunk, p') -> | ||
195 | if B.null chunk | ||
196 | then go p' | ||
197 | else do let (safe, unsafe) = B.span (<= 0xFF) chunk | ||
198 | yield (T.pack (B8.unpack safe)) | ||
199 | if B.null unsafe | ||
200 | then go p' | ||
201 | else return (do yield unsafe | ||
202 | p') | ||
203 | |||
204 | |||
205 | |||
diff --git a/Pipes/Text/IO.hs b/Pipes/Text/IO.hs new file mode 100644 index 0000000..3c9ac98 --- /dev/null +++ b/Pipes/Text/IO.hs | |||
@@ -0,0 +1,96 @@ | |||
1 | {-#LANGUAGE RankNTypes#-} | ||
2 | |||
3 | module Pipes.Text.IO | ||
4 | ( stdin | ||
5 | , stdout | ||
6 | , fromHandle | ||
7 | , toHandle | ||
8 | , readFile | ||
9 | , writeFile | ||
10 | ) where | ||
11 | |||
12 | import qualified System.IO as IO | ||
13 | import Control.Exception (throwIO, try) | ||
14 | import Foreign.C.Error (Errno(Errno), ePIPE) | ||
15 | import qualified GHC.IO.Exception as G | ||
16 | import Data.Text (Text) | ||
17 | import qualified Data.Text as T | ||
18 | import qualified Data.Text.IO as T | ||
19 | import Pipes | ||
20 | import qualified Pipes.Safe.Prelude as Safe | ||
21 | import qualified Pipes.Safe as Safe | ||
22 | import Pipes.Safe (MonadSafe(..), Base(..)) | ||
23 | import Prelude hiding (readFile, writeFile) | ||
24 | |||
25 | -- | Stream text from 'stdin' | ||
26 | stdin :: MonadIO m => Producer Text m () | ||
27 | stdin = fromHandle IO.stdin | ||
28 | {-# INLINE stdin #-} | ||
29 | |||
30 | {-| Convert a 'IO.Handle' into a text stream using a text size | ||
31 | determined by the good sense of the text library; note that this | ||
32 | is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@ | ||
33 | but uses the system encoding and has other `Data.Text.IO` features | ||
34 | -} | ||
35 | |||
36 | fromHandle :: MonadIO m => IO.Handle -> Producer Text m () | ||
37 | fromHandle h = go where | ||
38 | go = do txt <- liftIO (T.hGetChunk h) | ||
39 | if T.null txt then return () | ||
40 | else do yield txt | ||
41 | go | ||
42 | {-# INLINABLE fromHandle#-} | ||
43 | |||
44 | |||
45 | {-| Stream text from a file in the simple fashion of @Data.Text.IO@ | ||
46 | |||
47 | >>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout | ||
48 | MAIN = PUTSTRLN "HELLO WORLD" | ||
49 | -} | ||
50 | |||
51 | readFile :: MonadSafe m => FilePath -> Producer Text m () | ||
52 | readFile file = Safe.withFile file IO.ReadMode fromHandle | ||
53 | {-# INLINE readFile #-} | ||
54 | |||
55 | |||
56 | {-| Stream text to 'stdout' | ||
57 | |||
58 | Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe. | ||
59 | |||
60 | Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@ | ||
61 | instead of @(source >-> stdout)@ . | ||
62 | -} | ||
63 | stdout :: MonadIO m => Consumer' Text m () | ||
64 | stdout = go | ||
65 | where | ||
66 | go = do | ||
67 | txt <- await | ||
68 | x <- liftIO $ try (T.putStr txt) | ||
69 | case x of | ||
70 | Left (G.IOError { G.ioe_type = G.ResourceVanished | ||
71 | , G.ioe_errno = Just ioe }) | ||
72 | | Errno ioe == ePIPE | ||
73 | -> return () | ||
74 | Left e -> liftIO (throwIO e) | ||
75 | Right () -> go | ||
76 | {-# INLINABLE stdout #-} | ||
77 | |||
78 | |||
79 | {-| Convert a text stream into a 'Handle' | ||
80 | |||
81 | Note: again, for best performance, where possible use | ||
82 | @(for source (liftIO . hPutStr handle))@ instead of @(source >-> toHandle handle)@. | ||
83 | -} | ||
84 | toHandle :: MonadIO m => IO.Handle -> Consumer' Text m r | ||
85 | toHandle h = for cat (liftIO . T.hPutStr h) | ||
86 | {-# INLINABLE toHandle #-} | ||
87 | |||
88 | {-# RULES "p >-> toHandle h" forall p h . | ||
89 | p >-> toHandle h = for p (\txt -> liftIO (T.hPutStr h txt)) | ||
90 | #-} | ||
91 | |||
92 | |||
93 | -- | Stream text into a file. Uses @pipes-safe@. | ||
94 | writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m () | ||
95 | writeFile file = Safe.withFile file IO.WriteMode toHandle | ||
96 | {-# INLINE writeFile #-} | ||
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs deleted file mode 100644 index 582ef14..0000000 --- a/Pipes/Text/Internal.hs +++ /dev/null | |||
@@ -1,7 +0,0 @@ | |||
1 | module Pipes.Text.Internal | ||
2 | (module Pipes.Text.Internal.Codec | ||
3 | , module Pipes.Text.Internal.Decoding | ||
4 | ) where | ||
5 | |||
6 | import Pipes.Text.Internal.Codec | ||
7 | import Pipes.Text.Internal.Decoding \ No newline at end of file | ||
diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs deleted file mode 100644 index 075a152..0000000 --- a/Pipes/Text/Internal/Codec.hs +++ /dev/null | |||
@@ -1,216 +0,0 @@ | |||
1 | |||
2 | {-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-} | ||
3 | -- | | ||
4 | -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin | ||
5 | -- License: MIT | ||
6 | -- This Parts of this code were taken from enumerator and conduits, and adapted for pipes | ||
7 | |||
8 | -- This module follows the model of the enumerator and conduits libraries, and defines | ||
9 | -- 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and | ||
10 | -- iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so | ||
11 | -- special functions appear in @Pipes.Text@ | ||
12 | |||
13 | |||
14 | module Pipes.Text.Internal.Codec | ||
15 | ( Codec(..) | ||
16 | , TextException(..) | ||
17 | , utf8 | ||
18 | , utf16_le | ||
19 | , utf16_be | ||
20 | , utf32_le | ||
21 | , utf32_be | ||
22 | ) where | ||
23 | |||
24 | import Data.Bits ((.&.)) | ||
25 | import Data.Char (ord) | ||
26 | import Data.ByteString as B | ||
27 | import Data.ByteString (ByteString) | ||
28 | import Data.ByteString.Internal as B | ||
29 | import Data.ByteString.Char8 as B8 | ||
30 | import Data.Text (Text) | ||
31 | import qualified Data.Text as T | ||
32 | import qualified Data.Text.Encoding as TE | ||
33 | import Data.Text.Encoding.Error () | ||
34 | import GHC.Word (Word8, Word32) | ||
35 | import qualified Data.Text.Array as A | ||
36 | import Data.Word (Word8, Word16) | ||
37 | import System.IO.Unsafe (unsafePerformIO) | ||
38 | import qualified Control.Exception as Exc | ||
39 | import Data.Bits ((.&.), (.|.), shiftL) | ||
40 | import Data.Typeable | ||
41 | import Control.Arrow (first) | ||
42 | import Data.Maybe (catMaybes) | ||
43 | import Pipes.Text.Internal.Decoding | ||
44 | import Pipes | ||
45 | -- | A specific character encoding. | ||
46 | |||
47 | data Codec = Codec | ||
48 | { codecName :: Text | ||
49 | , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) | ||
50 | , codecDecode :: ByteString -> Decoding | ||
51 | } | ||
52 | |||
53 | instance Show Codec where | ||
54 | showsPrec d c = showParen (d > 10) $ | ||
55 | showString "Codec " . shows (codecName c) | ||
56 | |||
57 | data TextException = DecodeException Codec Word8 | ||
58 | | EncodeException Codec Char | ||
59 | | LengthExceeded Int | ||
60 | | TextException Exc.SomeException | ||
61 | deriving (Show, Typeable) | ||
62 | instance Exc.Exception TextException | ||
63 | |||
64 | |||
65 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) | ||
66 | -> (ByteString -> Decoding) | ||
67 | toDecoding op = loop B.empty where | ||
68 | loop !extra bs0 = case op (B.append extra bs0) of | ||
69 | (txt, Right bs) -> Some txt bs (loop bs) | ||
70 | (txt, Left (_,bs)) -> Other txt bs | ||
71 | -- To do: toDecoding should be inlined in each of the 'Codec' definitions | ||
72 | -- or else Codec changed to the conduit/enumerator definition. We have | ||
73 | -- altered it to use 'streamDecodeUtf8' | ||
74 | |||
75 | splitSlowly :: (ByteString -> Text) | ||
76 | -> ByteString | ||
77 | -> (Text, Either (TextException, ByteString) ByteString) | ||
78 | splitSlowly dec bytes = valid where | ||
79 | valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) | ||
80 | splits 0 = [(B.empty, bytes)] | ||
81 | splits n = B.splitAt n bytes : splits (n - 1) | ||
82 | decFirst (a, b) = case tryEvaluate (dec a) of | ||
83 | Left _ -> Nothing | ||
84 | Right text -> let trouble = case tryEvaluate (dec b) of | ||
85 | Left exc -> Left (TextException exc, b) | ||
86 | Right _ -> Right B.empty | ||
87 | in Just (text, trouble) -- this case shouldn't occur, | ||
88 | -- since splitSlowly is only called | ||
89 | -- when parsing failed somewhere | ||
90 | |||
91 | utf8 :: Codec | ||
92 | utf8 = Codec name enc (toDecoding dec) where | ||
93 | name = T.pack "UTF-8" | ||
94 | enc text = (TE.encodeUtf8 text, Nothing) | ||
95 | dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b) | ||
96 | |||
97 | -- -- Whether the given byte is a continuation byte. | ||
98 | -- isContinuation byte = byte .&. 0xC0 == 0x80 | ||
99 | -- | ||
100 | -- -- The number of continuation bytes needed by the given | ||
101 | -- -- non-continuation byte. Returns -1 for an illegal UTF-8 | ||
102 | -- -- non-continuation byte and the whole split quickly must fail so | ||
103 | -- -- as the input is passed to TE.decodeUtf8, which will issue a | ||
104 | -- -- suitable error. | ||
105 | -- required x0 | ||
106 | -- | x0 .&. 0x80 == 0x00 = 0 | ||
107 | -- | x0 .&. 0xE0 == 0xC0 = 1 | ||
108 | -- | x0 .&. 0xF0 == 0xE0 = 2 | ||
109 | -- | x0 .&. 0xF8 == 0xF0 = 3 | ||
110 | -- | otherwise = -1 | ||
111 | -- | ||
112 | -- splitQuickly bytes | ||
113 | -- | B.null l || req == -1 = Nothing | ||
114 | -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) | ||
115 | -- | otherwise = Just (TE.decodeUtf8 l', r') | ||
116 | -- where | ||
117 | -- (l, r) = B.spanEnd isContinuation bytes | ||
118 | -- req = required (B.last l) | ||
119 | -- l' = B.init l | ||
120 | -- r' = B.cons (B.last l) r | ||
121 | |||
122 | |||
123 | utf16_le :: Codec | ||
124 | utf16_le = Codec name enc (toDecoding dec) where | ||
125 | name = T.pack "UTF-16-LE" | ||
126 | enc text = (TE.encodeUtf16LE text, Nothing) | ||
127 | dec bytes = case splitQuickly bytes of | ||
128 | Just (text, extra) -> (text, Right extra) | ||
129 | Nothing -> splitSlowly TE.decodeUtf16LE bytes | ||
130 | |||
131 | splitQuickly bytes = maybeDecode (loop 0) where | ||
132 | maxN = B.length bytes | ||
133 | |||
134 | loop n | n == maxN = decodeAll | ||
135 | | (n + 1) == maxN = decodeTo n | ||
136 | loop n = let | ||
137 | req = utf16Required | ||
138 | (B.index bytes n) | ||
139 | (B.index bytes (n + 1)) | ||
140 | decodeMore = loop $! n + req | ||
141 | in if n + req > maxN | ||
142 | then decodeTo n | ||
143 | else decodeMore | ||
144 | |||
145 | decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) | ||
146 | decodeAll = (TE.decodeUtf16LE bytes, B.empty) | ||
147 | |||
148 | utf16_be :: Codec | ||
149 | utf16_be = Codec name enc (toDecoding dec) where | ||
150 | name = T.pack "UTF-16-BE" | ||
151 | enc text = (TE.encodeUtf16BE text, Nothing) | ||
152 | dec bytes = case splitQuickly bytes of | ||
153 | Just (text, extra) -> (text, Right extra) | ||
154 | Nothing -> splitSlowly TE.decodeUtf16BE bytes | ||
155 | |||
156 | splitQuickly bytes = maybeDecode (loop 0) where | ||
157 | maxN = B.length bytes | ||
158 | |||
159 | loop n | n == maxN = decodeAll | ||
160 | | (n + 1) == maxN = decodeTo n | ||
161 | loop n = let | ||
162 | req = utf16Required | ||
163 | (B.index bytes (n + 1)) | ||
164 | (B.index bytes n) | ||
165 | decodeMore = loop $! n + req | ||
166 | in if n + req > maxN | ||
167 | then decodeTo n | ||
168 | else decodeMore | ||
169 | |||
170 | decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) | ||
171 | decodeAll = (TE.decodeUtf16BE bytes, B.empty) | ||
172 | |||
173 | utf16Required :: Word8 -> Word8 -> Int | ||
174 | utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where | ||
175 | x :: Word16 | ||
176 | x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 | ||
177 | |||
178 | |||
179 | utf32_le :: Codec | ||
180 | utf32_le = Codec name enc (toDecoding dec) where | ||
181 | name = T.pack "UTF-32-LE" | ||
182 | enc text = (TE.encodeUtf32LE text, Nothing) | ||
183 | dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of | ||
184 | Just (text, extra) -> (text, Right extra) | ||
185 | Nothing -> splitSlowly TE.decodeUtf32LE bs | ||
186 | |||
187 | |||
188 | utf32_be :: Codec | ||
189 | utf32_be = Codec name enc (toDecoding dec) where | ||
190 | name = T.pack "UTF-32-BE" | ||
191 | enc text = (TE.encodeUtf32BE text, Nothing) | ||
192 | dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of | ||
193 | Just (text, extra) -> (text, Right extra) | ||
194 | Nothing -> splitSlowly TE.decodeUtf32BE bs | ||
195 | |||
196 | utf32SplitBytes :: (ByteString -> Text) | ||
197 | -> ByteString | ||
198 | -> Maybe (Text, ByteString) | ||
199 | utf32SplitBytes dec bytes = split where | ||
200 | split = maybeDecode (dec toDecode, extra) | ||
201 | len = B.length bytes | ||
202 | lenExtra = mod len 4 | ||
203 | |||
204 | lenToDecode = len - lenExtra | ||
205 | (toDecode, extra) = if lenExtra == 0 | ||
206 | then (bytes, B.empty) | ||
207 | else B.splitAt lenToDecode bytes | ||
208 | |||
209 | |||
210 | tryEvaluate :: a -> Either Exc.SomeException a | ||
211 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate | ||
212 | |||
213 | maybeDecode :: (a, b) -> Maybe (a, b) | ||
214 | maybeDecode (a, b) = case tryEvaluate a of | ||
215 | Left _ -> Nothing | ||
216 | Right _ -> Just (a, b) | ||
diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs deleted file mode 100644 index b5d928a..0000000 --- a/Pipes/Text/Internal/Decoding.hs +++ /dev/null | |||
@@ -1,154 +0,0 @@ | |||
1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} | ||
2 | {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} | ||
3 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | ||
4 | |||
5 | -- This module lifts assorted materials from Brian O'Sullivan's text package | ||
6 | -- especially @Data.Text.Encoding@ in order to define a pipes-appropriate | ||
7 | -- 'streamDecodeUtf8' | ||
8 | |||
9 | module Pipes.Text.Internal.Decoding | ||
10 | ( Decoding(..) | ||
11 | , streamDecodeUtf8 | ||
12 | , decodeSomeUtf8 | ||
13 | ) where | ||
14 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | ||
15 | import Control.Monad.ST (ST, runST) | ||
16 | import Data.Bits ((.&.)) | ||
17 | import Data.ByteString as B | ||
18 | import Data.ByteString (ByteString) | ||
19 | import Data.ByteString.Internal as B | ||
20 | import Data.ByteString.Char8 as B8 | ||
21 | import Data.Text (Text) | ||
22 | import qualified Data.Text as T | ||
23 | import qualified Data.Text.Encoding as TE | ||
24 | import Data.Text.Encoding.Error () | ||
25 | import Data.Text.Internal (Text, textP) | ||
26 | import Foreign.C.Types (CSize) | ||
27 | import Foreign.ForeignPtr (withForeignPtr) | ||
28 | import Foreign.Marshal.Utils (with) | ||
29 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | ||
30 | import Foreign.Storable (Storable, peek, poke) | ||
31 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) | ||
32 | import GHC.Word (Word8, Word32) | ||
33 | import qualified Data.Text.Array as A | ||
34 | import Data.Word (Word8, Word16) | ||
35 | import System.IO.Unsafe (unsafePerformIO) | ||
36 | import qualified Control.Exception as Exc | ||
37 | import Data.Bits ((.&.), (.|.), shiftL) | ||
38 | import Data.Typeable | ||
39 | import Control.Arrow (first) | ||
40 | import Data.Maybe (catMaybes) | ||
41 | #include "pipes_text_cbits.h" | ||
42 | |||
43 | |||
44 | |||
45 | -- A stream oriented decoding result. Distinct from the similar type in Data.Text.Encoding | ||
46 | |||
47 | data Decoding = Some Text ByteString (ByteString -> Decoding) | ||
48 | -- Text, continuation and any undecoded fragment. | ||
49 | | Other Text ByteString | ||
50 | -- Text followed by an undecodable ByteString | ||
51 | |||
52 | instance Show Decoding where | ||
53 | showsPrec d (Some t bs _) = showParen (d > prec) $ | ||
54 | showString "Some " . showsPrec prec' t . | ||
55 | showChar ' ' . showsPrec prec' bs . | ||
56 | showString " _" | ||
57 | where prec = 10; prec' = prec + 1 | ||
58 | showsPrec d (Other t bs) = showParen (d > prec) $ | ||
59 | showString "Other " . showsPrec prec' t . | ||
60 | showChar ' ' . showsPrec prec' bs . | ||
61 | showString " _" | ||
62 | where prec = 10; prec' = prec + 1 | ||
63 | |||
64 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | ||
65 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | ||
66 | |||
67 | -- Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'. | ||
68 | streamDecodeUtf8 :: ByteString -> Decoding | ||
69 | streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 | ||
70 | where | ||
71 | decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding | ||
72 | decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = | ||
73 | runST $ do marray <- A.new (len+1) | ||
74 | unsafeIOToST (decodeChunkToBuffer marray) | ||
75 | where | ||
76 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | ||
77 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
78 | with (0::CSize) $ \destOffPtr -> | ||
79 | with codepoint0 $ \codepointPtr -> | ||
80 | with state0 $ \statePtr -> | ||
81 | with nullPtr $ \curPtrPtr -> | ||
82 | do let end = ptr `plusPtr` (off + len) | ||
83 | curPtr = ptr `plusPtr` off | ||
84 | poke curPtrPtr curPtr | ||
85 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | ||
86 | state <- peek statePtr | ||
87 | lastPtr <- peek curPtrPtr | ||
88 | codepoint <- peek codepointPtr | ||
89 | n <- peek destOffPtr | ||
90 | chunkText <- mkText dest n | ||
91 | let left = lastPtr `minusPtr` curPtr | ||
92 | remaining = B.drop left bs | ||
93 | accum = if T.null chunkText then B.append old remaining else remaining | ||
94 | return $! case state of | ||
95 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error | ||
96 | _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) | ||
97 | {-# INLINE decodeChunkToBuffer #-} | ||
98 | {-# INLINE decodeChunkUtf8 #-} | ||
99 | {-# INLINE streamDecodeUtf8 #-} | ||
100 | |||
101 | -- Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble | ||
102 | decodeSomeUtf8 :: ByteString -> (Text, ByteString) | ||
103 | decodeSomeUtf8 bs@(PS fp off len) = runST $ do | ||
104 | dest <- A.new (len+1) | ||
105 | unsafeIOToST $ | ||
106 | withForeignPtr fp $ \ptr -> | ||
107 | with (0::CSize) $ \destOffPtr -> | ||
108 | with (0::CodePoint) $ \codepointPtr -> | ||
109 | with (0::DecoderState) $ \statePtr -> | ||
110 | with nullPtr $ \curPtrPtr -> | ||
111 | do let end = ptr `plusPtr` (off + len) | ||
112 | curPtr = ptr `plusPtr` off | ||
113 | poke curPtrPtr curPtr | ||
114 | c_decode_utf8_with_state (A.maBA dest) destOffPtr | ||
115 | curPtrPtr end codepointPtr statePtr | ||
116 | state <- peek statePtr | ||
117 | lastPtr <- peek curPtrPtr | ||
118 | codepoint <- peek codepointPtr | ||
119 | n <- peek destOffPtr | ||
120 | chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
121 | return $! textP arr 0 (fromIntegral n) | ||
122 | let left = lastPtr `minusPtr` curPtr | ||
123 | remaining = B.drop left bs | ||
124 | return $! (chunkText, remaining) | ||
125 | {-# INLINE decodeSomeUtf8 #-} | ||
126 | |||
127 | mkText :: A.MArray s -> CSize -> IO Text | ||
128 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
129 | return $! textP arr 0 (fromIntegral n) | ||
130 | {-# INLINE mkText #-} | ||
131 | |||
132 | ord :: Char -> Int | ||
133 | ord (C# c#) = I# (ord# c#) | ||
134 | {-# INLINE ord #-} | ||
135 | |||
136 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int | ||
137 | unsafeWrite marr i c | ||
138 | | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) | ||
139 | return 1 | ||
140 | | otherwise = do A.unsafeWrite marr i lo | ||
141 | A.unsafeWrite marr (i+1) hi | ||
142 | return 2 | ||
143 | where n = ord c | ||
144 | m = n - 0x10000 | ||
145 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | ||
146 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | ||
147 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | ||
148 | {-# INLINE shiftR #-} | ||
149 | {-# INLINE unsafeWrite #-} | ||
150 | |||
151 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | ||
152 | :: MutableByteArray# s -> Ptr CSize | ||
153 | -> Ptr (Ptr Word8) -> Ptr Word8 | ||
154 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file | ||
diff --git a/cbits/cbits.c b/cbits/cbits.c deleted file mode 100644 index c11645b..0000000 --- a/cbits/cbits.c +++ /dev/null | |||
@@ -1,168 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>. | ||
3 | * | ||
4 | * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>. | ||
5 | * | ||
6 | * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. | ||
7 | */ | ||
8 | |||
9 | #include <string.h> | ||
10 | #include <stdint.h> | ||
11 | #include <stdio.h> | ||
12 | #include "pipes_text_cbits.h" | ||
13 | |||
14 | |||
15 | |||
16 | #define UTF8_ACCEPT 0 | ||
17 | #define UTF8_REJECT 12 | ||
18 | |||
19 | static const uint8_t utf8d[] = { | ||
20 | /* | ||
21 | * The first part of the table maps bytes to character classes that | ||
22 | * to reduce the size of the transition table and create bitmasks. | ||
23 | */ | ||
24 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
25 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
26 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
27 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
28 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, | ||
29 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, | ||
30 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | ||
31 | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, | ||
32 | |||
33 | /* | ||
34 | * The second part is a transition table that maps a combination of | ||
35 | * a state of the automaton and a character class to a state. | ||
36 | */ | ||
37 | 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, | ||
38 | 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, | ||
39 | 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, | ||
40 | 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, | ||
41 | 12,36,12,12,12,12,12,12,12,12,12,12, | ||
42 | }; | ||
43 | |||
44 | static inline uint32_t | ||
45 | decode(uint32_t *state, uint32_t* codep, uint32_t byte) { | ||
46 | uint32_t type = utf8d[byte]; | ||
47 | |||
48 | *codep = (*state != UTF8_ACCEPT) ? | ||
49 | (byte & 0x3fu) | (*codep << 6) : | ||
50 | (0xff >> type) & (byte); | ||
51 | |||
52 | return *state = utf8d[256 + *state + type]; | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * A best-effort decoder. Runs until it hits either end of input or | ||
57 | * the start of an invalid byte sequence. | ||
58 | * | ||
59 | * At exit, we update *destoff with the next offset to write to, *src | ||
60 | * with the next source location past the last one successfully | ||
61 | * decoded, and return the next source location to read from. | ||
62 | * | ||
63 | * Moreover, we expose the internal decoder state (state0 and | ||
64 | * codepoint0), allowing one to restart the decoder after it | ||
65 | * terminates (say, due to a partial codepoint). | ||
66 | * | ||
67 | * In particular, there are a few possible outcomes, | ||
68 | * | ||
69 | * 1) We decoded the buffer entirely: | ||
70 | * In this case we return srcend | ||
71 | * state0 == UTF8_ACCEPT | ||
72 | * | ||
73 | * 2) We met an invalid encoding | ||
74 | * In this case we return the address of the first invalid byte | ||
75 | * state0 == UTF8_REJECT | ||
76 | * | ||
77 | * 3) We reached the end of the buffer while decoding a codepoint | ||
78 | * In this case we return a pointer to the first byte of the partial codepoint | ||
79 | * state0 != UTF8_ACCEPT, UTF8_REJECT | ||
80 | * | ||
81 | */ | ||
82 | |||
83 | #if defined(__GNUC__) || defined(__clang__) | ||
84 | static inline uint8_t const * | ||
85 | _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, | ||
86 | const uint8_t const **src, const uint8_t const *srcend, | ||
87 | uint32_t *codepoint0, uint32_t *state0) | ||
88 | __attribute((always_inline)); | ||
89 | #endif | ||
90 | |||
91 | static inline uint8_t const * | ||
92 | _hs_pipes_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, | ||
93 | const uint8_t const **src, const uint8_t const *srcend, | ||
94 | uint32_t *codepoint0, uint32_t *state0) | ||
95 | { | ||
96 | uint16_t *d = dest + *destoff; | ||
97 | const uint8_t *s = *src, *last = *src; | ||
98 | uint32_t state = *state0; | ||
99 | uint32_t codepoint = *codepoint0; | ||
100 | |||
101 | while (s < srcend) { | ||
102 | #if defined(__i386__) || defined(__x86_64__) | ||
103 | /* | ||
104 | * This code will only work on a little-endian system that | ||
105 | * supports unaligned loads. | ||
106 | * | ||
107 | * It gives a substantial speed win on data that is purely or | ||
108 | * partly ASCII (e.g. HTML), at only a slight cost on purely | ||
109 | * non-ASCII text. | ||
110 | */ | ||
111 | |||
112 | if (state == UTF8_ACCEPT) { | ||
113 | while (s < srcend - 4) { | ||
114 | codepoint = *((uint32_t *) s); | ||
115 | if ((codepoint & 0x80808080) != 0) | ||
116 | break; | ||
117 | s += 4; | ||
118 | |||
119 | /* | ||
120 | * Tried 32-bit stores here, but the extra bit-twiddling | ||
121 | * slowed the code down. | ||
122 | */ | ||
123 | |||
124 | *d++ = (uint16_t) (codepoint & 0xff); | ||
125 | *d++ = (uint16_t) ((codepoint >> 8) & 0xff); | ||
126 | *d++ = (uint16_t) ((codepoint >> 16) & 0xff); | ||
127 | *d++ = (uint16_t) ((codepoint >> 24) & 0xff); | ||
128 | } | ||
129 | last = s; | ||
130 | } | ||
131 | #endif | ||
132 | |||
133 | if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { | ||
134 | if (state != UTF8_REJECT) | ||
135 | continue; | ||
136 | break; | ||
137 | } | ||
138 | |||
139 | if (codepoint <= 0xffff) | ||
140 | *d++ = (uint16_t) codepoint; | ||
141 | else { | ||
142 | *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); | ||
143 | *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); | ||
144 | } | ||
145 | last = s; | ||
146 | } | ||
147 | |||
148 | *destoff = d - dest; | ||
149 | *codepoint0 = codepoint; | ||
150 | *state0 = state; | ||
151 | *src = last; | ||
152 | |||
153 | return s; | ||
154 | } | ||
155 | |||
156 | uint8_t const * | ||
157 | _hs_pipes_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, | ||
158 | const uint8_t const **src, | ||
159 | const uint8_t const *srcend, | ||
160 | uint32_t *codepoint0, uint32_t *state0) | ||
161 | { | ||
162 | uint8_t const *ret = _hs_pipes_text_decode_utf8_int(dest, destoff, src, srcend, | ||
163 | codepoint0, state0); | ||
164 | if (*state0 == UTF8_REJECT) | ||
165 | ret -=1; | ||
166 | return ret; | ||
167 | } | ||
168 | |||
diff --git a/include/pipes_text_cbits.h b/include/pipes_text_cbits.h deleted file mode 100644 index b9ab670..0000000 --- a/include/pipes_text_cbits.h +++ /dev/null | |||
@@ -1,11 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>. | ||
3 | */ | ||
4 | |||
5 | #ifndef _pipes_text_cbits_h | ||
6 | #define _pipes_text_cbits_h | ||
7 | |||
8 | #define UTF8_ACCEPT 0 | ||
9 | #define UTF8_REJECT 12 | ||
10 | |||
11 | #endif | ||
diff --git a/pipes-text.cabal b/pipes-text.cabal index a1d57bb..017d41c 100644 --- a/pipes-text.cabal +++ b/pipes-text.cabal | |||
@@ -1,5 +1,5 @@ | |||
1 | name: pipes-text | 1 | name: pipes-text |
2 | version: 0.0.0.6 | 2 | version: 0.0.0.7 |
3 | synopsis: Text pipes. | 3 | synopsis: Text pipes. |
4 | description: Many of the pipes and other operations defined here mirror those in | 4 | description: Many of the pipes and other operations defined here mirror those in |
5 | the `pipes-bytestring` library. Folds like `length` and grouping | 5 | the `pipes-bytestring` library. Folds like `length` and grouping |
@@ -36,23 +36,28 @@ source-repository head | |||
36 | type: git | 36 | type: git |
37 | location: https://github.com/michaelt/text-pipes | 37 | location: https://github.com/michaelt/text-pipes |
38 | 38 | ||
39 | flag noio | ||
40 | default: False | ||
41 | Description: Use a version of text earlier than 0.11.3 | ||
39 | 42 | ||
40 | library | 43 | library |
41 | c-sources: cbits/cbits.c | 44 | exposed-modules: Pipes.Text, Pipes.Text.Encoding |
42 | include-dirs: include | 45 | build-depends: base >= 4 && < 5 , |
43 | exposed-modules: Pipes.Text, Pipes.Text.Internal | 46 | bytestring >= 0.9 , |
44 | other-modules: Pipes.Text.Internal.Decoding, Pipes.Text.Internal.Codec | 47 | text >=0.11.3 && < 1.2, |
45 | other-extensions: RankNTypes | 48 | text-stream-decode >= 0.1 && < 0.2, |
46 | build-depends: base >= 4 && < 5 , | 49 | profunctors >= 3.1.1 && < 4.1, |
47 | bytestring >=0.10 && < 0.11, | 50 | pipes >=4.0 && < 4.2, |
48 | text >=0.11.3 && < 1.2, | 51 | pipes-group >= 1.0.0 && < 1.1, |
49 | profunctors >= 3.1.1 && < 4.1 , | 52 | pipes-parse >=2.0 && < 3.1, |
50 | pipes >=4.0 && < 4.2, | ||
51 | pipes-group >= 1.0.0 && < 1.1 , | ||
52 | pipes-parse >=2.0 && < 3.1, | ||
53 | pipes-safe, | 53 | pipes-safe, |
54 | pipes-bytestring >= 1.0 && < 2.1, | 54 | pipes-bytestring >= 1.0 && < 2.1, |
55 | transformers >= 0.2.0.0 && < 0.4 | 55 | transformers >= 0.2.0.0 && < 0.4 |
56 | -- hs-source-dirs: | 56 | other-extensions: RankNTypes |
57 | default-language: Haskell2010 | 57 | default-language: Haskell2010 |
58 | ghc-options: -O2 | 58 | ghc-options: -O2 |
59 | |||
60 | if !flag(noio) | ||
61 | exposed-modules: Pipes.Text.IO | ||
62 | build-depends: text >=0.11.3 && < 1.2 | ||
63 | |||