aboutsummaryrefslogtreecommitdiffhomepage
path: root/Pipes/Text
diff options
context:
space:
mode:
authormichaelt <what_is_it_to_do_anything@yahoo.com>2014-02-15 10:01:48 -0500
committermichaelt <what_is_it_to_do_anything@yahoo.com>2014-02-15 10:01:48 -0500
commitbbdfd3056da4992e18d3983fd5992bee23af93af (patch)
tree5765f3f224708890cf50b400c675c23e759595a2 /Pipes/Text
parentfd27d0c43e2995437cf1f3e2d6f292597371ebb0 (diff)
downloadtext-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.gz
text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.zst
text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.zip
use new text-stream-decoding
Diffstat (limited to 'Pipes/Text')
-rw-r--r--Pipes/Text/Encoding.hs205
-rw-r--r--Pipes/Text/IO.hs96
-rw-r--r--Pipes/Text/Internal.hs7
-rw-r--r--Pipes/Text/Internal/Codec.hs216
-rw-r--r--Pipes/Text/Internal/Decoding.hs154
5 files changed, 301 insertions, 377 deletions
diff --git a/Pipes/Text/Encoding.hs b/Pipes/Text/Encoding.hs
new file mode 100644
index 0000000..2bb5807
--- /dev/null
+++ b/Pipes/Text/Encoding.hs
@@ -0,0 +1,205 @@
1
2{-# LANGUAGE RankNTypes, BangPatterns #-}
3-- |
4-- Copyright: 2014 Michael Thompson
5--
6-- This module uses the stream decoding functions from the text-stream-decoding package
7-- to define pipes decoding functions and lenses.
8
9module Pipes.Text.Encoding
10 ( DecodeResult (..)
11 , Codec
12 , decodeUtf8
13 , decodeUtf8Pure
14 , decodeUtf16LE
15 , decodeUtf16BE
16 , decodeUtf32LE
17 , decodeUtf32BE
18 , utf8
19 , utf8Pure
20 , utf16LE
21 , utf16BE
22 , utf32LE
23 , utf32BE
24 , encodeAscii
25 , decodeAscii
26 , encodeIso8859_1
27 , decodeIso8859_1
28 )
29 where
30
31import Data.Char (ord)
32import Data.ByteString as B
33import Data.ByteString (ByteString)
34import Data.ByteString.Internal as B
35import Data.ByteString.Char8 as B8
36import Data.Text (Text)
37import qualified Data.Text as T
38import qualified Data.Text.Encoding as TE
39import Data.Text.StreamDecoding
40import GHC.Word (Word8, Word32)
41import Data.Word (Word8, Word16)
42import Control.Monad
43import Pipes
44import Pipes.Core
45
46
47
48{- | A 'Codec' is just an improper lens into a byte stream that is expected to contain text.
49 They are named in accordance with the expected encoding, 'utf8', 'utf16LE' etc.
50 The stream of text they 'see' in a bytestream ends by returning the original byte stream
51 beginning at the point of failure, or the empty bytestream with its return value.
52 -}
53type Codec = forall f m r . (Functor f , Monad m ) =>
54 (Producer Text m (Producer ByteString m r) -> f (Producer Text m (Producer ByteString m r)))
55 -> Producer ByteString m r -> f (Producer ByteString m r )
56
57decodeStream :: Monad m
58 => (B.ByteString -> DecodeResult)
59 -> Producer ByteString m r -> Producer Text m (Producer ByteString m r)
60decodeStream = loop where
61 loop dec0 p =
62 do x <- lift (next p)
63 case x of Left r -> return (return r)
64 Right (chunk, p') -> case dec0 chunk of
65 DecodeResultSuccess text dec -> do yield text
66 loop dec p'
67 DecodeResultFailure text bs -> do yield text
68 return (do yield bs
69 p')
70{-# INLINABLE decodeStream#-}
71
72decodeUtf8 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
73decodeUtf8 = decodeStream streamUtf8
74{-# INLINE decodeUtf8 #-}
75
76decodeUtf8Pure :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
77decodeUtf8Pure = decodeStream streamUtf8Pure
78{-# INLINE decodeUtf8Pure #-}
79
80decodeUtf16LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
81decodeUtf16LE = decodeStream streamUtf16LE
82{-# INLINE decodeUtf16LE #-}
83
84decodeUtf16BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
85decodeUtf16BE = decodeStream streamUtf16BE
86{-# INLINE decodeUtf16BE #-}
87
88decodeUtf32LE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
89decodeUtf32LE = decodeStream streamUtf32LE
90{-# INLINE decodeUtf32LE #-}
91
92decodeUtf32BE :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
93decodeUtf32BE = decodeStream streamUtf32BE
94{-# INLINE decodeUtf32BE #-}
95
96mkCodec :: (forall r m . Monad m =>
97 Producer ByteString m r -> Producer Text m (Producer ByteString m r ))
98 -> (Text -> ByteString)
99 -> Codec
100mkCodec dec enc = \k p0 -> fmap (\p -> join (for p (yield . enc))) (k (dec p0))
101
102
103{- | An improper lens into a byte stream expected to be UTF-8 encoded; the associated
104 text stream ends by returning the original bytestream beginning at the point of failure,
105 or the empty bytestring for a well-encoded text.
106 -}
107
108utf8 :: Codec
109utf8 = mkCodec decodeUtf8 TE.encodeUtf8
110
111utf8Pure :: Codec
112utf8Pure = mkCodec decodeUtf8Pure TE.encodeUtf8
113
114utf16LE :: Codec
115utf16LE = mkCodec decodeUtf16LE TE.encodeUtf16LE
116
117utf16BE :: Codec
118utf16BE = mkCodec decodeUtf16BE TE.encodeUtf16BE
119
120utf32LE :: Codec
121utf32LE = mkCodec decodeUtf32LE TE.encodeUtf32LE
122
123utf32BE :: Codec
124utf32BE = mkCodec decodeUtf32BE TE.encodeUtf32BE
125
126
127{- | ascii and latin encodings only use a small number of the characters 'Text'
128 recognizes; thus we cannot use the pipes 'Lens' style to work with them.
129 Rather we simply define functions each way.
130
131 'encodeAscii' : Reduce as much of your stream of 'Text' actually is ascii to a byte stream,
132 returning the rest of the 'Text' at the first non-ascii 'Char'
133-}
134
135encodeAscii :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
136encodeAscii = go where
137 go p = do e <- lift (next p)
138 case e of
139 Left r -> return (return r)
140 Right (chunk, p') ->
141 if T.null chunk
142 then go p'
143 else let (safe, unsafe) = T.span (\c -> ord c <= 0x7F) chunk
144 in do yield (B8.pack (T.unpack safe))
145 if T.null unsafe
146 then go p'
147 else return $ do yield unsafe
148 p'
149
150{- | Reduce as much of your stream of 'Text' actually is iso8859 or latin1 to a byte stream,
151 returning the rest of the 'Text' upon hitting any non-latin 'Char'
152 -}
153encodeIso8859_1 :: Monad m => Producer Text m r -> Producer ByteString m (Producer Text m r)
154encodeIso8859_1 = go where
155 go p = do e <- lift (next p)
156 case e of
157 Left r -> return (return r)
158 Right (txt, p') ->
159 if T.null txt
160 then go p'
161 else let (safe, unsafe) = T.span (\c -> ord c <= 0xFF) txt
162 in do yield (B8.pack (T.unpack safe))
163 if T.null unsafe
164 then go p'
165 else return $ do yield unsafe
166 p'
167
168{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
169 unused 'ByteString' upon hitting an un-ascii byte.
170 -}
171decodeAscii :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
172decodeAscii = go where
173 go p = do e <- lift (next p)
174 case e of
175 Left r -> return (return r)
176 Right (chunk, p') ->
177 if B.null chunk
178 then go p'
179 else let (safe, unsafe) = B.span (<= 0x7F) chunk
180 in do yield (T.pack (B8.unpack safe))
181 if B.null unsafe
182 then go p'
183 else return (do yield unsafe
184 p')
185
186{- | Reduce a byte stream to a corresponding stream of ascii chars, returning the
187 unused 'ByteString' upon hitting the rare un-latinizable byte.
188 -}
189decodeIso8859_1 :: Monad m => Producer ByteString m r -> Producer Text m (Producer ByteString m r)
190decodeIso8859_1 = go where
191 go p = do e <- lift (next p)
192 case e of
193 Left r -> return (return r)
194 Right (chunk, p') ->
195 if B.null chunk
196 then go p'
197 else do let (safe, unsafe) = B.span (<= 0xFF) chunk
198 yield (T.pack (B8.unpack safe))
199 if B.null unsafe
200 then go p'
201 else return (do yield unsafe
202 p')
203
204
205
diff --git a/Pipes/Text/IO.hs b/Pipes/Text/IO.hs
new file mode 100644
index 0000000..3c9ac98
--- /dev/null
+++ b/Pipes/Text/IO.hs
@@ -0,0 +1,96 @@
1{-#LANGUAGE RankNTypes#-}
2
3module Pipes.Text.IO
4 ( stdin
5 , stdout
6 , fromHandle
7 , toHandle
8 , readFile
9 , writeFile
10 ) where
11
12import qualified System.IO as IO
13import Control.Exception (throwIO, try)
14import Foreign.C.Error (Errno(Errno), ePIPE)
15import qualified GHC.IO.Exception as G
16import Data.Text (Text)
17import qualified Data.Text as T
18import qualified Data.Text.IO as T
19import Pipes
20import qualified Pipes.Safe.Prelude as Safe
21import qualified Pipes.Safe as Safe
22import Pipes.Safe (MonadSafe(..), Base(..))
23import Prelude hiding (readFile, writeFile)
24
25-- | Stream text from 'stdin'
26stdin :: MonadIO m => Producer Text m ()
27stdin = fromHandle IO.stdin
28{-# INLINE stdin #-}
29
30{-| Convert a 'IO.Handle' into a text stream using a text size
31 determined by the good sense of the text library; note that this
32 is distinctly slower than @decideUtf8 (Pipes.ByteString.fromHandle h)@
33 but uses the system encoding and has other `Data.Text.IO` features
34-}
35
36fromHandle :: MonadIO m => IO.Handle -> Producer Text m ()
37fromHandle h = go where
38 go = do txt <- liftIO (T.hGetChunk h)
39 if T.null txt then return ()
40 else do yield txt
41 go
42{-# INLINABLE fromHandle#-}
43
44
45{-| Stream text from a file in the simple fashion of @Data.Text.IO@
46
47>>> runSafeT $ runEffect $ Text.readFile "hello.hs" >-> Text.map toUpper >-> hoist lift Text.stdout
48MAIN = PUTSTRLN "HELLO WORLD"
49-}
50
51readFile :: MonadSafe m => FilePath -> Producer Text m ()
52readFile file = Safe.withFile file IO.ReadMode fromHandle
53{-# INLINE readFile #-}
54
55
56{-| Stream text to 'stdout'
57
58 Unlike 'toHandle', 'stdout' gracefully terminates on a broken output pipe.
59
60 Note: For best performance, it might be best just to use @(for source (liftIO . putStr))@
61 instead of @(source >-> stdout)@ .
62-}
63stdout :: MonadIO m => Consumer' Text m ()
64stdout = go
65 where
66 go = do
67 txt <- await
68 x <- liftIO $ try (T.putStr txt)
69 case x of
70 Left (G.IOError { G.ioe_type = G.ResourceVanished
71 , G.ioe_errno = Just ioe })
72 | Errno ioe == ePIPE
73 -> return ()
74 Left e -> liftIO (throwIO e)
75 Right () -> go
76{-# INLINABLE stdout #-}
77
78
79{-| Convert a text stream into a 'Handle'
80
81 Note: again, for best performance, where possible use
82 @(for source (liftIO . hPutStr handle))@ instead of @(source >-> toHandle handle)@.
83-}
84toHandle :: MonadIO m => IO.Handle -> Consumer' Text m r
85toHandle h = for cat (liftIO . T.hPutStr h)
86{-# INLINABLE toHandle #-}
87
88{-# RULES "p >-> toHandle h" forall p h .
89 p >-> toHandle h = for p (\txt -> liftIO (T.hPutStr h txt))
90 #-}
91
92
93-- | Stream text into a file. Uses @pipes-safe@.
94writeFile :: (MonadSafe m) => FilePath -> Consumer' Text m ()
95writeFile file = Safe.withFile file IO.WriteMode toHandle
96{-# INLINE writeFile #-}
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
deleted file mode 100644
index 582ef14..0000000
--- a/Pipes/Text/Internal.hs
+++ /dev/null
@@ -1,7 +0,0 @@
1module Pipes.Text.Internal
2 (module Pipes.Text.Internal.Codec
3 , module Pipes.Text.Internal.Decoding
4 ) where
5
6import Pipes.Text.Internal.Codec
7import Pipes.Text.Internal.Decoding \ No newline at end of file
diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs
deleted file mode 100644
index 075a152..0000000
--- a/Pipes/Text/Internal/Codec.hs
+++ /dev/null
@@ -1,216 +0,0 @@
1
2{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
3-- |
4-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
5-- License: MIT
6-- This Parts of this code were taken from enumerator and conduits, and adapted for pipes
7
8-- This module follows the model of the enumerator and conduits libraries, and defines
9-- 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and
10-- iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so
11-- special functions appear in @Pipes.Text@
12
13
14module Pipes.Text.Internal.Codec
15 ( Codec(..)
16 , TextException(..)
17 , utf8
18 , utf16_le
19 , utf16_be
20 , utf32_le
21 , utf32_be
22 ) where
23
24import Data.Bits ((.&.))
25import Data.Char (ord)
26import Data.ByteString as B
27import Data.ByteString (ByteString)
28import Data.ByteString.Internal as B
29import Data.ByteString.Char8 as B8
30import Data.Text (Text)
31import qualified Data.Text as T
32import qualified Data.Text.Encoding as TE
33import Data.Text.Encoding.Error ()
34import GHC.Word (Word8, Word32)
35import qualified Data.Text.Array as A
36import Data.Word (Word8, Word16)
37import System.IO.Unsafe (unsafePerformIO)
38import qualified Control.Exception as Exc
39import Data.Bits ((.&.), (.|.), shiftL)
40import Data.Typeable
41import Control.Arrow (first)
42import Data.Maybe (catMaybes)
43import Pipes.Text.Internal.Decoding
44import Pipes
45-- | A specific character encoding.
46
47data Codec = Codec
48 { codecName :: Text
49 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
50 , codecDecode :: ByteString -> Decoding
51 }
52
53instance Show Codec where
54 showsPrec d c = showParen (d > 10) $
55 showString "Codec " . shows (codecName c)
56
57data TextException = DecodeException Codec Word8
58 | EncodeException Codec Char
59 | LengthExceeded Int
60 | TextException Exc.SomeException
61 deriving (Show, Typeable)
62instance Exc.Exception TextException
63
64
65toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
66 -> (ByteString -> Decoding)
67toDecoding op = loop B.empty where
68 loop !extra bs0 = case op (B.append extra bs0) of
69 (txt, Right bs) -> Some txt bs (loop bs)
70 (txt, Left (_,bs)) -> Other txt bs
71-- To do: toDecoding should be inlined in each of the 'Codec' definitions
72-- or else Codec changed to the conduit/enumerator definition. We have
73-- altered it to use 'streamDecodeUtf8'
74
75splitSlowly :: (ByteString -> Text)
76 -> ByteString
77 -> (Text, Either (TextException, ByteString) ByteString)
78splitSlowly dec bytes = valid where
79 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
80 splits 0 = [(B.empty, bytes)]
81 splits n = B.splitAt n bytes : splits (n - 1)
82 decFirst (a, b) = case tryEvaluate (dec a) of
83 Left _ -> Nothing
84 Right text -> let trouble = case tryEvaluate (dec b) of
85 Left exc -> Left (TextException exc, b)
86 Right _ -> Right B.empty
87 in Just (text, trouble) -- this case shouldn't occur,
88 -- since splitSlowly is only called
89 -- when parsing failed somewhere
90
91utf8 :: Codec
92utf8 = Codec name enc (toDecoding dec) where
93 name = T.pack "UTF-8"
94 enc text = (TE.encodeUtf8 text, Nothing)
95 dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
96
97-- -- Whether the given byte is a continuation byte.
98-- isContinuation byte = byte .&. 0xC0 == 0x80
99--
100-- -- The number of continuation bytes needed by the given
101-- -- non-continuation byte. Returns -1 for an illegal UTF-8
102-- -- non-continuation byte and the whole split quickly must fail so
103-- -- as the input is passed to TE.decodeUtf8, which will issue a
104-- -- suitable error.
105-- required x0
106-- | x0 .&. 0x80 == 0x00 = 0
107-- | x0 .&. 0xE0 == 0xC0 = 1
108-- | x0 .&. 0xF0 == 0xE0 = 2
109-- | x0 .&. 0xF8 == 0xF0 = 3
110-- | otherwise = -1
111--
112-- splitQuickly bytes
113-- | B.null l || req == -1 = Nothing
114-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
115-- | otherwise = Just (TE.decodeUtf8 l', r')
116-- where
117-- (l, r) = B.spanEnd isContinuation bytes
118-- req = required (B.last l)
119-- l' = B.init l
120-- r' = B.cons (B.last l) r
121
122
123utf16_le :: Codec
124utf16_le = Codec name enc (toDecoding dec) where
125 name = T.pack "UTF-16-LE"
126 enc text = (TE.encodeUtf16LE text, Nothing)
127 dec bytes = case splitQuickly bytes of
128 Just (text, extra) -> (text, Right extra)
129 Nothing -> splitSlowly TE.decodeUtf16LE bytes
130
131 splitQuickly bytes = maybeDecode (loop 0) where
132 maxN = B.length bytes
133
134 loop n | n == maxN = decodeAll
135 | (n + 1) == maxN = decodeTo n
136 loop n = let
137 req = utf16Required
138 (B.index bytes n)
139 (B.index bytes (n + 1))
140 decodeMore = loop $! n + req
141 in if n + req > maxN
142 then decodeTo n
143 else decodeMore
144
145 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
146 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
147
148utf16_be :: Codec
149utf16_be = Codec name enc (toDecoding dec) where
150 name = T.pack "UTF-16-BE"
151 enc text = (TE.encodeUtf16BE text, Nothing)
152 dec bytes = case splitQuickly bytes of
153 Just (text, extra) -> (text, Right extra)
154 Nothing -> splitSlowly TE.decodeUtf16BE bytes
155
156 splitQuickly bytes = maybeDecode (loop 0) where
157 maxN = B.length bytes
158
159 loop n | n == maxN = decodeAll
160 | (n + 1) == maxN = decodeTo n
161 loop n = let
162 req = utf16Required
163 (B.index bytes (n + 1))
164 (B.index bytes n)
165 decodeMore = loop $! n + req
166 in if n + req > maxN
167 then decodeTo n
168 else decodeMore
169
170 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
171 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
172
173utf16Required :: Word8 -> Word8 -> Int
174utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
175 x :: Word16
176 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
177
178
179utf32_le :: Codec
180utf32_le = Codec name enc (toDecoding dec) where
181 name = T.pack "UTF-32-LE"
182 enc text = (TE.encodeUtf32LE text, Nothing)
183 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
184 Just (text, extra) -> (text, Right extra)
185 Nothing -> splitSlowly TE.decodeUtf32LE bs
186
187
188utf32_be :: Codec
189utf32_be = Codec name enc (toDecoding dec) where
190 name = T.pack "UTF-32-BE"
191 enc text = (TE.encodeUtf32BE text, Nothing)
192 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
193 Just (text, extra) -> (text, Right extra)
194 Nothing -> splitSlowly TE.decodeUtf32BE bs
195
196utf32SplitBytes :: (ByteString -> Text)
197 -> ByteString
198 -> Maybe (Text, ByteString)
199utf32SplitBytes dec bytes = split where
200 split = maybeDecode (dec toDecode, extra)
201 len = B.length bytes
202 lenExtra = mod len 4
203
204 lenToDecode = len - lenExtra
205 (toDecode, extra) = if lenExtra == 0
206 then (bytes, B.empty)
207 else B.splitAt lenToDecode bytes
208
209
210tryEvaluate :: a -> Either Exc.SomeException a
211tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
212
213maybeDecode :: (a, b) -> Maybe (a, b)
214maybeDecode (a, b) = case tryEvaluate a of
215 Left _ -> Nothing
216 Right _ -> Just (a, b)
diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs
deleted file mode 100644
index b5d928a..0000000
--- a/Pipes/Text/Internal/Decoding.hs
+++ /dev/null
@@ -1,154 +0,0 @@
1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
3{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
4
5-- This module lifts assorted materials from Brian O'Sullivan's text package
6-- especially @Data.Text.Encoding@ in order to define a pipes-appropriate
7-- 'streamDecodeUtf8'
8
9module Pipes.Text.Internal.Decoding
10 ( Decoding(..)
11 , streamDecodeUtf8
12 , decodeSomeUtf8
13 ) where
14import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
15import Control.Monad.ST (ST, runST)
16import Data.Bits ((.&.))
17import Data.ByteString as B
18import Data.ByteString (ByteString)
19import Data.ByteString.Internal as B
20import Data.ByteString.Char8 as B8
21import Data.Text (Text)
22import qualified Data.Text as T
23import qualified Data.Text.Encoding as TE
24import Data.Text.Encoding.Error ()
25import Data.Text.Internal (Text, textP)
26import Foreign.C.Types (CSize)
27import Foreign.ForeignPtr (withForeignPtr)
28import Foreign.Marshal.Utils (with)
29import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
30import Foreign.Storable (Storable, peek, poke)
31import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
32import GHC.Word (Word8, Word32)
33import qualified Data.Text.Array as A
34import Data.Word (Word8, Word16)
35import System.IO.Unsafe (unsafePerformIO)
36import qualified Control.Exception as Exc
37import Data.Bits ((.&.), (.|.), shiftL)
38import Data.Typeable
39import Control.Arrow (first)
40import Data.Maybe (catMaybes)
41#include "pipes_text_cbits.h"
42
43
44
45-- A stream oriented decoding result. Distinct from the similar type in Data.Text.Encoding
46
47data Decoding = Some Text ByteString (ByteString -> Decoding)
48 -- Text, continuation and any undecoded fragment.
49 | Other Text ByteString
50 -- Text followed by an undecodable ByteString
51
52instance Show Decoding where
53 showsPrec d (Some t bs _) = showParen (d > prec) $
54 showString "Some " . showsPrec prec' t .
55 showChar ' ' . showsPrec prec' bs .
56 showString " _"
57 where prec = 10; prec' = prec + 1
58 showsPrec d (Other t bs) = showParen (d > prec) $
59 showString "Other " . showsPrec prec' t .
60 showChar ' ' . showsPrec prec' bs .
61 showString " _"
62 where prec = 10; prec' = prec + 1
63
64newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
65newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
66
67-- Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'.
68streamDecodeUtf8 :: ByteString -> Decoding
69streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
70 where
71 decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
72 decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
73 runST $ do marray <- A.new (len+1)
74 unsafeIOToST (decodeChunkToBuffer marray)
75 where
76 decodeChunkToBuffer :: A.MArray s -> IO Decoding
77 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
78 with (0::CSize) $ \destOffPtr ->
79 with codepoint0 $ \codepointPtr ->
80 with state0 $ \statePtr ->
81 with nullPtr $ \curPtrPtr ->
82 do let end = ptr `plusPtr` (off + len)
83 curPtr = ptr `plusPtr` off
84 poke curPtrPtr curPtr
85 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
86 state <- peek statePtr
87 lastPtr <- peek curPtrPtr
88 codepoint <- peek codepointPtr
89 n <- peek destOffPtr
90 chunkText <- mkText dest n
91 let left = lastPtr `minusPtr` curPtr
92 remaining = B.drop left bs
93 accum = if T.null chunkText then B.append old remaining else remaining
94 return $! case state of
95 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
96 _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
97 {-# INLINE decodeChunkToBuffer #-}
98 {-# INLINE decodeChunkUtf8 #-}
99{-# INLINE streamDecodeUtf8 #-}
100
101-- Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble
102decodeSomeUtf8 :: ByteString -> (Text, ByteString)
103decodeSomeUtf8 bs@(PS fp off len) = runST $ do
104 dest <- A.new (len+1)
105 unsafeIOToST $
106 withForeignPtr fp $ \ptr ->
107 with (0::CSize) $ \destOffPtr ->
108 with (0::CodePoint) $ \codepointPtr ->
109 with (0::DecoderState) $ \statePtr ->
110 with nullPtr $ \curPtrPtr ->
111 do let end = ptr `plusPtr` (off + len)
112 curPtr = ptr `plusPtr` off
113 poke curPtrPtr curPtr
114 c_decode_utf8_with_state (A.maBA dest) destOffPtr
115 curPtrPtr end codepointPtr statePtr
116 state <- peek statePtr
117 lastPtr <- peek curPtrPtr
118 codepoint <- peek codepointPtr
119 n <- peek destOffPtr
120 chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
121 return $! textP arr 0 (fromIntegral n)
122 let left = lastPtr `minusPtr` curPtr
123 remaining = B.drop left bs
124 return $! (chunkText, remaining)
125{-# INLINE decodeSomeUtf8 #-}
126
127mkText :: A.MArray s -> CSize -> IO Text
128mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
129 return $! textP arr 0 (fromIntegral n)
130{-# INLINE mkText #-}
131
132ord :: Char -> Int
133ord (C# c#) = I# (ord# c#)
134{-# INLINE ord #-}
135
136unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
137unsafeWrite marr i c
138 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
139 return 1
140 | otherwise = do A.unsafeWrite marr i lo
141 A.unsafeWrite marr (i+1) hi
142 return 2
143 where n = ord c
144 m = n - 0x10000
145 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
146 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
147 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
148 {-# INLINE shiftR #-}
149{-# INLINE unsafeWrite #-}
150
151foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
152 :: MutableByteArray# s -> Ptr CSize
153 -> Ptr (Ptr Word8) -> Ptr Word8
154 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file