diff options
Diffstat (limited to 'Pipes')
-rw-r--r-- | Pipes/Text.hs | 5 | ||||
-rw-r--r-- | Pipes/Text/Internal/Codec.hs | 215 | ||||
-rw-r--r-- | Pipes/Text/Internal/Decoding.hs | 147 |
3 files changed, 365 insertions, 2 deletions
diff --git a/Pipes/Text.hs b/Pipes/Text.hs index 18ec8ec..0957a7d 100644 --- a/Pipes/Text.hs +++ b/Pipes/Text.hs | |||
@@ -167,6 +167,7 @@ module Pipes.Text ( | |||
167 | , module Data.Word | 167 | , module Data.Word |
168 | , module Pipes.Parse | 168 | , module Pipes.Parse |
169 | , module Pipes.Group | 169 | , module Pipes.Group |
170 | , module Pipes.Text.Internal.Codec | ||
170 | ) where | 171 | ) where |
171 | 172 | ||
172 | import Control.Exception (throwIO, try) | 173 | import Control.Exception (throwIO, try) |
@@ -196,8 +197,8 @@ import Foreign.C.Error (Errno(Errno), ePIPE) | |||
196 | import qualified GHC.IO.Exception as G | 197 | import qualified GHC.IO.Exception as G |
197 | import Pipes | 198 | import Pipes |
198 | import qualified Pipes.ByteString as PB | 199 | import qualified Pipes.ByteString as PB |
199 | import qualified Pipes.Text.Internal as PE | 200 | import qualified Pipes.Text.Internal.Decoding as PE |
200 | import Pipes.Text.Codec | 201 | import Pipes.Text.Internal.Codec |
201 | import Pipes.Core (respond, Server') | 202 | import Pipes.Core (respond, Server') |
202 | import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) | 203 | import Pipes.Group (concats, intercalates, FreeT(..), FreeF(..)) |
203 | import qualified Pipes.Group as PG | 204 | import qualified Pipes.Group as PG |
diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs new file mode 100644 index 0000000..4b9367f --- /dev/null +++ b/Pipes/Text/Internal/Codec.hs | |||
@@ -0,0 +1,215 @@ | |||
1 | |||
2 | {-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-} | ||
3 | -- | | ||
4 | -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin | ||
5 | -- License: MIT | ||
6 | -- | ||
7 | -- Parts of this code were taken from enumerator and conduits, and adapted for pipes. | ||
8 | |||
9 | module Pipes.Text.Internal.Codec | ||
10 | ( Decoding(..) | ||
11 | , streamDecodeUtf8 | ||
12 | , decodeSomeUtf8 | ||
13 | , Codec(..) | ||
14 | , TextException(..) | ||
15 | , utf8 | ||
16 | , utf16_le | ||
17 | , utf16_be | ||
18 | , utf32_le | ||
19 | , utf32_be | ||
20 | ) where | ||
21 | |||
22 | import Data.Bits ((.&.)) | ||
23 | import Data.Char (ord) | ||
24 | import Data.ByteString as B | ||
25 | import Data.ByteString (ByteString) | ||
26 | import Data.ByteString.Internal as B | ||
27 | import Data.ByteString.Char8 as B8 | ||
28 | import Data.Text (Text) | ||
29 | import qualified Data.Text as T | ||
30 | import qualified Data.Text.Encoding as TE | ||
31 | import Data.Text.Encoding.Error () | ||
32 | import GHC.Word (Word8, Word32) | ||
33 | import qualified Data.Text.Array as A | ||
34 | import Data.Word (Word8, Word16) | ||
35 | import System.IO.Unsafe (unsafePerformIO) | ||
36 | import qualified Control.Exception as Exc | ||
37 | import Data.Bits ((.&.), (.|.), shiftL) | ||
38 | import Data.Typeable | ||
39 | import Control.Arrow (first) | ||
40 | import Data.Maybe (catMaybes) | ||
41 | import Pipes.Text.Internal.Decoding | ||
42 | import Pipes | ||
43 | -- | A specific character encoding. | ||
44 | -- | ||
45 | -- Since 0.3.0 | ||
46 | data Codec = Codec | ||
47 | { codecName :: Text | ||
48 | , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) | ||
49 | , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString) | ||
50 | } | ||
51 | |||
52 | instance Show Codec where | ||
53 | showsPrec d c = showParen (d > 10) $ | ||
54 | showString "Codec " . shows (codecName c) | ||
55 | |||
56 | data TextException = DecodeException Codec Word8 | ||
57 | | EncodeException Codec Char | ||
58 | | LengthExceeded Int | ||
59 | | TextException Exc.SomeException | ||
60 | deriving (Show, Typeable) | ||
61 | instance Exc.Exception TextException | ||
62 | |||
63 | |||
64 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) | ||
65 | -> (ByteString -> Decoding) | ||
66 | toDecoding op = loop B.empty where | ||
67 | loop !extra bs0 = case op (B.append extra bs0) of | ||
68 | (txt, Right bs) -> Some txt bs (loop bs) | ||
69 | (txt, Left (_,bs)) -> Other txt bs | ||
70 | -- To do: toDecoding should be inlined in each of the 'Codec' definitions | ||
71 | -- or else Codec changed to the conduit/enumerator definition. We have | ||
72 | -- altered it to use 'streamDecodeUtf8' | ||
73 | |||
74 | splitSlowly :: (ByteString -> Text) | ||
75 | -> ByteString | ||
76 | -> (Text, Either (TextException, ByteString) ByteString) | ||
77 | splitSlowly dec bytes = valid where | ||
78 | valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) | ||
79 | splits 0 = [(B.empty, bytes)] | ||
80 | splits n = B.splitAt n bytes : splits (n - 1) | ||
81 | decFirst (a, b) = case tryEvaluate (dec a) of | ||
82 | Left _ -> Nothing | ||
83 | Right text -> let trouble = case tryEvaluate (dec b) of | ||
84 | Left exc -> Left (TextException exc, b) | ||
85 | Right _ -> Right B.empty | ||
86 | in Just (text, trouble) -- this case shouldn't occur, | ||
87 | -- since splitSlowly is only called | ||
88 | -- when parsing failed somewhere | ||
89 | |||
90 | utf8 :: Codec | ||
91 | utf8 = Codec name enc (toDecoding dec) where | ||
92 | name = T.pack "UTF-8" | ||
93 | enc text = (TE.encodeUtf8 text, Nothing) | ||
94 | dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b) | ||
95 | |||
96 | -- -- Whether the given byte is a continuation byte. | ||
97 | -- isContinuation byte = byte .&. 0xC0 == 0x80 | ||
98 | -- | ||
99 | -- -- The number of continuation bytes needed by the given | ||
100 | -- -- non-continuation byte. Returns -1 for an illegal UTF-8 | ||
101 | -- -- non-continuation byte and the whole split quickly must fail so | ||
102 | -- -- as the input is passed to TE.decodeUtf8, which will issue a | ||
103 | -- -- suitable error. | ||
104 | -- required x0 | ||
105 | -- | x0 .&. 0x80 == 0x00 = 0 | ||
106 | -- | x0 .&. 0xE0 == 0xC0 = 1 | ||
107 | -- | x0 .&. 0xF0 == 0xE0 = 2 | ||
108 | -- | x0 .&. 0xF8 == 0xF0 = 3 | ||
109 | -- | otherwise = -1 | ||
110 | -- | ||
111 | -- splitQuickly bytes | ||
112 | -- | B.null l || req == -1 = Nothing | ||
113 | -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) | ||
114 | -- | otherwise = Just (TE.decodeUtf8 l', r') | ||
115 | -- where | ||
116 | -- (l, r) = B.spanEnd isContinuation bytes | ||
117 | -- req = required (B.last l) | ||
118 | -- l' = B.init l | ||
119 | -- r' = B.cons (B.last l) r | ||
120 | |||
121 | |||
122 | utf16_le :: Codec | ||
123 | utf16_le = Codec name enc (toDecoding dec) where | ||
124 | name = T.pack "UTF-16-LE" | ||
125 | enc text = (TE.encodeUtf16LE text, Nothing) | ||
126 | dec bytes = case splitQuickly bytes of | ||
127 | Just (text, extra) -> (text, Right extra) | ||
128 | Nothing -> splitSlowly TE.decodeUtf16LE bytes | ||
129 | |||
130 | splitQuickly bytes = maybeDecode (loop 0) where | ||
131 | maxN = B.length bytes | ||
132 | |||
133 | loop n | n == maxN = decodeAll | ||
134 | | (n + 1) == maxN = decodeTo n | ||
135 | loop n = let | ||
136 | req = utf16Required | ||
137 | (B.index bytes n) | ||
138 | (B.index bytes (n + 1)) | ||
139 | decodeMore = loop $! n + req | ||
140 | in if n + req > maxN | ||
141 | then decodeTo n | ||
142 | else decodeMore | ||
143 | |||
144 | decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) | ||
145 | decodeAll = (TE.decodeUtf16LE bytes, B.empty) | ||
146 | |||
147 | utf16_be :: Codec | ||
148 | utf16_be = Codec name enc (toDecoding dec) where | ||
149 | name = T.pack "UTF-16-BE" | ||
150 | enc text = (TE.encodeUtf16BE text, Nothing) | ||
151 | dec bytes = case splitQuickly bytes of | ||
152 | Just (text, extra) -> (text, Right extra) | ||
153 | Nothing -> splitSlowly TE.decodeUtf16BE bytes | ||
154 | |||
155 | splitQuickly bytes = maybeDecode (loop 0) where | ||
156 | maxN = B.length bytes | ||
157 | |||
158 | loop n | n == maxN = decodeAll | ||
159 | | (n + 1) == maxN = decodeTo n | ||
160 | loop n = let | ||
161 | req = utf16Required | ||
162 | (B.index bytes (n + 1)) | ||
163 | (B.index bytes n) | ||
164 | decodeMore = loop $! n + req | ||
165 | in if n + req > maxN | ||
166 | then decodeTo n | ||
167 | else decodeMore | ||
168 | |||
169 | decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) | ||
170 | decodeAll = (TE.decodeUtf16BE bytes, B.empty) | ||
171 | |||
172 | utf16Required :: Word8 -> Word8 -> Int | ||
173 | utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where | ||
174 | x :: Word16 | ||
175 | x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 | ||
176 | |||
177 | |||
178 | utf32_le :: Codec | ||
179 | utf32_le = Codec name enc (toDecoding dec) where | ||
180 | name = T.pack "UTF-32-LE" | ||
181 | enc text = (TE.encodeUtf32LE text, Nothing) | ||
182 | dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of | ||
183 | Just (text, extra) -> (text, Right extra) | ||
184 | Nothing -> splitSlowly TE.decodeUtf32LE bs | ||
185 | |||
186 | |||
187 | utf32_be :: Codec | ||
188 | utf32_be = Codec name enc (toDecoding dec) where | ||
189 | name = T.pack "UTF-32-BE" | ||
190 | enc text = (TE.encodeUtf32BE text, Nothing) | ||
191 | dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of | ||
192 | Just (text, extra) -> (text, Right extra) | ||
193 | Nothing -> splitSlowly TE.decodeUtf32BE bs | ||
194 | |||
195 | utf32SplitBytes :: (ByteString -> Text) | ||
196 | -> ByteString | ||
197 | -> Maybe (Text, ByteString) | ||
198 | utf32SplitBytes dec bytes = split where | ||
199 | split = maybeDecode (dec toDecode, extra) | ||
200 | len = B.length bytes | ||
201 | lenExtra = mod len 4 | ||
202 | |||
203 | lenToDecode = len - lenExtra | ||
204 | (toDecode, extra) = if lenExtra == 0 | ||
205 | then (bytes, B.empty) | ||
206 | else B.splitAt lenToDecode bytes | ||
207 | |||
208 | |||
209 | tryEvaluate :: a -> Either Exc.SomeException a | ||
210 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate | ||
211 | |||
212 | maybeDecode :: (a, b) -> Maybe (a, b) | ||
213 | maybeDecode (a, b) = case tryEvaluate a of | ||
214 | Left _ -> Nothing | ||
215 | Right _ -> Just (a, b) | ||
diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs new file mode 100644 index 0000000..531104a --- /dev/null +++ b/Pipes/Text/Internal/Decoding.hs | |||
@@ -0,0 +1,147 @@ | |||
1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-} | ||
2 | {-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} | ||
3 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | ||
4 | |||
5 | -- This module lifts assorted materials from Brian O'Sullivan's text package | ||
6 | -- especially Data.Text.Encoding in order to define a pipes-appropriate | ||
7 | -- streamDecodeUtf8 | ||
8 | module Pipes.Text.Internal.Decoding | ||
9 | ( Decoding(..) | ||
10 | , streamDecodeUtf8 | ||
11 | , decodeSomeUtf8 | ||
12 | ) where | ||
13 | import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) | ||
14 | import Control.Monad.ST (ST, runST) | ||
15 | import Data.Bits ((.&.)) | ||
16 | import Data.ByteString as B | ||
17 | import Data.ByteString (ByteString) | ||
18 | import Data.ByteString.Internal as B | ||
19 | import Data.ByteString.Char8 as B8 | ||
20 | import Data.Text (Text) | ||
21 | import qualified Data.Text as T | ||
22 | import qualified Data.Text.Encoding as TE | ||
23 | import Data.Text.Encoding.Error () | ||
24 | import Data.Text.Internal (Text, textP) | ||
25 | import Foreign.C.Types (CSize) | ||
26 | import Foreign.ForeignPtr (withForeignPtr) | ||
27 | import Foreign.Marshal.Utils (with) | ||
28 | import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) | ||
29 | import Foreign.Storable (Storable, peek, poke) | ||
30 | import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) | ||
31 | import GHC.Word (Word8, Word32) | ||
32 | import qualified Data.Text.Array as A | ||
33 | import Data.Word (Word8, Word16) | ||
34 | import System.IO.Unsafe (unsafePerformIO) | ||
35 | import qualified Control.Exception as Exc | ||
36 | import Data.Bits ((.&.), (.|.), shiftL) | ||
37 | import Data.Typeable | ||
38 | import Control.Arrow (first) | ||
39 | import Data.Maybe (catMaybes) | ||
40 | #include "pipes_text_cbits.h" | ||
41 | |||
42 | |||
43 | |||
44 | -- | A stream oriented decoding result. | ||
45 | data Decoding = Some Text ByteString (ByteString -> Decoding) | ||
46 | | Other Text ByteString | ||
47 | instance Show Decoding where | ||
48 | showsPrec d (Some t bs _) = showParen (d > prec) $ | ||
49 | showString "Some " . showsPrec prec' t . | ||
50 | showChar ' ' . showsPrec prec' bs . | ||
51 | showString " _" | ||
52 | where prec = 10; prec' = prec + 1 | ||
53 | showsPrec d (Other t bs) = showParen (d > prec) $ | ||
54 | showString "Other " . showsPrec prec' t . | ||
55 | showChar ' ' . showsPrec prec' bs . | ||
56 | showString " _" | ||
57 | where prec = 10; prec' = prec + 1 | ||
58 | |||
59 | newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) | ||
60 | newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) | ||
61 | |||
62 | streamDecodeUtf8 :: ByteString -> Decoding | ||
63 | streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 | ||
64 | where | ||
65 | decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding | ||
66 | decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = | ||
67 | runST $ do marray <- A.new (len+1) | ||
68 | unsafeIOToST (decodeChunkToBuffer marray) | ||
69 | where | ||
70 | decodeChunkToBuffer :: A.MArray s -> IO Decoding | ||
71 | decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> | ||
72 | with (0::CSize) $ \destOffPtr -> | ||
73 | with codepoint0 $ \codepointPtr -> | ||
74 | with state0 $ \statePtr -> | ||
75 | with nullPtr $ \curPtrPtr -> | ||
76 | do let end = ptr `plusPtr` (off + len) | ||
77 | curPtr = ptr `plusPtr` off | ||
78 | poke curPtrPtr curPtr | ||
79 | c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr | ||
80 | state <- peek statePtr | ||
81 | lastPtr <- peek curPtrPtr | ||
82 | codepoint <- peek codepointPtr | ||
83 | n <- peek destOffPtr | ||
84 | chunkText <- mkText dest n | ||
85 | let left = lastPtr `minusPtr` curPtr | ||
86 | remaining = B.drop left bs | ||
87 | accum = if T.null chunkText then B.append old remaining else remaining | ||
88 | return $! case state of | ||
89 | UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error | ||
90 | _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state) | ||
91 | {-# INLINE decodeChunkToBuffer #-} | ||
92 | {-# INLINE decodeChunkUtf8 #-} | ||
93 | {-# INLINE streamDecodeUtf8 #-} | ||
94 | |||
95 | decodeSomeUtf8 :: ByteString -> (Text, ByteString) | ||
96 | decodeSomeUtf8 bs@(PS fp off len) = runST $ do | ||
97 | dest <- A.new (len+1) | ||
98 | unsafeIOToST $ | ||
99 | withForeignPtr fp $ \ptr -> | ||
100 | with (0::CSize) $ \destOffPtr -> | ||
101 | with (0::CodePoint) $ \codepointPtr -> | ||
102 | with (0::DecoderState) $ \statePtr -> | ||
103 | with nullPtr $ \curPtrPtr -> | ||
104 | do let end = ptr `plusPtr` (off + len) | ||
105 | curPtr = ptr `plusPtr` off | ||
106 | poke curPtrPtr curPtr | ||
107 | c_decode_utf8_with_state (A.maBA dest) destOffPtr | ||
108 | curPtrPtr end codepointPtr statePtr | ||
109 | state <- peek statePtr | ||
110 | lastPtr <- peek curPtrPtr | ||
111 | codepoint <- peek codepointPtr | ||
112 | n <- peek destOffPtr | ||
113 | chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
114 | return $! textP arr 0 (fromIntegral n) | ||
115 | let left = lastPtr `minusPtr` curPtr | ||
116 | remaining = B.drop left bs | ||
117 | return $! (chunkText, remaining) | ||
118 | {-# INLINE decodeSomeUtf8 #-} | ||
119 | |||
120 | mkText :: A.MArray s -> CSize -> IO Text | ||
121 | mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest | ||
122 | return $! textP arr 0 (fromIntegral n) | ||
123 | {-# INLINE mkText #-} | ||
124 | |||
125 | ord :: Char -> Int | ||
126 | ord (C# c#) = I# (ord# c#) | ||
127 | {-# INLINE ord #-} | ||
128 | |||
129 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int | ||
130 | unsafeWrite marr i c | ||
131 | | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n) | ||
132 | return 1 | ||
133 | | otherwise = do A.unsafeWrite marr i lo | ||
134 | A.unsafeWrite marr (i+1) hi | ||
135 | return 2 | ||
136 | where n = ord c | ||
137 | m = n - 0x10000 | ||
138 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 | ||
139 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 | ||
140 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) | ||
141 | {-# INLINE shiftR #-} | ||
142 | {-# INLINE unsafeWrite #-} | ||
143 | |||
144 | foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state | ||
145 | :: MutableByteArray# s -> Ptr CSize | ||
146 | -> Ptr (Ptr Word8) -> Ptr Word8 | ||
147 | -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file | ||