aboutsummaryrefslogtreecommitdiffhomepage
path: root/Pipes/Text
diff options
context:
space:
mode:
Diffstat (limited to 'Pipes/Text')
-rw-r--r--Pipes/Text/Internal.hs264
-rw-r--r--Pipes/Text/Parse.hs18
2 files changed, 239 insertions, 43 deletions
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
index 7e5b044..76c2f4f 100644
--- a/Pipes/Text/Internal.hs
+++ b/Pipes/Text/Internal.hs
@@ -1,5 +1,7 @@
1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, 1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2 UnliftedFFITypes #-} 2{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
3{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
4
3-- This module lifts assorted materials from Brian O'Sullivan's text package 5-- This module lifts assorted materials from Brian O'Sullivan's text package
4-- especially Data.Text.Encoding in order to define a pipes-appropriate 6-- especially Data.Text.Encoding in order to define a pipes-appropriate
5-- streamDecodeUtf8 7-- streamDecodeUtf8
@@ -7,13 +9,20 @@ module Pipes.Text.Internal
7 ( Decoding(..) 9 ( Decoding(..)
8 , streamDecodeUtf8 10 , streamDecodeUtf8
9 , decodeSomeUtf8 11 , decodeSomeUtf8
12 , Codec(..)
13 , TextException(..)
14 , utf8
10 ) where 15 ) where
11import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) 16import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
12import Control.Monad.ST (ST, runST) 17import Control.Monad.ST (ST, runST)
13import Data.Bits ((.&.)) 18import Data.Bits ((.&.))
14import Data.ByteString as B 19import Data.ByteString as B
20import Data.ByteString (ByteString)
15import Data.ByteString.Internal as B 21import Data.ByteString.Internal as B
16import qualified Data.Text as T (null) 22import Data.ByteString.Char8 as B8
23import Data.Text (Text)
24import qualified Data.Text as T
25import qualified Data.Text.Encoding as TE
17import Data.Text.Encoding.Error () 26import Data.Text.Encoding.Error ()
18import Data.Text.Internal (Text, textP) 27import Data.Text.Internal (Text, textP)
19import Foreign.C.Types (CSize) 28import Foreign.C.Types (CSize)
@@ -24,9 +33,226 @@ import Foreign.Storable (Storable, peek, poke)
24import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#) 33import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
25import GHC.Word (Word8, Word32) 34import GHC.Word (Word8, Word32)
26import qualified Data.Text.Array as A 35import qualified Data.Text.Array as A
27 36import Data.Word (Word8, Word16)
37import System.IO.Unsafe (unsafePerformIO)
38import qualified Control.Exception as Exc
39import Data.Bits ((.&.), (.|.), shiftL)
40import Data.Typeable
41import Control.Arrow (first)
42import Data.Maybe (catMaybes)
28#include "pipes_text_cbits.h" 43#include "pipes_text_cbits.h"
29 44
45
46-- | A specific character encoding.
47--
48-- Since 0.3.0
49data Codec = Codec
50 { codecName :: Text
51 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
52 , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
53 }
54
55instance Show Codec where
56 showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c)
57
58-- Since 0.3.0
59data TextException = DecodeException Codec Word8
60 | EncodeException Codec Char
61 | LengthExceeded Int
62 | TextException Exc.SomeException
63 deriving (Show, Typeable)
64instance Exc.Exception TextException
65
66toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
67 -> (ByteString -> Decoding)
68toDecoding op = loop B.empty where
69 loop extra bs0 = case op (B.append extra bs0) of
70 (txt, Right bs) -> Some txt bs (loop bs)
71 (txt, Left (_,bs)) -> Other txt bs
72
73
74splitSlowly :: (ByteString -> Text)
75 -> ByteString
76 -> (Text, Either (TextException, ByteString) ByteString)
77splitSlowly dec bytes = valid where
78 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
79 splits 0 = [(B.empty, bytes)]
80 splits n = B.splitAt n bytes : splits (n - 1)
81 decFirst (a, b) = case tryEvaluate (dec a) of
82 Left _ -> Nothing
83 Right text -> let trouble = case tryEvaluate (dec b) of
84 Left exc -> Left (TextException exc, b)
85 Right _ -> Right B.empty
86 in Just (text, trouble)
87 -- this case shouldn't occur,
88 -- since splitSlowly is only called
89 -- when parsing failed somewhere
90
91utf8 :: Codec
92utf8 = Codec name enc (toDecoding dec) where
93 name = T.pack "UTF-8"
94 enc text = (TE.encodeUtf8 text, Nothing)
95 dec bytes = case decodeSomeUtf8 bytes of
96 (t,b) -> (t, Right b)
97
98-- -- Whether the given byte is a continuation byte.
99-- isContinuation byte = byte .&. 0xC0 == 0x80
100--
101-- -- The number of continuation bytes needed by the given
102-- -- non-continuation byte. Returns -1 for an illegal UTF-8
103-- -- non-continuation byte and the whole split quickly must fail so
104-- -- as the input is passed to TE.decodeUtf8, which will issue a
105-- -- suitable error.
106-- required x0
107-- | x0 .&. 0x80 == 0x00 = 0
108-- | x0 .&. 0xE0 == 0xC0 = 1
109-- | x0 .&. 0xF0 == 0xE0 = 2
110-- | x0 .&. 0xF8 == 0xF0 = 3
111-- | otherwise = -1
112--
113-- splitQuickly bytes
114-- | B.null l || req == -1 = Nothing
115-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
116-- | otherwise = Just (TE.decodeUtf8 l', r')
117-- where
118-- (l, r) = B.spanEnd isContinuation bytes
119-- req = required (B.last l)
120-- l' = B.init l
121-- r' = B.cons (B.last l) r
122
123-- |
124-- Since 0.3.0
125utf16_le :: Codec
126utf16_le = Codec name enc (toDecoding dec) where
127 name = T.pack "UTF-16-LE"
128 enc text = (TE.encodeUtf16LE text, Nothing)
129 dec bytes = case splitQuickly bytes of
130 Just (text, extra) -> (text, Right extra)
131 Nothing -> splitSlowly TE.decodeUtf16LE bytes
132
133 splitQuickly bytes = maybeDecode (loop 0) where
134 maxN = B.length bytes
135
136 loop n | n == maxN = decodeAll
137 | (n + 1) == maxN = decodeTo n
138 loop n = let
139 req = utf16Required
140 (B.index bytes n)
141 (B.index bytes (n + 1))
142 decodeMore = loop $! n + req
143 in if n + req > maxN
144 then decodeTo n
145 else decodeMore
146
147 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
148 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
149
150-- |
151-- Since 0.3.0
152utf16_be :: Codec
153utf16_be = Codec name enc (toDecoding dec) where
154 name = T.pack "UTF-16-BE"
155 enc text = (TE.encodeUtf16BE text, Nothing)
156 dec bytes = case splitQuickly bytes of
157 Just (text, extra) -> (text, Right extra)
158 Nothing -> splitSlowly TE.decodeUtf16BE bytes
159
160 splitQuickly bytes = maybeDecode (loop 0) where
161 maxN = B.length bytes
162
163 loop n | n == maxN = decodeAll
164 | (n + 1) == maxN = decodeTo n
165 loop n = let
166 req = utf16Required
167 (B.index bytes (n + 1))
168 (B.index bytes n)
169 decodeMore = loop $! n + req
170 in if n + req > maxN
171 then decodeTo n
172 else decodeMore
173
174 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
175 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
176
177utf16Required :: Word8 -> Word8 -> Int
178utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
179 x :: Word16
180 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
181
182-- |
183-- Since 0.3.0
184utf32_le :: Codec
185utf32_le = Codec name enc (toDecoding dec) where
186 name = T.pack "UTF-32-LE"
187 enc text = (TE.encodeUtf32LE text, Nothing)
188 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
189 Just (text, extra) -> (text, Right extra)
190 Nothing -> splitSlowly TE.decodeUtf32LE bs
191
192-- |
193-- Since 0.3.0
194utf32_be :: Codec
195utf32_be = Codec name enc (toDecoding dec) where
196 name = T.pack "UTF-32-BE"
197 enc text = (TE.encodeUtf32BE text, Nothing)
198 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
199 Just (text, extra) -> (text, Right extra)
200 Nothing -> splitSlowly TE.decodeUtf32BE bs
201
202utf32SplitBytes :: (ByteString -> Text)
203 -> ByteString
204 -> Maybe (Text, ByteString)
205utf32SplitBytes dec bytes = split where
206 split = maybeDecode (dec toDecode, extra)
207 len = B.length bytes
208 lenExtra = mod len 4
209
210 lenToDecode = len - lenExtra
211 (toDecode, extra) = if lenExtra == 0
212 then (bytes, B.empty)
213 else B.splitAt lenToDecode bytes
214
215-- |
216-- Since 0.3.0
217ascii :: Codec
218ascii = Codec name enc (toDecoding dec) where
219 name = T.pack "ASCII"
220 enc text = (bytes, extra) where
221 (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
222 bytes = B8.pack (T.unpack safe)
223 extra = if T.null unsafe
224 then Nothing
225 else Just (EncodeException ascii (T.head unsafe), unsafe)
226
227 dec bytes = (text, extra) where
228 (safe, unsafe) = B.span (<= 0x7F) bytes
229 text = T.pack (B8.unpack safe)
230 extra = if B.null unsafe
231 then Right B.empty
232 else Left (DecodeException ascii (B.head unsafe), unsafe)
233
234-- |
235-- Since 0.3.0
236iso8859_1 :: Codec
237iso8859_1 = Codec name enc (toDecoding dec) where
238 name = T.pack "ISO-8859-1"
239 enc text = (bytes, extra) where
240 (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
241 bytes = B8.pack (T.unpack safe)
242 extra = if T.null unsafe
243 then Nothing
244 else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
245
246 dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
247
248tryEvaluate :: a -> Either Exc.SomeException a
249tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
250
251maybeDecode :: (a, b) -> Maybe (a, b)
252maybeDecode (a, b) = case tryEvaluate a of
253 Left _ -> Nothing
254 Right _ -> Just (a, b)
255
30-- | A stream oriented decoding result. 256-- | A stream oriented decoding result.
31data Decoding = Some Text ByteString (ByteString -> Decoding) 257data Decoding = Some Text ByteString (ByteString -> Decoding)
32 | Other Text ByteString 258 | Other Text ByteString
@@ -103,36 +329,6 @@ decodeSomeUtf8 bs@(PS fp off len) = runST $ do
103 return $! (chunkText, remaining) 329 return $! (chunkText, remaining)
104{-# INLINE decodeSomeUtf8 #-} 330{-# INLINE decodeSomeUtf8 #-}
105 331
106-- decodeSomeUtf8 :: ByteString -> (Text, ByteString)
107-- decodeSomeUtf8 bs@(PS fp off len) =
108-- runST $ do marray <- A.new (len+1)
109-- unsafeIOToST (decodeChunkToBuffer marray)
110--
111-- where
112-- decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString)
113-- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
114-- with (0::CSize) $ \destOffPtr ->
115-- with (0::CodePoint) $ \codepointPtr ->
116-- with (0::DecoderState) $ \statePtr ->
117-- with nullPtr $ \curPtrPtr ->
118-- do let end = ptr `plusPtr` (off + len)
119-- curPtr = ptr `plusPtr` off
120-- poke curPtrPtr curPtr
121-- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
122-- state <- peek statePtr
123-- lastPtr <- peek curPtrPtr
124-- codepoint <- peek codepointPtr
125-- n <- peek destOffPtr
126-- chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
127-- return $! textP arr 0 (fromIntegral n)
128-- let left = lastPtr `minusPtr` curPtr
129-- remaining = B.drop left bs
130-- return $! (chunkText, remaining)
131-- {-# INLINE decodeChunkToBuffer #-}
132-- {-# INLINE decodeSomeUtf8 #-}
133
134
135
136mkText :: A.MArray s -> CSize -> IO Text 332mkText :: A.MArray s -> CSize -> IO Text
137mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest 333mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
138 return $! textP arr 0 (fromIntegral n) 334 return $! textP arr 0 (fromIntegral n)
diff --git a/Pipes/Text/Parse.hs b/Pipes/Text/Parse.hs
index ed0afa1..9cabaa6 100644
--- a/Pipes/Text/Parse.hs
+++ b/Pipes/Text/Parse.hs
@@ -44,16 +44,16 @@ nextChar = go
44{-| Draw one 'Char' from the underlying 'Producer', returning 'Left' if the 44{-| Draw one 'Char' from the underlying 'Producer', returning 'Left' if the
45 'Producer' is empty 45 'Producer' is empty
46-} 46-}
47drawChar :: (Monad m) => StateT (Producer Text m r) m (Either r Char) 47drawChar :: (Monad m) => StateT (Producer Text m r) m (Maybe Char)
48drawChar = do 48drawChar = do
49 x <- PP.draw 49 x <- PP.draw
50 case x of 50 case x of
51 Left r -> return (Left r) 51 Nothing -> return Nothing
52 Right txt -> case (T.uncons txt) of 52 Just txt -> case (T.uncons txt) of
53 Nothing -> drawChar 53 Nothing -> drawChar
54 Just (c, txt') -> do 54 Just (c, txt') -> do
55 PP.unDraw txt' 55 PP.unDraw txt'
56 return (Right c) 56 return (Just c)
57{-# INLINABLE drawChar #-} 57{-# INLINABLE drawChar #-}
58 58
59-- | Push back a 'Char' onto the underlying 'Producer' 59-- | Push back a 'Char' onto the underlying 'Producer'
@@ -71,12 +71,12 @@ unDrawChar c = modify (yield (T.singleton c) >>)
71> Right c -> unDrawChar c 71> Right c -> unDrawChar c
72> return x 72> return x
73-} 73-}
74peekChar :: (Monad m) => StateT (Producer Text m r) m (Either r Char) 74peekChar :: (Monad m) => StateT (Producer Text m r) m (Maybe Char)
75peekChar = do 75peekChar = do
76 x <- drawChar 76 x <- drawChar
77 case x of 77 case x of
78 Left _ -> return () 78 Nothing -> return ()
79 Right c -> unDrawChar c 79 Just c -> unDrawChar c
80 return x 80 return x
81{-# INLINABLE peekChar #-} 81{-# INLINABLE peekChar #-}
82 82
@@ -91,8 +91,8 @@ isEndOfChars :: (Monad m) => StateT (Producer Text m r) m Bool
91isEndOfChars = do 91isEndOfChars = do
92 x <- peekChar 92 x <- peekChar
93 return (case x of 93 return (case x of
94 Left _ -> True 94 Nothing -> True
95 Right _ -> False ) 95 Just _-> False )
96{-# INLINABLE isEndOfChars #-} 96{-# INLINABLE isEndOfChars #-}
97 97
98{-| @(take n)@ only allows @n@ characters to pass 98{-| @(take n)@ only allows @n@ characters to pass