renamed fold to foldChars and began updating documentation

author: michaelt <what_is_it_to_do_anything@yahoo.com> 2014-01-25 21:42:54 -0500
committer: michaelt <what_is_it_to_do_anything@yahoo.com> 2014-01-25 21:42:54 -0500
commit: 64e03122e6ecc4898cb1b193cdcf3b26d3e71b14 (patch)
tree: 63b707950efcd92db00ac6979b792b9f30627e06 /Pipes/Text
parent: 7ded3267a3b62ff896ea22262549f9511273c45f (diff)
download: text-pipes-64e03122e6ecc4898cb1b193cdcf3b26d3e71b14.tar.gz
text-pipes-64e03122e6ecc4898cb1b193cdcf3b26d3e71b14.tar.zst
text-pipes-64e03122e6ecc4898cb1b193cdcf3b26d3e71b14.zip
2 files changed, 239 insertions, 43 deletions
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
index 7e5b044..76c2f4f 100644
--- a/Pipes/Text/Internal.hs
+++ b/Pipes/Text/Internal.hs
@@ -1,5 +1,7 @@
-{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
+{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
-    UnliftedFFITypes #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
+{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
 -- This module lifts assorted materials from Brian O'Sullivan's text package 
 -- especially Data.Text.Encoding in order to define a pipes-appropriate
 -- streamDecodeUtf8
@@ -7,13 +9,20 @@ module Pipes.Text.Internal
    ( Decoding(..)
    , streamDecodeUtf8
    , decodeSomeUtf8
+    , Codec(..)
+    , TextException(..)
+    , utf8
    ) where
 import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 import Control.Monad.ST (ST, runST)
 import Data.Bits ((.&.))
 import Data.ByteString as B 
+import Data.ByteString (ByteString)
 import Data.ByteString.Internal as B 
-import qualified Data.Text as T (null)
+import Data.ByteString.Char8 as B8
+import Data.Text (Text)
+import qualified Data.Text as T 
+import qualified Data.Text.Encoding as TE 
 import Data.Text.Encoding.Error ()
 import Data.Text.Internal (Text, textP)
 import Foreign.C.Types (CSize)
@@ -24,9 +33,226 @@ import Foreign.Storable (Storable, peek, poke)
 import GHC.Base  (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
 import GHC.Word (Word8, Word32)
 import qualified Data.Text.Array as A
-    
+import Data.Word (Word8, Word16)
+import System.IO.Unsafe (unsafePerformIO)
+import qualified Control.Exception as Exc
+import Data.Bits ((.&.), (.|.), shiftL)
+import Data.Typeable
+import Control.Arrow (first)
+import Data.Maybe (catMaybes)
 #include "pipes_text_cbits.h"
+-- | A specific character encoding.
+--
+-- Since 0.3.0
+data Codec = Codec
+  { codecName :: Text
+  , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
+  , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
+  }
+instance Show Codec where
+    showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c)
+-- Since 0.3.0
+data TextException = DecodeException Codec Word8
+                   | EncodeException Codec Char
+                   | LengthExceeded Int
+                   | TextException Exc.SomeException
+    deriving (Show, Typeable)
+instance Exc.Exception TextException
+toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
+           -> (ByteString -> Decoding)
+toDecoding op = loop B.empty where
+  loop extra bs0 = case op (B.append extra bs0) of
+                    (txt, Right bs) -> Some txt bs (loop bs)
+                    (txt, Left (_,bs)) -> Other txt bs
+splitSlowly :: (ByteString -> Text)
+            -> ByteString 
+            -> (Text, Either (TextException, ByteString) ByteString)
+splitSlowly dec bytes = valid where
+    valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
+    splits 0 = [(B.empty, bytes)]
+    splits n = B.splitAt n bytes : splits (n - 1)
+    decFirst (a, b) = case tryEvaluate (dec a) of
+        Left _ -> Nothing
+        Right text -> let trouble = case tryEvaluate (dec b) of
+                            Left exc -> Left (TextException exc, b)
+                            Right _  -> Right B.empty 
+                      in Just (text, trouble)
+                                      -- this case shouldn't occur, 
+                                      -- since splitSlowly is only called
+                                      -- when parsing failed somewhere
+utf8 :: Codec
+utf8 = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-8"
+    enc text = (TE.encodeUtf8 text, Nothing)
+    dec bytes = case decodeSomeUtf8 bytes of 
+      (t,b) -> (t, Right b)
+--     -- Whether the given byte is a continuation byte.
+--     isContinuation byte = byte .&. 0xC0 == 0x80
+-- 
+--     -- The number of continuation bytes needed by the given
+--     -- non-continuation byte. Returns -1 for an illegal UTF-8
+--     -- non-continuation byte and the whole split quickly must fail so
+--     -- as the input is passed to TE.decodeUtf8, which will issue a
+--     -- suitable error.
+--     required x0
+--         | x0 .&. 0x80 == 0x00 = 0
+--         | x0 .&. 0xE0 == 0xC0 = 1
+--         | x0 .&. 0xF0 == 0xE0 = 2
+--         | x0 .&. 0xF8 == 0xF0 = 3
+--         | otherwise           = -1
+-- 
+--     splitQuickly bytes
+--         | B.null l || req == -1 = Nothing
+--         | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
+--         | otherwise = Just (TE.decodeUtf8 l', r')
+--       where
+--         (l, r) = B.spanEnd isContinuation bytes
+--         req = required (B.last l)
+--         l' = B.init l
+--         r' = B.cons (B.last l) r
+-- |
+-- Since 0.3.0
+utf16_le :: Codec
+utf16_le = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-16-LE"
+    enc text = (TE.encodeUtf16LE text, Nothing)
+    dec bytes = case splitQuickly bytes of
+        Just (text, extra) -> (text, Right extra)
+        Nothing -> splitSlowly TE.decodeUtf16LE bytes
+    splitQuickly bytes = maybeDecode (loop 0) where
+        maxN = B.length bytes
+        loop n |  n      == maxN = decodeAll
+               | (n + 1) == maxN = decodeTo n
+        loop n = let
+            req = utf16Required
+                (B.index bytes n)
+                (B.index bytes (n + 1))
+            decodeMore = loop $! n + req
+            in if n + req > maxN
+                then decodeTo n
+                else decodeMore
+        decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
+        decodeAll = (TE.decodeUtf16LE bytes, B.empty)
+-- |
+-- Since 0.3.0
+utf16_be :: Codec
+utf16_be = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-16-BE"
+    enc text = (TE.encodeUtf16BE text, Nothing)
+    dec bytes = case splitQuickly bytes of
+        Just (text, extra) -> (text, Right extra)
+        Nothing -> splitSlowly TE.decodeUtf16BE bytes
+    splitQuickly bytes = maybeDecode (loop 0) where
+        maxN = B.length bytes
+        loop n |  n      == maxN = decodeAll
+               | (n + 1) == maxN = decodeTo n
+        loop n = let
+            req = utf16Required
+                (B.index bytes (n + 1))
+                (B.index bytes n)
+            decodeMore = loop $! n + req
+            in if n + req > maxN
+                then decodeTo n
+                else decodeMore
+        decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
+        decodeAll = (TE.decodeUtf16BE bytes, B.empty)
+utf16Required :: Word8 -> Word8 -> Int
+utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
+    x :: Word16
+    x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
+-- |
+-- Since 0.3.0
+utf32_le :: Codec
+utf32_le = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-32-LE"
+    enc text = (TE.encodeUtf32LE text, Nothing)
+    dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
+        Just (text, extra) -> (text, Right extra)
+        Nothing -> splitSlowly TE.decodeUtf32LE bs
+-- |
+-- Since 0.3.0
+utf32_be :: Codec
+utf32_be = Codec name enc (toDecoding dec) where
+    name = T.pack "UTF-32-BE"
+    enc text = (TE.encodeUtf32BE text, Nothing)
+    dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
+        Just (text, extra) -> (text, Right extra)
+        Nothing -> splitSlowly TE.decodeUtf32BE bs
+utf32SplitBytes :: (ByteString -> Text)
+                -> ByteString
+                -> Maybe (Text, ByteString)
+utf32SplitBytes dec bytes = split where
+    split = maybeDecode (dec toDecode, extra)
+    len = B.length bytes
+    lenExtra = mod len 4
+    lenToDecode = len - lenExtra
+    (toDecode, extra) = if lenExtra == 0
+        then (bytes, B.empty)
+        else B.splitAt lenToDecode bytes
+-- |
+-- Since 0.3.0
+ascii :: Codec
+ascii = Codec name enc (toDecoding dec) where
+    name = T.pack "ASCII"
+    enc text = (bytes, extra) where
+        (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
+        bytes = B8.pack (T.unpack safe)
+        extra = if T.null unsafe
+            then Nothing
+            else Just (EncodeException ascii (T.head unsafe), unsafe)
+    dec bytes = (text, extra) where
+        (safe, unsafe) = B.span (<= 0x7F) bytes
+        text = T.pack (B8.unpack safe)
+        extra = if B.null unsafe
+            then Right B.empty
+            else Left (DecodeException ascii (B.head unsafe), unsafe)
+-- |
+-- Since 0.3.0
+iso8859_1 :: Codec
+iso8859_1 = Codec name enc (toDecoding dec) where
+    name = T.pack "ISO-8859-1"
+    enc text = (bytes, extra) where
+        (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
+        bytes = B8.pack (T.unpack safe)
+        extra = if T.null unsafe
+            then Nothing
+            else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
+    dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
+tryEvaluate :: a -> Either Exc.SomeException a
+tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
+maybeDecode :: (a, b) -> Maybe (a, b)
+maybeDecode (a, b) = case tryEvaluate a of
+    Left _ -> Nothing
+    Right _ -> Just (a, b)
 -- | A stream oriented decoding result.
 data Decoding = Some Text ByteString (ByteString -> Decoding)
              | Other Text ByteString
@@ -103,36 +329,6 @@ decodeSomeUtf8 bs@(PS fp off len) = runST $ do
          return $! (chunkText, remaining)
 {-# INLINE decodeSomeUtf8 #-}
-- decodeSomeUtf8 :: ByteString -> (Text, ByteString)
-- decodeSomeUtf8 bs@(PS fp off len) = 
--                   runST $ do marray <- A.new (len+1) 
--                              unsafeIOToST (decodeChunkToBuffer marray)
--   
--      where
--      decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString)
--      decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
--        with (0::CSize)        $ \destOffPtr ->
--        with (0::CodePoint)    $ \codepointPtr ->
--        with (0::DecoderState) $ \statePtr ->
--        with nullPtr           $ \curPtrPtr ->
--          do let end = ptr `plusPtr` (off + len)
--                 curPtr = ptr `plusPtr` off
--             poke curPtrPtr curPtr
--             c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
--             state <- peek statePtr
--             lastPtr <- peek curPtrPtr
--             codepoint <- peek codepointPtr
--             n <- peek destOffPtr
--             chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
--                                            return $! textP arr 0 (fromIntegral n)
--             let left      = lastPtr `minusPtr` curPtr
--                 remaining = B.drop left bs
--             return $! (chunkText, remaining)
--      {-# INLINE decodeChunkToBuffer #-}
-- {-# INLINE decodeSomeUtf8 #-}
 mkText :: A.MArray s -> CSize -> IO Text
 mkText dest n =  unsafeSTToIO $ do arr <- A.unsafeFreeze dest
                                   return $! textP arr 0 (fromIntegral n)
diff --git a/Pipes/Text/Parse.hs b/Pipes/Text/Parse.hs
index ed0afa1..9cabaa6 100644
--- a/Pipes/Text/Parse.hs
+++ b/Pipes/Text/Parse.hs
@@ -44,16 +44,16 @@ nextChar = go
 {-| Draw one 'Char' from the underlying 'Producer', returning 'Left' if the
    'Producer' is empty
 -}
-drawChar :: (Monad m) => StateT (Producer Text m r) m (Either r Char)
+drawChar :: (Monad m) => StateT (Producer Text m r) m (Maybe Char)
 drawChar = do
    x <- PP.draw
    case x of
-        Left  r  -> return (Left r)
+        Nothing  -> return Nothing
-        Right txt -> case (T.uncons txt) of
+        Just txt -> case (T.uncons txt) of
            Nothing        -> drawChar
            Just (c, txt') -> do
                PP.unDraw txt'
-                return (Right c)
+                return (Just c)
 {-# INLINABLE drawChar #-}
 -- | Push back a 'Char' onto the underlying 'Producer'
@@ -71,12 +71,12 @@ unDrawChar c = modify (yield (T.singleton c) >>)
 >         Right c -> unDrawChar c
 >     return x
 -}
-peekChar :: (Monad m) => StateT (Producer Text m r) m (Either r Char)
+peekChar :: (Monad m) => StateT (Producer Text m r) m (Maybe Char)
 peekChar = do
    x <- drawChar
    case x of
-        Left  _  -> return ()
+        Nothing  -> return ()
-        Right c -> unDrawChar c
+        Just c -> unDrawChar c
    return x
 {-# INLINABLE peekChar #-}
@@ -91,8 +91,8 @@ isEndOfChars :: (Monad m) => StateT (Producer Text m r) m Bool
 isEndOfChars = do
    x <- peekChar
    return (case x of
-        Left  _ -> True
+        Nothing -> True
-        Right _ -> False )
+        Just _-> False )
 {-# INLINABLE isEndOfChars #-}
 {-| @(take n)@ only allows @n@ characters to pass
author	michaelt <what_is_it_to_do_anything@yahoo.com>	2014-01-25 21:42:54 -0500
committer	michaelt <what_is_it_to_do_anything@yahoo.com>	2014-01-25 21:42:54 -0500
commit	64e03122e6ecc4898cb1b193cdcf3b26d3e71b14 (patch)
tree	63b707950efcd92db00ac6979b792b9f30627e06 /Pipes/Text
parent	7ded3267a3b62ff896ea22262549f9511273c45f (diff)
download	text-pipes-64e03122e6ecc4898cb1b193cdcf3b26d3e71b14.tar.gz text-pipes-64e03122e6ecc4898cb1b193cdcf3b26d3e71b14.tar.zst text-pipes-64e03122e6ecc4898cb1b193cdcf3b26d3e71b14.zip

diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs index 7e5b044..76c2f4f 100644 --- a/Pipes/Text/Internal.hs +++ b/Pipes/Text/Internal.hs
@@ -1,5 +1,7 @@
1	{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,	1	{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2	UnliftedFFITypes #-}	2	{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
		3	{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
		4
3	-- This module lifts assorted materials from Brian O'Sullivan's text package	5	-- This module lifts assorted materials from Brian O'Sullivan's text package
4	-- especially Data.Text.Encoding in order to define a pipes-appropriate	6	-- especially Data.Text.Encoding in order to define a pipes-appropriate
5	-- streamDecodeUtf8	7	-- streamDecodeUtf8
@@ -7,13 +9,20 @@ module Pipes.Text.Internal
7	( Decoding(..)	9	( Decoding(..)
8	, streamDecodeUtf8	10	, streamDecodeUtf8
9	, decodeSomeUtf8	11	, decodeSomeUtf8
		12	, Codec(..)
		13	, TextException(..)
		14	, utf8
10	) where	15	) where
11	import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)	16	import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
12	import Control.Monad.ST (ST, runST)	17	import Control.Monad.ST (ST, runST)
13	import Data.Bits ((.&.))	18	import Data.Bits ((.&.))
14	import Data.ByteString as B	19	import Data.ByteString as B
		20	import Data.ByteString (ByteString)
15	import Data.ByteString.Internal as B	21	import Data.ByteString.Internal as B
16	import qualified Data.Text as T (null)	22	import Data.ByteString.Char8 as B8
		23	import Data.Text (Text)
		24	import qualified Data.Text as T
		25	import qualified Data.Text.Encoding as TE
17	import Data.Text.Encoding.Error ()	26	import Data.Text.Encoding.Error ()
18	import Data.Text.Internal (Text, textP)	27	import Data.Text.Internal (Text, textP)
19	import Foreign.C.Types (CSize)	28	import Foreign.C.Types (CSize)
@@ -24,9 +33,226 @@ import Foreign.Storable (Storable, peek, poke)
24	import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)	33	import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
25	import GHC.Word (Word8, Word32)	34	import GHC.Word (Word8, Word32)
26	import qualified Data.Text.Array as A	35	import qualified Data.Text.Array as A
27		36	import Data.Word (Word8, Word16)
		37	import System.IO.Unsafe (unsafePerformIO)
		38	import qualified Control.Exception as Exc
		39	import Data.Bits ((.&.), (.|.), shiftL)
		40	import Data.Typeable
		41	import Control.Arrow (first)
		42	import Data.Maybe (catMaybes)
28	#include "pipes_text_cbits.h"	43	#include "pipes_text_cbits.h"
29		44
		45
		46	-- | A specific character encoding.
		47	--
		48	-- Since 0.3.0
		49	data Codec = Codec
		50	{ codecName :: Text
		51	, codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
		52	, codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
		53	}
		54
		55	instance Show Codec where
		56	showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c)
		57
		58	-- Since 0.3.0
		59	data TextException = DecodeException Codec Word8
		60	| EncodeException Codec Char
		61	| LengthExceeded Int
		62	| TextException Exc.SomeException
		63	deriving (Show, Typeable)
		64	instance Exc.Exception TextException
		65
		66	toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
		67	-> (ByteString -> Decoding)
		68	toDecoding op = loop B.empty where
		69	loop extra bs0 = case op (B.append extra bs0) of
		70	(txt, Right bs) -> Some txt bs (loop bs)
		71	(txt, Left (_,bs)) -> Other txt bs
		72
		73
		74	splitSlowly :: (ByteString -> Text)
		75	-> ByteString
		76	-> (Text, Either (TextException, ByteString) ByteString)
		77	splitSlowly dec bytes = valid where
		78	valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
		79	splits 0 = [(B.empty, bytes)]
		80	splits n = B.splitAt n bytes : splits (n - 1)
		81	decFirst (a, b) = case tryEvaluate (dec a) of
		82	Left _ -> Nothing
		83	Right text -> let trouble = case tryEvaluate (dec b) of
		84	Left exc -> Left (TextException exc, b)
		85	Right _ -> Right B.empty
		86	in Just (text, trouble)
		87	-- this case shouldn't occur,
		88	-- since splitSlowly is only called
		89	-- when parsing failed somewhere
		90
		91	utf8 :: Codec
		92	utf8 = Codec name enc (toDecoding dec) where
		93	name = T.pack "UTF-8"
		94	enc text = (TE.encodeUtf8 text, Nothing)
		95	dec bytes = case decodeSomeUtf8 bytes of
		96	(t,b) -> (t, Right b)
		97
		98	-- -- Whether the given byte is a continuation byte.
		99	-- isContinuation byte = byte .&. 0xC0 == 0x80
		100	--
		101	-- -- The number of continuation bytes needed by the given
		102	-- -- non-continuation byte. Returns -1 for an illegal UTF-8
		103	-- -- non-continuation byte and the whole split quickly must fail so
		104	-- -- as the input is passed to TE.decodeUtf8, which will issue a
		105	-- -- suitable error.
		106	-- required x0
		107	-- | x0 .&. 0x80 == 0x00 = 0
		108	-- | x0 .&. 0xE0 == 0xC0 = 1
		109	-- | x0 .&. 0xF0 == 0xE0 = 2
		110	-- | x0 .&. 0xF8 == 0xF0 = 3
		111	-- | otherwise = -1
		112	--
		113	-- splitQuickly bytes
		114	-- | B.null l || req == -1 = Nothing
		115	-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
		116	-- | otherwise = Just (TE.decodeUtf8 l', r')
		117	-- where
		118	-- (l, r) = B.spanEnd isContinuation bytes
		119	-- req = required (B.last l)
		120	-- l' = B.init l
		121	-- r' = B.cons (B.last l) r
		122
		123	-- |
		124	-- Since 0.3.0
		125	utf16_le :: Codec
		126	utf16_le = Codec name enc (toDecoding dec) where
		127	name = T.pack "UTF-16-LE"
		128	enc text = (TE.encodeUtf16LE text, Nothing)
		129	dec bytes = case splitQuickly bytes of
		130	Just (text, extra) -> (text, Right extra)
		131	Nothing -> splitSlowly TE.decodeUtf16LE bytes
		132
		133	splitQuickly bytes = maybeDecode (loop 0) where
		134	maxN = B.length bytes
		135
		136	loop n | n == maxN = decodeAll
		137	| (n + 1) == maxN = decodeTo n
		138	loop n = let
		139	req = utf16Required
		140	(B.index bytes n)
		141	(B.index bytes (n + 1))
		142	decodeMore = loop $! n + req
		143	in if n + req > maxN
		144	then decodeTo n
		145	else decodeMore
		146
		147	decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
		148	decodeAll = (TE.decodeUtf16LE bytes, B.empty)
		149
		150	-- |
		151	-- Since 0.3.0
		152	utf16_be :: Codec
		153	utf16_be = Codec name enc (toDecoding dec) where
		154	name = T.pack "UTF-16-BE"
		155	enc text = (TE.encodeUtf16BE text, Nothing)
		156	dec bytes = case splitQuickly bytes of
		157	Just (text, extra) -> (text, Right extra)
		158	Nothing -> splitSlowly TE.decodeUtf16BE bytes
		159
		160	splitQuickly bytes = maybeDecode (loop 0) where
		161	maxN = B.length bytes
		162
		163	loop n | n == maxN = decodeAll
		164	| (n + 1) == maxN = decodeTo n
		165	loop n = let
		166	req = utf16Required
		167	(B.index bytes (n + 1))
		168	(B.index bytes n)
		169	decodeMore = loop $! n + req
		170	in if n + req > maxN
		171	then decodeTo n
		172	else decodeMore
		173
		174	decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
		175	decodeAll = (TE.decodeUtf16BE bytes, B.empty)
		176
		177	utf16Required :: Word8 -> Word8 -> Int
		178	utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
		179	x :: Word16
		180	x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
		181
		182	-- |
		183	-- Since 0.3.0
		184	utf32_le :: Codec
		185	utf32_le = Codec name enc (toDecoding dec) where
		186	name = T.pack "UTF-32-LE"
		187	enc text = (TE.encodeUtf32LE text, Nothing)
		188	dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
		189	Just (text, extra) -> (text, Right extra)
		190	Nothing -> splitSlowly TE.decodeUtf32LE bs
		191
		192	-- |
		193	-- Since 0.3.0
		194	utf32_be :: Codec
		195	utf32_be = Codec name enc (toDecoding dec) where
		196	name = T.pack "UTF-32-BE"
		197	enc text = (TE.encodeUtf32BE text, Nothing)
		198	dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
		199	Just (text, extra) -> (text, Right extra)
		200	Nothing -> splitSlowly TE.decodeUtf32BE bs
		201
		202	utf32SplitBytes :: (ByteString -> Text)
		203	-> ByteString
		204	-> Maybe (Text, ByteString)
		205	utf32SplitBytes dec bytes = split where
		206	split = maybeDecode (dec toDecode, extra)
		207	len = B.length bytes
		208	lenExtra = mod len 4
		209
		210	lenToDecode = len - lenExtra
		211	(toDecode, extra) = if lenExtra == 0
		212	then (bytes, B.empty)
		213	else B.splitAt lenToDecode bytes
		214
		215	-- |
		216	-- Since 0.3.0
		217	ascii :: Codec
		218	ascii = Codec name enc (toDecoding dec) where
		219	name = T.pack "ASCII"
		220	enc text = (bytes, extra) where
		221	(safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
		222	bytes = B8.pack (T.unpack safe)
		223	extra = if T.null unsafe
		224	then Nothing
		225	else Just (EncodeException ascii (T.head unsafe), unsafe)
		226
		227	dec bytes = (text, extra) where
		228	(safe, unsafe) = B.span (<= 0x7F) bytes
		229	text = T.pack (B8.unpack safe)
		230	extra = if B.null unsafe
		231	then Right B.empty
		232	else Left (DecodeException ascii (B.head unsafe), unsafe)
		233
		234	-- |
		235	-- Since 0.3.0
		236	iso8859_1 :: Codec
		237	iso8859_1 = Codec name enc (toDecoding dec) where
		238	name = T.pack "ISO-8859-1"
		239	enc text = (bytes, extra) where
		240	(safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
		241	bytes = B8.pack (T.unpack safe)
		242	extra = if T.null unsafe
		243	then Nothing
		244	else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
		245
		246	dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
		247
		248	tryEvaluate :: a -> Either Exc.SomeException a
		249	tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
		250
		251	maybeDecode :: (a, b) -> Maybe (a, b)
		252	maybeDecode (a, b) = case tryEvaluate a of
		253	Left _ -> Nothing
		254	Right _ -> Just (a, b)
		255
30	-- | A stream oriented decoding result.	256	-- | A stream oriented decoding result.
31	data Decoding = Some Text ByteString (ByteString -> Decoding)	257	data Decoding = Some Text ByteString (ByteString -> Decoding)
32	| Other Text ByteString	258	| Other Text ByteString
@@ -103,36 +329,6 @@ decodeSomeUtf8 bs@(PS fp off len) = runST $ do
103	return $! (chunkText, remaining)	329	return $! (chunkText, remaining)
104	{-# INLINE decodeSomeUtf8 #-}	330	{-# INLINE decodeSomeUtf8 #-}
105		331
106	-- decodeSomeUtf8 :: ByteString -> (Text, ByteString)
107	-- decodeSomeUtf8 bs@(PS fp off len) =
108	-- runST $ do marray <- A.new (len+1)
109	-- unsafeIOToST (decodeChunkToBuffer marray)
110	--
111	-- where
112	-- decodeChunkToBuffer :: A.MArray s -> IO (Text, ByteString)
113	-- decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
114	-- with (0::CSize) $ \destOffPtr ->
115	-- with (0::CodePoint) $ \codepointPtr ->
116	-- with (0::DecoderState) $ \statePtr ->
117	-- with nullPtr $ \curPtrPtr ->
118	-- do let end = ptr `plusPtr` (off + len)
119	-- curPtr = ptr `plusPtr` off
120	-- poke curPtrPtr curPtr
121	-- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
122	-- state <- peek statePtr
123	-- lastPtr <- peek curPtrPtr
124	-- codepoint <- peek codepointPtr
125	-- n <- peek destOffPtr
126	-- chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
127	-- return $! textP arr 0 (fromIntegral n)
128	-- let left = lastPtr `minusPtr` curPtr
129	-- remaining = B.drop left bs
130	-- return $! (chunkText, remaining)
131	-- {-# INLINE decodeChunkToBuffer #-}
132	-- {-# INLINE decodeSomeUtf8 #-}
133
134
135
136	mkText :: A.MArray s -> CSize -> IO Text	332	mkText :: A.MArray s -> CSize -> IO Text
137	mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest	333	mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
138	return $! textP arr 0 (fromIntegral n)	334	return $! textP arr 0 (fromIntegral n)


diff --git a/Pipes/Text/Parse.hs b/Pipes/Text/Parse.hs index ed0afa1..9cabaa6 100644 --- a/Pipes/Text/Parse.hs +++ b/Pipes/Text/Parse.hs
@@ -44,16 +44,16 @@ nextChar = go
44	{-| Draw one 'Char' from the underlying 'Producer', returning 'Left' if the	44	{-| Draw one 'Char' from the underlying 'Producer', returning 'Left' if the
45	'Producer' is empty	45	'Producer' is empty
46	-}	46	-}
47	drawChar :: (Monad m) => StateT (Producer Text m r) m (Either r Char)	47	drawChar :: (Monad m) => StateT (Producer Text m r) m (Maybe Char)
48	drawChar = do	48	drawChar = do
49	x <- PP.draw	49	x <- PP.draw
50	case x of	50	case x of
51	Left r -> return (Left r)	51	Nothing -> return Nothing
52	Right txt -> case (T.uncons txt) of	52	Just txt -> case (T.uncons txt) of
53	Nothing -> drawChar	53	Nothing -> drawChar
54	Just (c, txt') -> do	54	Just (c, txt') -> do
55	PP.unDraw txt'	55	PP.unDraw txt'
56	return (Right c)	56	return (Just c)
57	{-# INLINABLE drawChar #-}	57	{-# INLINABLE drawChar #-}
58		58
59	-- | Push back a 'Char' onto the underlying 'Producer'	59	-- | Push back a 'Char' onto the underlying 'Producer'
@@ -71,12 +71,12 @@ unDrawChar c = modify (yield (T.singleton c) >>)
71	> Right c -> unDrawChar c	71	> Right c -> unDrawChar c
72	> return x	72	> return x
73	-}	73	-}
74	peekChar :: (Monad m) => StateT (Producer Text m r) m (Either r Char)	74	peekChar :: (Monad m) => StateT (Producer Text m r) m (Maybe Char)
75	peekChar = do	75	peekChar = do
76	x <- drawChar	76	x <- drawChar
77	case x of	77	case x of
78	Left _ -> return ()	78	Nothing -> return ()
79	Right c -> unDrawChar c	79	Just c -> unDrawChar c
80	return x	80	return x
81	{-# INLINABLE peekChar #-}	81	{-# INLINABLE peekChar #-}
82		82
@@ -91,8 +91,8 @@ isEndOfChars :: (Monad m) => StateT (Producer Text m r) m Bool
91	isEndOfChars = do	91	isEndOfChars = do
92	x <- peekChar	92	x <- peekChar
93	return (case x of	93	return (case x of
94	Left _ -> True	94	Nothing -> True
95	Right _ -> False )	95	Just _-> False )
96	{-# INLINABLE isEndOfChars #-}	96	{-# INLINABLE isEndOfChars #-}
97		97
98	{-| @(take n)@ only allows @n@ characters to pass	98	{-| @(take n)@ only allows @n@ characters to pass