use new text-stream-decoding

author: michaelt <what_is_it_to_do_anything@yahoo.com> 2014-02-15 10:01:48 -0500
committer: michaelt <what_is_it_to_do_anything@yahoo.com> 2014-02-15 10:01:48 -0500
commit: bbdfd3056da4992e18d3983fd5992bee23af93af (patch)
tree: 5765f3f224708890cf50b400c675c23e759595a2 /Pipes/Text/Internal
parent: fd27d0c43e2995437cf1f3e2d6f292597371ebb0 (diff)
download: text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.gz
text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.zst
text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.zip
2 files changed, 0 insertions, 370 deletions
diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs
deleted file mode 100644
index 075a152..0000000
--- a/Pipes/Text/Internal/Codec.hs
+++ /dev/null
@@ -1,216 +0,0 @@
-{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
-- |
-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
-- License: MIT
--  This Parts of this code were taken from enumerator and conduits, and adapted for pipes
-- This module follows the model of the enumerator and conduits libraries, and defines
-- 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and 
-- iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so
-- special functions appear in @Pipes.Text@
-module Pipes.Text.Internal.Codec
-    ( Codec(..)
-    , TextException(..)
-    , utf8
-    , utf16_le
-    , utf16_be
-    , utf32_le
-    , utf32_be
-    ) where
-import Data.Bits ((.&.))
-import Data.Char (ord)
-import Data.ByteString as B 
-import Data.ByteString (ByteString)
-import Data.ByteString.Internal as B 
-import Data.ByteString.Char8 as B8
-import Data.Text (Text)
-import qualified Data.Text as T 
-import qualified Data.Text.Encoding as TE 
-import Data.Text.Encoding.Error ()
-import GHC.Word (Word8, Word32)
-import qualified Data.Text.Array as A
-import Data.Word (Word8, Word16)
-import System.IO.Unsafe (unsafePerformIO)
-import qualified Control.Exception as Exc
-import Data.Bits ((.&.), (.|.), shiftL)
-import Data.Typeable
-import Control.Arrow (first)
-import Data.Maybe (catMaybes)
-import Pipes.Text.Internal.Decoding
-import Pipes
-- | A specific character encoding.
-data Codec = Codec
-  { codecName :: Text
-  , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
-  , codecDecode :: ByteString -> Decoding 
-  }
-instance Show Codec where
-    showsPrec d c = showParen (d > 10) $ 
-                    showString "Codec " . shows (codecName c)
-data TextException = DecodeException Codec Word8
-                   | EncodeException Codec Char
-                   | LengthExceeded Int
-                   | TextException Exc.SomeException
-    deriving (Show, Typeable)
-instance Exc.Exception TextException
-toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
-           -> (ByteString -> Decoding)
-toDecoding op = loop B.empty where
-  loop !extra bs0 = case op (B.append extra bs0) of
-                      (txt, Right bs) -> Some txt bs (loop bs)
-                      (txt, Left (_,bs)) -> Other txt bs
-- To do: toDecoding should be inlined in each of the 'Codec' definitions
-- or else Codec changed to the conduit/enumerator definition.  We have
-- altered it to use 'streamDecodeUtf8'
-splitSlowly :: (ByteString -> Text)
-            -> ByteString 
-            -> (Text, Either (TextException, ByteString) ByteString)
-splitSlowly dec bytes = valid where
-    valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
-    splits 0 = [(B.empty, bytes)]
-    splits n = B.splitAt n bytes : splits (n - 1)
-    decFirst (a, b) = case tryEvaluate (dec a) of
-        Left _ -> Nothing
-        Right text -> let trouble = case tryEvaluate (dec b) of
-                            Left exc -> Left (TextException exc, b)
-                            Right _  -> Right B.empty 
-                      in Just (text, trouble) -- this case shouldn't occur, 
-                                      -- since splitSlowly is only called
-                                      -- when parsing failed somewhere
-utf8 :: Codec
-utf8 = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-8"
-    enc text = (TE.encodeUtf8 text, Nothing)
-    dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
--     -- Whether the given byte is a continuation byte.
--     isContinuation byte = byte .&. 0xC0 == 0x80
-- 
--     -- The number of continuation bytes needed by the given
--     -- non-continuation byte. Returns -1 for an illegal UTF-8
--     -- non-continuation byte and the whole split quickly must fail so
--     -- as the input is passed to TE.decodeUtf8, which will issue a
--     -- suitable error.
--     required x0
--         | x0 .&. 0x80 == 0x00 = 0
--         | x0 .&. 0xE0 == 0xC0 = 1
--         | x0 .&. 0xF0 == 0xE0 = 2
--         | x0 .&. 0xF8 == 0xF0 = 3
--         | otherwise           = -1
-- 
--     splitQuickly bytes
--         | B.null l || req == -1 = Nothing
--         | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
--         | otherwise = Just (TE.decodeUtf8 l', r')
--       where
--         (l, r) = B.spanEnd isContinuation bytes
--         req = required (B.last l)
--         l' = B.init l
--         r' = B.cons (B.last l) r
-utf16_le :: Codec
-utf16_le = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-16-LE"
-    enc text = (TE.encodeUtf16LE text, Nothing)
-    dec bytes = case splitQuickly bytes of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf16LE bytes
-    splitQuickly bytes = maybeDecode (loop 0) where
-        maxN = B.length bytes
-        loop n |  n      == maxN = decodeAll
-               | (n + 1) == maxN = decodeTo n
-        loop n = let
-            req = utf16Required
-                (B.index bytes n)
-                (B.index bytes (n + 1))
-            decodeMore = loop $! n + req
-            in if n + req > maxN
-                then decodeTo n
-                else decodeMore
-        decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
-        decodeAll = (TE.decodeUtf16LE bytes, B.empty)
-utf16_be :: Codec
-utf16_be = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-16-BE"
-    enc text = (TE.encodeUtf16BE text, Nothing)
-    dec bytes = case splitQuickly bytes of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf16BE bytes
-    splitQuickly bytes = maybeDecode (loop 0) where
-        maxN = B.length bytes
-        loop n |  n      == maxN = decodeAll
-               | (n + 1) == maxN = decodeTo n
-        loop n = let
-            req = utf16Required
-                (B.index bytes (n + 1))
-                (B.index bytes n)
-            decodeMore = loop $! n + req
-            in if n + req > maxN
-                then decodeTo n
-                else decodeMore
-        decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
-        decodeAll = (TE.decodeUtf16BE bytes, B.empty)
-utf16Required :: Word8 -> Word8 -> Int
-utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
-    x :: Word16
-    x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
-utf32_le :: Codec
-utf32_le = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-32-LE"
-    enc text = (TE.encodeUtf32LE text, Nothing)
-    dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf32LE bs
-utf32_be :: Codec
-utf32_be = Codec name enc (toDecoding dec) where
-    name = T.pack "UTF-32-BE"
-    enc text = (TE.encodeUtf32BE text, Nothing)
-    dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
-        Just (text, extra) -> (text, Right extra)
-        Nothing -> splitSlowly TE.decodeUtf32BE bs
-utf32SplitBytes :: (ByteString -> Text)
-                -> ByteString
-                -> Maybe (Text, ByteString)
-utf32SplitBytes dec bytes = split where
-    split = maybeDecode (dec toDecode, extra)
-    len = B.length bytes
-    lenExtra = mod len 4
-    lenToDecode = len - lenExtra
-    (toDecode, extra) = if lenExtra == 0
-        then (bytes, B.empty)
-        else B.splitAt lenToDecode bytes
-tryEvaluate :: a -> Either Exc.SomeException a
-tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
-maybeDecode :: (a, b) -> Maybe (a, b)
-maybeDecode (a, b) = case tryEvaluate a of
-    Left _ -> Nothing
-    Right _ -> Just (a, b)
diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs
deleted file mode 100644
index b5d928a..0000000
--- a/Pipes/Text/Internal/Decoding.hs
+++ /dev/null
@@ -1,154 +0,0 @@
-{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
-{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
-{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
-- This module lifts assorted materials from Brian O'Sullivan's text package 
-- especially @Data.Text.Encoding@ in order to define a pipes-appropriate
-- 'streamDecodeUtf8'
-module Pipes.Text.Internal.Decoding 
-    ( Decoding(..)
-    , streamDecodeUtf8
-    , decodeSomeUtf8
-    ) where
-import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
-import Control.Monad.ST (ST, runST)
-import Data.Bits ((.&.))
-import Data.ByteString as B 
-import Data.ByteString (ByteString)
-import Data.ByteString.Internal as B 
-import Data.ByteString.Char8 as B8
-import Data.Text (Text)
-import qualified Data.Text as T 
-import qualified Data.Text.Encoding as TE 
-import Data.Text.Encoding.Error ()
-import Data.Text.Internal (Text, textP)
-import Foreign.C.Types (CSize)
-import Foreign.ForeignPtr (withForeignPtr)
-import Foreign.Marshal.Utils (with)
-import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
-import Foreign.Storable (Storable, peek, poke)
-import GHC.Base  (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
-import GHC.Word (Word8, Word32)
-import qualified Data.Text.Array as A
-import Data.Word (Word8, Word16)
-import System.IO.Unsafe (unsafePerformIO)
-import qualified Control.Exception as Exc
-import Data.Bits ((.&.), (.|.), shiftL)
-import Data.Typeable
-import Control.Arrow (first)
-import Data.Maybe (catMaybes)
-#include "pipes_text_cbits.h"
--  A stream oriented decoding result. Distinct from the similar type in Data.Text.Encoding
-data Decoding = Some Text ByteString (ByteString -> Decoding) 
-               -- Text, continuation and any undecoded fragment.
-              | Other Text ByteString  
-              --  Text followed by an undecodable ByteString
-              
-instance Show Decoding where
-    showsPrec d (Some t bs _) = showParen (d > prec) $
-                                showString "Some " . showsPrec prec' t .
-                                showChar ' ' . showsPrec prec' bs .
-                                showString " _"
-      where prec = 10; prec' = prec + 1
-    showsPrec d (Other t bs)  = showParen (d > prec) $
-                                showString "Other " . showsPrec prec' t .
-                                showChar ' ' . showsPrec prec' bs .
-                                showString " _"
-      where prec = 10; prec' = prec + 1
-newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
-newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
--  Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'. 
-streamDecodeUtf8 :: ByteString -> Decoding
-streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0 
-  where
-  decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
-  decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) = 
-                    runST $ do marray <- A.new (len+1) 
-                               unsafeIOToST (decodeChunkToBuffer marray)
-     where
-     decodeChunkToBuffer :: A.MArray s -> IO Decoding
-     decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
-       with (0::CSize) $ \destOffPtr ->
-       with codepoint0 $ \codepointPtr ->
-       with state0     $ \statePtr ->
-       with nullPtr    $ \curPtrPtr ->
-         do let end = ptr `plusPtr` (off + len)
-                curPtr = ptr `plusPtr` off
-            poke curPtrPtr curPtr
-            c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
-            state <- peek statePtr
-            lastPtr <- peek curPtrPtr
-            codepoint <- peek codepointPtr
-            n <- peek destOffPtr
-            chunkText <- mkText dest n
-            let left      = lastPtr `minusPtr` curPtr
-                remaining = B.drop left bs
-                accum = if T.null chunkText then B.append old remaining  else remaining 
-            return $! case state of 
-              UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
-              _ ->           Some  chunkText accum (decodeChunkUtf8 accum codepoint state)
-     {-# INLINE decodeChunkToBuffer #-}
-  {-# INLINE decodeChunkUtf8 #-}
-{-# INLINE streamDecodeUtf8 #-}
--  Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble
-decodeSomeUtf8 :: ByteString -> (Text, ByteString)
-decodeSomeUtf8 bs@(PS fp off len) = runST $ do 
-  dest <- A.new (len+1) 
-  unsafeIOToST $ 
-     withForeignPtr fp $ \ptr ->
-     with (0::CSize)        $ \destOffPtr ->
-     with (0::CodePoint)    $ \codepointPtr ->
-     with (0::DecoderState) $ \statePtr ->
-     with nullPtr           $ \curPtrPtr ->
-       do let end = ptr `plusPtr` (off + len)
-              curPtr = ptr `plusPtr` off
-          poke curPtrPtr curPtr
-          c_decode_utf8_with_state (A.maBA dest) destOffPtr 
-                                   curPtrPtr end codepointPtr statePtr
-          state <- peek statePtr
-          lastPtr <- peek curPtrPtr
-          codepoint <- peek codepointPtr
-          n <- peek destOffPtr
-          chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
-                                         return $! textP arr 0 (fromIntegral n)
-          let left      = lastPtr `minusPtr` curPtr
-              remaining = B.drop left bs
-          return $! (chunkText, remaining)
-{-# INLINE decodeSomeUtf8 #-}
-mkText :: A.MArray s -> CSize -> IO Text
-mkText dest n =  unsafeSTToIO $ do arr <- A.unsafeFreeze dest
-                                   return $! textP arr 0 (fromIntegral n)
-{-# INLINE mkText #-}
-ord :: Char -> Int
-ord (C# c#) = I# (ord# c#)
-{-# INLINE ord #-}
-unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
-unsafeWrite marr i c
-    | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
-                       return 1
-    | otherwise   = do A.unsafeWrite marr i lo
-                       A.unsafeWrite marr (i+1) hi
-                       return 2
-    where n = ord c
-          m = n - 0x10000
-          lo = fromIntegral $ (m `shiftR` 10) + 0xD800
-          hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
-          shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
-          {-# INLINE shiftR #-}
-{-# INLINE unsafeWrite #-}
-foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
-    :: MutableByteArray# s -> Ptr CSize
-    -> Ptr (Ptr Word8) -> Ptr Word8
-    -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
-\ No newline at end of file
author	michaelt <what_is_it_to_do_anything@yahoo.com>	2014-02-15 10:01:48 -0500
committer	michaelt <what_is_it_to_do_anything@yahoo.com>	2014-02-15 10:01:48 -0500
commit	bbdfd3056da4992e18d3983fd5992bee23af93af (patch)
tree	5765f3f224708890cf50b400c675c23e759595a2 /Pipes/Text/Internal
parent	fd27d0c43e2995437cf1f3e2d6f292597371ebb0 (diff)
download	text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.gz text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.zst text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.zip

diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs deleted file mode 100644 index 075a152..0000000 --- a/Pipes/Text/Internal/Codec.hs +++ /dev/null
@@ -1,216 +0,0 @@
1
2	{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
3	-- \|
4	-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
5	-- License: MIT
6	-- This Parts of this code were taken from enumerator and conduits, and adapted for pipes
7
8	-- This module follows the model of the enumerator and conduits libraries, and defines
9	-- 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and
10	-- iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so
11	-- special functions appear in @Pipes.Text@
12
13
14	module Pipes.Text.Internal.Codec
15	( Codec(..)
16	, TextException(..)
17	, utf8
18	, utf16_le
19	, utf16_be
20	, utf32_le
21	, utf32_be
22	) where
23
24	import Data.Bits ((.&.))
25	import Data.Char (ord)
26	import Data.ByteString as B
27	import Data.ByteString (ByteString)
28	import Data.ByteString.Internal as B
29	import Data.ByteString.Char8 as B8
30	import Data.Text (Text)
31	import qualified Data.Text as T
32	import qualified Data.Text.Encoding as TE
33	import Data.Text.Encoding.Error ()
34	import GHC.Word (Word8, Word32)
35	import qualified Data.Text.Array as A
36	import Data.Word (Word8, Word16)
37	import System.IO.Unsafe (unsafePerformIO)
38	import qualified Control.Exception as Exc
39	import Data.Bits ((.&.), (.\|.), shiftL)
40	import Data.Typeable
41	import Control.Arrow (first)
42	import Data.Maybe (catMaybes)
43	import Pipes.Text.Internal.Decoding
44	import Pipes
45	-- \| A specific character encoding.
46
47	data Codec = Codec
48	{ codecName :: Text
49	, codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
50	, codecDecode :: ByteString -> Decoding
51	}
52
53	instance Show Codec where
54	showsPrec d c = showParen (d > 10) $
55	showString "Codec " . shows (codecName c)
56
57	data TextException = DecodeException Codec Word8
58	\| EncodeException Codec Char
59	\| LengthExceeded Int
60	\| TextException Exc.SomeException
61	deriving (Show, Typeable)
62	instance Exc.Exception TextException
63
64
65	toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
66	-> (ByteString -> Decoding)
67	toDecoding op = loop B.empty where
68	loop !extra bs0 = case op (B.append extra bs0) of
69	(txt, Right bs) -> Some txt bs (loop bs)
70	(txt, Left (_,bs)) -> Other txt bs
71	-- To do: toDecoding should be inlined in each of the 'Codec' definitions
72	-- or else Codec changed to the conduit/enumerator definition. We have
73	-- altered it to use 'streamDecodeUtf8'
74
75	splitSlowly :: (ByteString -> Text)
76	-> ByteString
77	-> (Text, Either (TextException, ByteString) ByteString)
78	splitSlowly dec bytes = valid where
79	valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
80	splits 0 = [(B.empty, bytes)]
81	splits n = B.splitAt n bytes : splits (n - 1)
82	decFirst (a, b) = case tryEvaluate (dec a) of
83	Left _ -> Nothing
84	Right text -> let trouble = case tryEvaluate (dec b) of
85	Left exc -> Left (TextException exc, b)
86	Right _ -> Right B.empty
87	in Just (text, trouble) -- this case shouldn't occur,
88	-- since splitSlowly is only called
89	-- when parsing failed somewhere
90
91	utf8 :: Codec
92	utf8 = Codec name enc (toDecoding dec) where
93	name = T.pack "UTF-8"
94	enc text = (TE.encodeUtf8 text, Nothing)
95	dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
96
97	-- -- Whether the given byte is a continuation byte.
98	-- isContinuation byte = byte .&. 0xC0 == 0x80
99	--
100	-- -- The number of continuation bytes needed by the given
101	-- -- non-continuation byte. Returns -1 for an illegal UTF-8
102	-- -- non-continuation byte and the whole split quickly must fail so
103	-- -- as the input is passed to TE.decodeUtf8, which will issue a
104	-- -- suitable error.
105	-- required x0
106	-- \| x0 .&. 0x80 == 0x00 = 0
107	-- \| x0 .&. 0xE0 == 0xC0 = 1
108	-- \| x0 .&. 0xF0 == 0xE0 = 2
109	-- \| x0 .&. 0xF8 == 0xF0 = 3
110	-- \| otherwise = -1
111	--
112	-- splitQuickly bytes
113	-- \| B.null l \|\| req == -1 = Nothing
114	-- \| req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
115	-- \| otherwise = Just (TE.decodeUtf8 l', r')
116	-- where
117	-- (l, r) = B.spanEnd isContinuation bytes
118	-- req = required (B.last l)
119	-- l' = B.init l
120	-- r' = B.cons (B.last l) r
121
122
123	utf16_le :: Codec
124	utf16_le = Codec name enc (toDecoding dec) where
125	name = T.pack "UTF-16-LE"
126	enc text = (TE.encodeUtf16LE text, Nothing)
127	dec bytes = case splitQuickly bytes of
128	Just (text, extra) -> (text, Right extra)
129	Nothing -> splitSlowly TE.decodeUtf16LE bytes
130
131	splitQuickly bytes = maybeDecode (loop 0) where
132	maxN = B.length bytes
133
134	loop n \| n == maxN = decodeAll
135	\| (n + 1) == maxN = decodeTo n
136	loop n = let
137	req = utf16Required
138	(B.index bytes n)
139	(B.index bytes (n + 1))
140	decodeMore = loop $! n + req
141	in if n + req > maxN
142	then decodeTo n
143	else decodeMore
144
145	decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
146	decodeAll = (TE.decodeUtf16LE bytes, B.empty)
147
148	utf16_be :: Codec
149	utf16_be = Codec name enc (toDecoding dec) where
150	name = T.pack "UTF-16-BE"
151	enc text = (TE.encodeUtf16BE text, Nothing)
152	dec bytes = case splitQuickly bytes of
153	Just (text, extra) -> (text, Right extra)
154	Nothing -> splitSlowly TE.decodeUtf16BE bytes
155
156	splitQuickly bytes = maybeDecode (loop 0) where
157	maxN = B.length bytes
158
159	loop n \| n == maxN = decodeAll
160	\| (n + 1) == maxN = decodeTo n
161	loop n = let
162	req = utf16Required
163	(B.index bytes (n + 1))
164	(B.index bytes n)
165	decodeMore = loop $! n + req
166	in if n + req > maxN
167	then decodeTo n
168	else decodeMore
169
170	decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
171	decodeAll = (TE.decodeUtf16BE bytes, B.empty)
172
173	utf16Required :: Word8 -> Word8 -> Int
174	utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
175	x :: Word16
176	x = (fromIntegral x1 `shiftL` 8) .\|. fromIntegral x0
177
178
179	utf32_le :: Codec
180	utf32_le = Codec name enc (toDecoding dec) where
181	name = T.pack "UTF-32-LE"
182	enc text = (TE.encodeUtf32LE text, Nothing)
183	dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
184	Just (text, extra) -> (text, Right extra)
185	Nothing -> splitSlowly TE.decodeUtf32LE bs
186
187
188	utf32_be :: Codec
189	utf32_be = Codec name enc (toDecoding dec) where
190	name = T.pack "UTF-32-BE"
191	enc text = (TE.encodeUtf32BE text, Nothing)
192	dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
193	Just (text, extra) -> (text, Right extra)
194	Nothing -> splitSlowly TE.decodeUtf32BE bs
195
196	utf32SplitBytes :: (ByteString -> Text)
197	-> ByteString
198	-> Maybe (Text, ByteString)
199	utf32SplitBytes dec bytes = split where
200	split = maybeDecode (dec toDecode, extra)
201	len = B.length bytes
202	lenExtra = mod len 4
203
204	lenToDecode = len - lenExtra
205	(toDecode, extra) = if lenExtra == 0
206	then (bytes, B.empty)
207	else B.splitAt lenToDecode bytes
208
209
210	tryEvaluate :: a -> Either Exc.SomeException a
211	tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
212
213	maybeDecode :: (a, b) -> Maybe (a, b)
214	maybeDecode (a, b) = case tryEvaluate a of
215	Left _ -> Nothing
216	Right _ -> Just (a, b)


diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs deleted file mode 100644 index b5d928a..0000000 --- a/Pipes/Text/Internal/Decoding.hs +++ /dev/null
@@ -1,154 +0,0 @@
1	{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2	{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
3	{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
4
5	-- This module lifts assorted materials from Brian O'Sullivan's text package
6	-- especially @Data.Text.Encoding@ in order to define a pipes-appropriate
7	-- 'streamDecodeUtf8'
8
9	module Pipes.Text.Internal.Decoding
10	( Decoding(..)
11	, streamDecodeUtf8
12	, decodeSomeUtf8
13	) where
14	import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
15	import Control.Monad.ST (ST, runST)
16	import Data.Bits ((.&.))
17	import Data.ByteString as B
18	import Data.ByteString (ByteString)
19	import Data.ByteString.Internal as B
20	import Data.ByteString.Char8 as B8
21	import Data.Text (Text)
22	import qualified Data.Text as T
23	import qualified Data.Text.Encoding as TE
24	import Data.Text.Encoding.Error ()
25	import Data.Text.Internal (Text, textP)
26	import Foreign.C.Types (CSize)
27	import Foreign.ForeignPtr (withForeignPtr)
28	import Foreign.Marshal.Utils (with)
29	import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
30	import Foreign.Storable (Storable, peek, poke)
31	import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
32	import GHC.Word (Word8, Word32)
33	import qualified Data.Text.Array as A
34	import Data.Word (Word8, Word16)
35	import System.IO.Unsafe (unsafePerformIO)
36	import qualified Control.Exception as Exc
37	import Data.Bits ((.&.), (.\|.), shiftL)
38	import Data.Typeable
39	import Control.Arrow (first)
40	import Data.Maybe (catMaybes)
41	#include "pipes_text_cbits.h"
42
43
44
45	-- A stream oriented decoding result. Distinct from the similar type in Data.Text.Encoding
46
47	data Decoding = Some Text ByteString (ByteString -> Decoding)
48	-- Text, continuation and any undecoded fragment.
49	\| Other Text ByteString
50	-- Text followed by an undecodable ByteString
51
52	instance Show Decoding where
53	showsPrec d (Some t bs _) = showParen (d > prec) $
54	showString "Some " . showsPrec prec' t .
55	showChar ' ' . showsPrec prec' bs .
56	showString " _"
57	where prec = 10; prec' = prec + 1
58	showsPrec d (Other t bs) = showParen (d > prec) $
59	showString "Other " . showsPrec prec' t .
60	showChar ' ' . showsPrec prec' bs .
61	showString " _"
62	where prec = 10; prec' = prec + 1
63
64	newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
65	newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
66
67	-- Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'.
68	streamDecodeUtf8 :: ByteString -> Decoding
69	streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
70	where
71	decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
72	decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
73	runST $ do marray <- A.new (len+1)
74	unsafeIOToST (decodeChunkToBuffer marray)
75	where
76	decodeChunkToBuffer :: A.MArray s -> IO Decoding
77	decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
78	with (0::CSize) $ \destOffPtr ->
79	with codepoint0 $ \codepointPtr ->
80	with state0 $ \statePtr ->
81	with nullPtr $ \curPtrPtr ->
82	do let end = ptr `plusPtr` (off + len)
83	curPtr = ptr `plusPtr` off
84	poke curPtrPtr curPtr
85	c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
86	state <- peek statePtr
87	lastPtr <- peek curPtrPtr
88	codepoint <- peek codepointPtr
89	n <- peek destOffPtr
90	chunkText <- mkText dest n
91	let left = lastPtr `minusPtr` curPtr
92	remaining = B.drop left bs
93	accum = if T.null chunkText then B.append old remaining else remaining
94	return $! case state of
95	UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
96	_ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
97	{-# INLINE decodeChunkToBuffer #-}
98	{-# INLINE decodeChunkUtf8 #-}
99	{-# INLINE streamDecodeUtf8 #-}
100
101	-- Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble
102	decodeSomeUtf8 :: ByteString -> (Text, ByteString)
103	decodeSomeUtf8 bs@(PS fp off len) = runST $ do
104	dest <- A.new (len+1)
105	unsafeIOToST $
106	withForeignPtr fp $ \ptr ->
107	with (0::CSize) $ \destOffPtr ->
108	with (0::CodePoint) $ \codepointPtr ->
109	with (0::DecoderState) $ \statePtr ->
110	with nullPtr $ \curPtrPtr ->
111	do let end = ptr `plusPtr` (off + len)
112	curPtr = ptr `plusPtr` off
113	poke curPtrPtr curPtr
114	c_decode_utf8_with_state (A.maBA dest) destOffPtr
115	curPtrPtr end codepointPtr statePtr
116	state <- peek statePtr
117	lastPtr <- peek curPtrPtr
118	codepoint <- peek codepointPtr
119	n <- peek destOffPtr
120	chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
121	return $! textP arr 0 (fromIntegral n)
122	let left = lastPtr `minusPtr` curPtr
123	remaining = B.drop left bs
124	return $! (chunkText, remaining)
125	{-# INLINE decodeSomeUtf8 #-}
126
127	mkText :: A.MArray s -> CSize -> IO Text
128	mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
129	return $! textP arr 0 (fromIntegral n)
130	{-# INLINE mkText #-}
131
132	ord :: Char -> Int
133	ord (C# c#) = I# (ord# c#)
134	{-# INLINE ord #-}
135
136	unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
137	unsafeWrite marr i c
138	\| n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
139	return 1
140	\| otherwise = do A.unsafeWrite marr i lo
141	A.unsafeWrite marr (i+1) hi
142	return 2
143	where n = ord c
144	m = n - 0x10000
145	lo = fromIntegral $ (m `shiftR` 10) + 0xD800
146	hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
147	shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
148	{-# INLINE shiftR #-}
149	{-# INLINE unsafeWrite #-}
150
151	foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
152	:: MutableByteArray# s -> Ptr CSize
153	-> Ptr (Ptr Word8) -> Ptr Word8
154	-> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file