aboutsummaryrefslogtreecommitdiffhomepage
path: root/Pipes/Text/Internal
diff options
context:
space:
mode:
authormichaelt <what_is_it_to_do_anything@yahoo.com>2014-02-15 10:01:48 -0500
committermichaelt <what_is_it_to_do_anything@yahoo.com>2014-02-15 10:01:48 -0500
commitbbdfd3056da4992e18d3983fd5992bee23af93af (patch)
tree5765f3f224708890cf50b400c675c23e759595a2 /Pipes/Text/Internal
parentfd27d0c43e2995437cf1f3e2d6f292597371ebb0 (diff)
downloadtext-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.gz
text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.tar.zst
text-pipes-bbdfd3056da4992e18d3983fd5992bee23af93af.zip
use new text-stream-decoding
Diffstat (limited to 'Pipes/Text/Internal')
-rw-r--r--Pipes/Text/Internal/Codec.hs216
-rw-r--r--Pipes/Text/Internal/Decoding.hs154
2 files changed, 0 insertions, 370 deletions
diff --git a/Pipes/Text/Internal/Codec.hs b/Pipes/Text/Internal/Codec.hs
deleted file mode 100644
index 075a152..0000000
--- a/Pipes/Text/Internal/Codec.hs
+++ /dev/null
@@ -1,216 +0,0 @@
1
2{-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-}
3-- |
4-- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin
5-- License: MIT
6-- This Parts of this code were taken from enumerator and conduits, and adapted for pipes
7
8-- This module follows the model of the enumerator and conduits libraries, and defines
9-- 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and
10-- iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so
11-- special functions appear in @Pipes.Text@
12
13
14module Pipes.Text.Internal.Codec
15 ( Codec(..)
16 , TextException(..)
17 , utf8
18 , utf16_le
19 , utf16_be
20 , utf32_le
21 , utf32_be
22 ) where
23
24import Data.Bits ((.&.))
25import Data.Char (ord)
26import Data.ByteString as B
27import Data.ByteString (ByteString)
28import Data.ByteString.Internal as B
29import Data.ByteString.Char8 as B8
30import Data.Text (Text)
31import qualified Data.Text as T
32import qualified Data.Text.Encoding as TE
33import Data.Text.Encoding.Error ()
34import GHC.Word (Word8, Word32)
35import qualified Data.Text.Array as A
36import Data.Word (Word8, Word16)
37import System.IO.Unsafe (unsafePerformIO)
38import qualified Control.Exception as Exc
39import Data.Bits ((.&.), (.|.), shiftL)
40import Data.Typeable
41import Control.Arrow (first)
42import Data.Maybe (catMaybes)
43import Pipes.Text.Internal.Decoding
44import Pipes
45-- | A specific character encoding.
46
47data Codec = Codec
48 { codecName :: Text
49 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
50 , codecDecode :: ByteString -> Decoding
51 }
52
53instance Show Codec where
54 showsPrec d c = showParen (d > 10) $
55 showString "Codec " . shows (codecName c)
56
57data TextException = DecodeException Codec Word8
58 | EncodeException Codec Char
59 | LengthExceeded Int
60 | TextException Exc.SomeException
61 deriving (Show, Typeable)
62instance Exc.Exception TextException
63
64
65toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
66 -> (ByteString -> Decoding)
67toDecoding op = loop B.empty where
68 loop !extra bs0 = case op (B.append extra bs0) of
69 (txt, Right bs) -> Some txt bs (loop bs)
70 (txt, Left (_,bs)) -> Other txt bs
71-- To do: toDecoding should be inlined in each of the 'Codec' definitions
72-- or else Codec changed to the conduit/enumerator definition. We have
73-- altered it to use 'streamDecodeUtf8'
74
75splitSlowly :: (ByteString -> Text)
76 -> ByteString
77 -> (Text, Either (TextException, ByteString) ByteString)
78splitSlowly dec bytes = valid where
79 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
80 splits 0 = [(B.empty, bytes)]
81 splits n = B.splitAt n bytes : splits (n - 1)
82 decFirst (a, b) = case tryEvaluate (dec a) of
83 Left _ -> Nothing
84 Right text -> let trouble = case tryEvaluate (dec b) of
85 Left exc -> Left (TextException exc, b)
86 Right _ -> Right B.empty
87 in Just (text, trouble) -- this case shouldn't occur,
88 -- since splitSlowly is only called
89 -- when parsing failed somewhere
90
91utf8 :: Codec
92utf8 = Codec name enc (toDecoding dec) where
93 name = T.pack "UTF-8"
94 enc text = (TE.encodeUtf8 text, Nothing)
95 dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b)
96
97-- -- Whether the given byte is a continuation byte.
98-- isContinuation byte = byte .&. 0xC0 == 0x80
99--
100-- -- The number of continuation bytes needed by the given
101-- -- non-continuation byte. Returns -1 for an illegal UTF-8
102-- -- non-continuation byte and the whole split quickly must fail so
103-- -- as the input is passed to TE.decodeUtf8, which will issue a
104-- -- suitable error.
105-- required x0
106-- | x0 .&. 0x80 == 0x00 = 0
107-- | x0 .&. 0xE0 == 0xC0 = 1
108-- | x0 .&. 0xF0 == 0xE0 = 2
109-- | x0 .&. 0xF8 == 0xF0 = 3
110-- | otherwise = -1
111--
112-- splitQuickly bytes
113-- | B.null l || req == -1 = Nothing
114-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
115-- | otherwise = Just (TE.decodeUtf8 l', r')
116-- where
117-- (l, r) = B.spanEnd isContinuation bytes
118-- req = required (B.last l)
119-- l' = B.init l
120-- r' = B.cons (B.last l) r
121
122
123utf16_le :: Codec
124utf16_le = Codec name enc (toDecoding dec) where
125 name = T.pack "UTF-16-LE"
126 enc text = (TE.encodeUtf16LE text, Nothing)
127 dec bytes = case splitQuickly bytes of
128 Just (text, extra) -> (text, Right extra)
129 Nothing -> splitSlowly TE.decodeUtf16LE bytes
130
131 splitQuickly bytes = maybeDecode (loop 0) where
132 maxN = B.length bytes
133
134 loop n | n == maxN = decodeAll
135 | (n + 1) == maxN = decodeTo n
136 loop n = let
137 req = utf16Required
138 (B.index bytes n)
139 (B.index bytes (n + 1))
140 decodeMore = loop $! n + req
141 in if n + req > maxN
142 then decodeTo n
143 else decodeMore
144
145 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
146 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
147
148utf16_be :: Codec
149utf16_be = Codec name enc (toDecoding dec) where
150 name = T.pack "UTF-16-BE"
151 enc text = (TE.encodeUtf16BE text, Nothing)
152 dec bytes = case splitQuickly bytes of
153 Just (text, extra) -> (text, Right extra)
154 Nothing -> splitSlowly TE.decodeUtf16BE bytes
155
156 splitQuickly bytes = maybeDecode (loop 0) where
157 maxN = B.length bytes
158
159 loop n | n == maxN = decodeAll
160 | (n + 1) == maxN = decodeTo n
161 loop n = let
162 req = utf16Required
163 (B.index bytes (n + 1))
164 (B.index bytes n)
165 decodeMore = loop $! n + req
166 in if n + req > maxN
167 then decodeTo n
168 else decodeMore
169
170 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
171 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
172
173utf16Required :: Word8 -> Word8 -> Int
174utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
175 x :: Word16
176 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
177
178
179utf32_le :: Codec
180utf32_le = Codec name enc (toDecoding dec) where
181 name = T.pack "UTF-32-LE"
182 enc text = (TE.encodeUtf32LE text, Nothing)
183 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
184 Just (text, extra) -> (text, Right extra)
185 Nothing -> splitSlowly TE.decodeUtf32LE bs
186
187
188utf32_be :: Codec
189utf32_be = Codec name enc (toDecoding dec) where
190 name = T.pack "UTF-32-BE"
191 enc text = (TE.encodeUtf32BE text, Nothing)
192 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
193 Just (text, extra) -> (text, Right extra)
194 Nothing -> splitSlowly TE.decodeUtf32BE bs
195
196utf32SplitBytes :: (ByteString -> Text)
197 -> ByteString
198 -> Maybe (Text, ByteString)
199utf32SplitBytes dec bytes = split where
200 split = maybeDecode (dec toDecode, extra)
201 len = B.length bytes
202 lenExtra = mod len 4
203
204 lenToDecode = len - lenExtra
205 (toDecode, extra) = if lenExtra == 0
206 then (bytes, B.empty)
207 else B.splitAt lenToDecode bytes
208
209
210tryEvaluate :: a -> Either Exc.SomeException a
211tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
212
213maybeDecode :: (a, b) -> Maybe (a, b)
214maybeDecode (a, b) = case tryEvaluate a of
215 Left _ -> Nothing
216 Right _ -> Just (a, b)
diff --git a/Pipes/Text/Internal/Decoding.hs b/Pipes/Text/Internal/Decoding.hs
deleted file mode 100644
index b5d928a..0000000
--- a/Pipes/Text/Internal/Decoding.hs
+++ /dev/null
@@ -1,154 +0,0 @@
1{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface #-}
2{-# LANGUAGE GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-}
3{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
4
5-- This module lifts assorted materials from Brian O'Sullivan's text package
6-- especially @Data.Text.Encoding@ in order to define a pipes-appropriate
7-- 'streamDecodeUtf8'
8
9module Pipes.Text.Internal.Decoding
10 ( Decoding(..)
11 , streamDecodeUtf8
12 , decodeSomeUtf8
13 ) where
14import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
15import Control.Monad.ST (ST, runST)
16import Data.Bits ((.&.))
17import Data.ByteString as B
18import Data.ByteString (ByteString)
19import Data.ByteString.Internal as B
20import Data.ByteString.Char8 as B8
21import Data.Text (Text)
22import qualified Data.Text as T
23import qualified Data.Text.Encoding as TE
24import Data.Text.Encoding.Error ()
25import Data.Text.Internal (Text, textP)
26import Foreign.C.Types (CSize)
27import Foreign.ForeignPtr (withForeignPtr)
28import Foreign.Marshal.Utils (with)
29import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
30import Foreign.Storable (Storable, peek, poke)
31import GHC.Base (Char(..), Int(..), MutableByteArray#, ord#, iShiftRA#)
32import GHC.Word (Word8, Word32)
33import qualified Data.Text.Array as A
34import Data.Word (Word8, Word16)
35import System.IO.Unsafe (unsafePerformIO)
36import qualified Control.Exception as Exc
37import Data.Bits ((.&.), (.|.), shiftL)
38import Data.Typeable
39import Control.Arrow (first)
40import Data.Maybe (catMaybes)
41#include "pipes_text_cbits.h"
42
43
44
45-- A stream oriented decoding result. Distinct from the similar type in Data.Text.Encoding
46
47data Decoding = Some Text ByteString (ByteString -> Decoding)
48 -- Text, continuation and any undecoded fragment.
49 | Other Text ByteString
50 -- Text followed by an undecodable ByteString
51
52instance Show Decoding where
53 showsPrec d (Some t bs _) = showParen (d > prec) $
54 showString "Some " . showsPrec prec' t .
55 showChar ' ' . showsPrec prec' bs .
56 showString " _"
57 where prec = 10; prec' = prec + 1
58 showsPrec d (Other t bs) = showParen (d > prec) $
59 showString "Other " . showsPrec prec' t .
60 showChar ' ' . showsPrec prec' bs .
61 showString " _"
62 where prec = 10; prec' = prec + 1
63
64newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
65newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
66
67-- Resolve a 'ByteString' into 'Text' and a continuation that can handle further 'ByteStrings'.
68streamDecodeUtf8 :: ByteString -> Decoding
69streamDecodeUtf8 = decodeChunkUtf8 B.empty 0 0
70 where
71 decodeChunkUtf8 :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding
72 decodeChunkUtf8 old codepoint0 state0 bs@(PS fp off len) =
73 runST $ do marray <- A.new (len+1)
74 unsafeIOToST (decodeChunkToBuffer marray)
75 where
76 decodeChunkToBuffer :: A.MArray s -> IO Decoding
77 decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
78 with (0::CSize) $ \destOffPtr ->
79 with codepoint0 $ \codepointPtr ->
80 with state0 $ \statePtr ->
81 with nullPtr $ \curPtrPtr ->
82 do let end = ptr `plusPtr` (off + len)
83 curPtr = ptr `plusPtr` off
84 poke curPtrPtr curPtr
85 c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtrPtr end codepointPtr statePtr
86 state <- peek statePtr
87 lastPtr <- peek curPtrPtr
88 codepoint <- peek codepointPtr
89 n <- peek destOffPtr
90 chunkText <- mkText dest n
91 let left = lastPtr `minusPtr` curPtr
92 remaining = B.drop left bs
93 accum = if T.null chunkText then B.append old remaining else remaining
94 return $! case state of
95 UTF8_REJECT -> Other chunkText accum -- We encountered an encoding error
96 _ -> Some chunkText accum (decodeChunkUtf8 accum codepoint state)
97 {-# INLINE decodeChunkToBuffer #-}
98 {-# INLINE decodeChunkUtf8 #-}
99{-# INLINE streamDecodeUtf8 #-}
100
101-- Resolve a ByteString into an initial segment of intelligible 'Text' and whatever is unintelligble
102decodeSomeUtf8 :: ByteString -> (Text, ByteString)
103decodeSomeUtf8 bs@(PS fp off len) = runST $ do
104 dest <- A.new (len+1)
105 unsafeIOToST $
106 withForeignPtr fp $ \ptr ->
107 with (0::CSize) $ \destOffPtr ->
108 with (0::CodePoint) $ \codepointPtr ->
109 with (0::DecoderState) $ \statePtr ->
110 with nullPtr $ \curPtrPtr ->
111 do let end = ptr `plusPtr` (off + len)
112 curPtr = ptr `plusPtr` off
113 poke curPtrPtr curPtr
114 c_decode_utf8_with_state (A.maBA dest) destOffPtr
115 curPtrPtr end codepointPtr statePtr
116 state <- peek statePtr
117 lastPtr <- peek curPtrPtr
118 codepoint <- peek codepointPtr
119 n <- peek destOffPtr
120 chunkText <- unsafeSTToIO $ do arr <- A.unsafeFreeze dest
121 return $! textP arr 0 (fromIntegral n)
122 let left = lastPtr `minusPtr` curPtr
123 remaining = B.drop left bs
124 return $! (chunkText, remaining)
125{-# INLINE decodeSomeUtf8 #-}
126
127mkText :: A.MArray s -> CSize -> IO Text
128mkText dest n = unsafeSTToIO $ do arr <- A.unsafeFreeze dest
129 return $! textP arr 0 (fromIntegral n)
130{-# INLINE mkText #-}
131
132ord :: Char -> Int
133ord (C# c#) = I# (ord# c#)
134{-# INLINE ord #-}
135
136unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int
137unsafeWrite marr i c
138 | n < 0x10000 = do A.unsafeWrite marr i (fromIntegral n)
139 return 1
140 | otherwise = do A.unsafeWrite marr i lo
141 A.unsafeWrite marr (i+1) hi
142 return 2
143 where n = ord c
144 m = n - 0x10000
145 lo = fromIntegral $ (m `shiftR` 10) + 0xD800
146 hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00
147 shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#)
148 {-# INLINE shiftR #-}
149{-# INLINE unsafeWrite #-}
150
151foreign import ccall unsafe "_hs_pipes_text_decode_utf8_state" c_decode_utf8_with_state
152 :: MutableByteArray# s -> Ptr CSize
153 -> Ptr (Ptr Word8) -> Ptr Word8
154 -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) \ No newline at end of file