aboutsummaryrefslogtreecommitdiffhomepage
path: root/Pipes/Text/Internal.hs
diff options
context:
space:
mode:
Diffstat (limited to 'Pipes/Text/Internal.hs')
-rw-r--r--Pipes/Text/Internal.hs212
1 files changed, 0 insertions, 212 deletions
diff --git a/Pipes/Text/Internal.hs b/Pipes/Text/Internal.hs
index 76c2f4f..bcee278 100644
--- a/Pipes/Text/Internal.hs
+++ b/Pipes/Text/Internal.hs
@@ -9,9 +9,6 @@ module Pipes.Text.Internal
9 ( Decoding(..) 9 ( Decoding(..)
10 , streamDecodeUtf8 10 , streamDecodeUtf8
11 , decodeSomeUtf8 11 , decodeSomeUtf8
12 , Codec(..)
13 , TextException(..)
14 , utf8
15 ) where 12 ) where
16import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) 13import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
17import Control.Monad.ST (ST, runST) 14import Control.Monad.ST (ST, runST)
@@ -43,215 +40,6 @@ import Data.Maybe (catMaybes)
43#include "pipes_text_cbits.h" 40#include "pipes_text_cbits.h"
44 41
45 42
46-- | A specific character encoding.
47--
48-- Since 0.3.0
49data Codec = Codec
50 { codecName :: Text
51 , codecEncode :: Text -> (ByteString, Maybe (TextException, Text))
52 , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString)
53 }
54
55instance Show Codec where
56 showsPrec d c = showParen (d > 10) $ showString "Codec " . shows (codecName c)
57
58-- Since 0.3.0
59data TextException = DecodeException Codec Word8
60 | EncodeException Codec Char
61 | LengthExceeded Int
62 | TextException Exc.SomeException
63 deriving (Show, Typeable)
64instance Exc.Exception TextException
65
66toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString))
67 -> (ByteString -> Decoding)
68toDecoding op = loop B.empty where
69 loop extra bs0 = case op (B.append extra bs0) of
70 (txt, Right bs) -> Some txt bs (loop bs)
71 (txt, Left (_,bs)) -> Other txt bs
72
73
74splitSlowly :: (ByteString -> Text)
75 -> ByteString
76 -> (Text, Either (TextException, ByteString) ByteString)
77splitSlowly dec bytes = valid where
78 valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes)
79 splits 0 = [(B.empty, bytes)]
80 splits n = B.splitAt n bytes : splits (n - 1)
81 decFirst (a, b) = case tryEvaluate (dec a) of
82 Left _ -> Nothing
83 Right text -> let trouble = case tryEvaluate (dec b) of
84 Left exc -> Left (TextException exc, b)
85 Right _ -> Right B.empty
86 in Just (text, trouble)
87 -- this case shouldn't occur,
88 -- since splitSlowly is only called
89 -- when parsing failed somewhere
90
91utf8 :: Codec
92utf8 = Codec name enc (toDecoding dec) where
93 name = T.pack "UTF-8"
94 enc text = (TE.encodeUtf8 text, Nothing)
95 dec bytes = case decodeSomeUtf8 bytes of
96 (t,b) -> (t, Right b)
97
98-- -- Whether the given byte is a continuation byte.
99-- isContinuation byte = byte .&. 0xC0 == 0x80
100--
101-- -- The number of continuation bytes needed by the given
102-- -- non-continuation byte. Returns -1 for an illegal UTF-8
103-- -- non-continuation byte and the whole split quickly must fail so
104-- -- as the input is passed to TE.decodeUtf8, which will issue a
105-- -- suitable error.
106-- required x0
107-- | x0 .&. 0x80 == 0x00 = 0
108-- | x0 .&. 0xE0 == 0xC0 = 1
109-- | x0 .&. 0xF0 == 0xE0 = 2
110-- | x0 .&. 0xF8 == 0xF0 = 3
111-- | otherwise = -1
112--
113-- splitQuickly bytes
114-- | B.null l || req == -1 = Nothing
115-- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty)
116-- | otherwise = Just (TE.decodeUtf8 l', r')
117-- where
118-- (l, r) = B.spanEnd isContinuation bytes
119-- req = required (B.last l)
120-- l' = B.init l
121-- r' = B.cons (B.last l) r
122
123-- |
124-- Since 0.3.0
125utf16_le :: Codec
126utf16_le = Codec name enc (toDecoding dec) where
127 name = T.pack "UTF-16-LE"
128 enc text = (TE.encodeUtf16LE text, Nothing)
129 dec bytes = case splitQuickly bytes of
130 Just (text, extra) -> (text, Right extra)
131 Nothing -> splitSlowly TE.decodeUtf16LE bytes
132
133 splitQuickly bytes = maybeDecode (loop 0) where
134 maxN = B.length bytes
135
136 loop n | n == maxN = decodeAll
137 | (n + 1) == maxN = decodeTo n
138 loop n = let
139 req = utf16Required
140 (B.index bytes n)
141 (B.index bytes (n + 1))
142 decodeMore = loop $! n + req
143 in if n + req > maxN
144 then decodeTo n
145 else decodeMore
146
147 decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes)
148 decodeAll = (TE.decodeUtf16LE bytes, B.empty)
149
150-- |
151-- Since 0.3.0
152utf16_be :: Codec
153utf16_be = Codec name enc (toDecoding dec) where
154 name = T.pack "UTF-16-BE"
155 enc text = (TE.encodeUtf16BE text, Nothing)
156 dec bytes = case splitQuickly bytes of
157 Just (text, extra) -> (text, Right extra)
158 Nothing -> splitSlowly TE.decodeUtf16BE bytes
159
160 splitQuickly bytes = maybeDecode (loop 0) where
161 maxN = B.length bytes
162
163 loop n | n == maxN = decodeAll
164 | (n + 1) == maxN = decodeTo n
165 loop n = let
166 req = utf16Required
167 (B.index bytes (n + 1))
168 (B.index bytes n)
169 decodeMore = loop $! n + req
170 in if n + req > maxN
171 then decodeTo n
172 else decodeMore
173
174 decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes)
175 decodeAll = (TE.decodeUtf16BE bytes, B.empty)
176
177utf16Required :: Word8 -> Word8 -> Int
178utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where
179 x :: Word16
180 x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0
181
182-- |
183-- Since 0.3.0
184utf32_le :: Codec
185utf32_le = Codec name enc (toDecoding dec) where
186 name = T.pack "UTF-32-LE"
187 enc text = (TE.encodeUtf32LE text, Nothing)
188 dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of
189 Just (text, extra) -> (text, Right extra)
190 Nothing -> splitSlowly TE.decodeUtf32LE bs
191
192-- |
193-- Since 0.3.0
194utf32_be :: Codec
195utf32_be = Codec name enc (toDecoding dec) where
196 name = T.pack "UTF-32-BE"
197 enc text = (TE.encodeUtf32BE text, Nothing)
198 dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of
199 Just (text, extra) -> (text, Right extra)
200 Nothing -> splitSlowly TE.decodeUtf32BE bs
201
202utf32SplitBytes :: (ByteString -> Text)
203 -> ByteString
204 -> Maybe (Text, ByteString)
205utf32SplitBytes dec bytes = split where
206 split = maybeDecode (dec toDecode, extra)
207 len = B.length bytes
208 lenExtra = mod len 4
209
210 lenToDecode = len - lenExtra
211 (toDecode, extra) = if lenExtra == 0
212 then (bytes, B.empty)
213 else B.splitAt lenToDecode bytes
214
215-- |
216-- Since 0.3.0
217ascii :: Codec
218ascii = Codec name enc (toDecoding dec) where
219 name = T.pack "ASCII"
220 enc text = (bytes, extra) where
221 (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
222 bytes = B8.pack (T.unpack safe)
223 extra = if T.null unsafe
224 then Nothing
225 else Just (EncodeException ascii (T.head unsafe), unsafe)
226
227 dec bytes = (text, extra) where
228 (safe, unsafe) = B.span (<= 0x7F) bytes
229 text = T.pack (B8.unpack safe)
230 extra = if B.null unsafe
231 then Right B.empty
232 else Left (DecodeException ascii (B.head unsafe), unsafe)
233
234-- |
235-- Since 0.3.0
236iso8859_1 :: Codec
237iso8859_1 = Codec name enc (toDecoding dec) where
238 name = T.pack "ISO-8859-1"
239 enc text = (bytes, extra) where
240 (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
241 bytes = B8.pack (T.unpack safe)
242 extra = if T.null unsafe
243 then Nothing
244 else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
245
246 dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
247
248tryEvaluate :: a -> Either Exc.SomeException a
249tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate
250
251maybeDecode :: (a, b) -> Maybe (a, b)
252maybeDecode (a, b) = case tryEvaluate a of
253 Left _ -> Nothing
254 Right _ -> Just (a, b)
255 43
256-- | A stream oriented decoding result. 44-- | A stream oriented decoding result.
257data Decoding = Some Text ByteString (ByteString -> Decoding) 45data Decoding = Some Text ByteString (ByteString -> Decoding)