]>
Commit | Line | Data |
---|---|---|
1 | ||
2 | {-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-} | |
3 | -- | | |
4 | -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin | |
5 | -- License: MIT | |
6 | -- | |
7 | -- Parts of this code were taken from enumerator and conduits, and adapted for pipes. | |
8 | ||
9 | module Pipes.Text.Codec | |
10 | ( Decoding(..) | |
11 | , streamDecodeUtf8 | |
12 | , decodeSomeUtf8 | |
13 | , Codec(..) | |
14 | , TextException(..) | |
15 | , utf8 | |
16 | , utf16_le | |
17 | , utf16_be | |
18 | , utf32_le | |
19 | , utf32_be | |
20 | ) where | |
21 | ||
22 | import Data.Bits ((.&.)) | |
23 | import Data.Char (ord) | |
24 | import Data.ByteString as B | |
25 | import Data.ByteString (ByteString) | |
26 | import Data.ByteString.Internal as B | |
27 | import Data.ByteString.Char8 as B8 | |
28 | import Data.Text (Text) | |
29 | import qualified Data.Text as T | |
30 | import qualified Data.Text.Encoding as TE | |
31 | import Data.Text.Encoding.Error () | |
32 | import GHC.Word (Word8, Word32) | |
33 | import qualified Data.Text.Array as A | |
34 | import Data.Word (Word8, Word16) | |
35 | import System.IO.Unsafe (unsafePerformIO) | |
36 | import qualified Control.Exception as Exc | |
37 | import Data.Bits ((.&.), (.|.), shiftL) | |
38 | import Data.Typeable | |
39 | import Control.Arrow (first) | |
40 | import Data.Maybe (catMaybes) | |
41 | import Pipes.Text.Internal | |
42 | import Pipes | |
43 | -- | A specific character encoding. | |
44 | -- | |
45 | -- Since 0.3.0 | |
46 | data Codec = Codec | |
47 | { codecName :: Text | |
48 | , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) | |
49 | , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString) | |
50 | } | |
51 | ||
52 | instance Show Codec where | |
53 | showsPrec d c = showParen (d > 10) $ | |
54 | showString "Codec " . shows (codecName c) | |
55 | ||
56 | data TextException = DecodeException Codec Word8 | |
57 | | EncodeException Codec Char | |
58 | | LengthExceeded Int | |
59 | | TextException Exc.SomeException | |
60 | deriving (Show, Typeable) | |
61 | instance Exc.Exception TextException | |
62 | ||
63 | ||
64 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) | |
65 | -> (ByteString -> Decoding) | |
66 | toDecoding op = loop B.empty where | |
67 | loop !extra bs0 = case op (B.append extra bs0) of | |
68 | (txt, Right bs) -> Some txt bs (loop bs) | |
69 | (txt, Left (_,bs)) -> Other txt bs | |
70 | -- To do: toDecoding should be inlined in each of the 'Codec' definitions | |
71 | -- or else Codec changed to the conduit/enumerator definition. We have | |
72 | -- altered it to use 'streamDecodeUtf8' | |
73 | ||
74 | splitSlowly :: (ByteString -> Text) | |
75 | -> ByteString | |
76 | -> (Text, Either (TextException, ByteString) ByteString) | |
77 | splitSlowly dec bytes = valid where | |
78 | valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) | |
79 | splits 0 = [(B.empty, bytes)] | |
80 | splits n = B.splitAt n bytes : splits (n - 1) | |
81 | decFirst (a, b) = case tryEvaluate (dec a) of | |
82 | Left _ -> Nothing | |
83 | Right text -> let trouble = case tryEvaluate (dec b) of | |
84 | Left exc -> Left (TextException exc, b) | |
85 | Right _ -> Right B.empty | |
86 | in Just (text, trouble) -- this case shouldn't occur, | |
87 | -- since splitSlowly is only called | |
88 | -- when parsing failed somewhere | |
89 | ||
90 | utf8 :: Codec | |
91 | utf8 = Codec name enc (toDecoding dec) where | |
92 | name = T.pack "UTF-8" | |
93 | enc text = (TE.encodeUtf8 text, Nothing) | |
94 | dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b) | |
95 | ||
96 | -- -- Whether the given byte is a continuation byte. | |
97 | -- isContinuation byte = byte .&. 0xC0 == 0x80 | |
98 | -- | |
99 | -- -- The number of continuation bytes needed by the given | |
100 | -- -- non-continuation byte. Returns -1 for an illegal UTF-8 | |
101 | -- -- non-continuation byte and the whole split quickly must fail so | |
102 | -- -- as the input is passed to TE.decodeUtf8, which will issue a | |
103 | -- -- suitable error. | |
104 | -- required x0 | |
105 | -- | x0 .&. 0x80 == 0x00 = 0 | |
106 | -- | x0 .&. 0xE0 == 0xC0 = 1 | |
107 | -- | x0 .&. 0xF0 == 0xE0 = 2 | |
108 | -- | x0 .&. 0xF8 == 0xF0 = 3 | |
109 | -- | otherwise = -1 | |
110 | -- | |
111 | -- splitQuickly bytes | |
112 | -- | B.null l || req == -1 = Nothing | |
113 | -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) | |
114 | -- | otherwise = Just (TE.decodeUtf8 l', r') | |
115 | -- where | |
116 | -- (l, r) = B.spanEnd isContinuation bytes | |
117 | -- req = required (B.last l) | |
118 | -- l' = B.init l | |
119 | -- r' = B.cons (B.last l) r | |
120 | ||
121 | ||
122 | utf16_le :: Codec | |
123 | utf16_le = Codec name enc (toDecoding dec) where | |
124 | name = T.pack "UTF-16-LE" | |
125 | enc text = (TE.encodeUtf16LE text, Nothing) | |
126 | dec bytes = case splitQuickly bytes of | |
127 | Just (text, extra) -> (text, Right extra) | |
128 | Nothing -> splitSlowly TE.decodeUtf16LE bytes | |
129 | ||
130 | splitQuickly bytes = maybeDecode (loop 0) where | |
131 | maxN = B.length bytes | |
132 | ||
133 | loop n | n == maxN = decodeAll | |
134 | | (n + 1) == maxN = decodeTo n | |
135 | loop n = let | |
136 | req = utf16Required | |
137 | (B.index bytes n) | |
138 | (B.index bytes (n + 1)) | |
139 | decodeMore = loop $! n + req | |
140 | in if n + req > maxN | |
141 | then decodeTo n | |
142 | else decodeMore | |
143 | ||
144 | decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) | |
145 | decodeAll = (TE.decodeUtf16LE bytes, B.empty) | |
146 | ||
147 | utf16_be :: Codec | |
148 | utf16_be = Codec name enc (toDecoding dec) where | |
149 | name = T.pack "UTF-16-BE" | |
150 | enc text = (TE.encodeUtf16BE text, Nothing) | |
151 | dec bytes = case splitQuickly bytes of | |
152 | Just (text, extra) -> (text, Right extra) | |
153 | Nothing -> splitSlowly TE.decodeUtf16BE bytes | |
154 | ||
155 | splitQuickly bytes = maybeDecode (loop 0) where | |
156 | maxN = B.length bytes | |
157 | ||
158 | loop n | n == maxN = decodeAll | |
159 | | (n + 1) == maxN = decodeTo n | |
160 | loop n = let | |
161 | req = utf16Required | |
162 | (B.index bytes (n + 1)) | |
163 | (B.index bytes n) | |
164 | decodeMore = loop $! n + req | |
165 | in if n + req > maxN | |
166 | then decodeTo n | |
167 | else decodeMore | |
168 | ||
169 | decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) | |
170 | decodeAll = (TE.decodeUtf16BE bytes, B.empty) | |
171 | ||
172 | utf16Required :: Word8 -> Word8 -> Int | |
173 | utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where | |
174 | x :: Word16 | |
175 | x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 | |
176 | ||
177 | ||
178 | utf32_le :: Codec | |
179 | utf32_le = Codec name enc (toDecoding dec) where | |
180 | name = T.pack "UTF-32-LE" | |
181 | enc text = (TE.encodeUtf32LE text, Nothing) | |
182 | dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of | |
183 | Just (text, extra) -> (text, Right extra) | |
184 | Nothing -> splitSlowly TE.decodeUtf32LE bs | |
185 | ||
186 | ||
187 | utf32_be :: Codec | |
188 | utf32_be = Codec name enc (toDecoding dec) where | |
189 | name = T.pack "UTF-32-BE" | |
190 | enc text = (TE.encodeUtf32BE text, Nothing) | |
191 | dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of | |
192 | Just (text, extra) -> (text, Right extra) | |
193 | Nothing -> splitSlowly TE.decodeUtf32BE bs | |
194 | ||
195 | utf32SplitBytes :: (ByteString -> Text) | |
196 | -> ByteString | |
197 | -> Maybe (Text, ByteString) | |
198 | utf32SplitBytes dec bytes = split where | |
199 | split = maybeDecode (dec toDecode, extra) | |
200 | len = B.length bytes | |
201 | lenExtra = mod len 4 | |
202 | ||
203 | lenToDecode = len - lenExtra | |
204 | (toDecode, extra) = if lenExtra == 0 | |
205 | then (bytes, B.empty) | |
206 | else B.splitAt lenToDecode bytes | |
207 | ||
208 | ||
209 | tryEvaluate :: a -> Either Exc.SomeException a | |
210 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate | |
211 | ||
212 | maybeDecode :: (a, b) -> Maybe (a, b) | |
213 | maybeDecode (a, b) = case tryEvaluate a of | |
214 | Left _ -> Nothing | |
215 | Right _ -> Just (a, b) |