]>
Commit | Line | Data |
---|---|---|
4cbc92cc | 1 | |
2 | {-# LANGUAGE DeriveDataTypeable, RankNTypes #-} | |
3 | -- | | |
4 | -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin | |
5 | -- License: MIT | |
6 | -- | |
7 | -- Handle streams of text. | |
8 | -- | |
9 | -- Parts of this code were taken from enumerator and conduits, and adapted for pipes. | |
10 | ||
11 | module Pipes.Text.Codec | |
12 | ( Decoding(..) | |
13 | , streamDecodeUtf8 | |
14 | , decodeSomeUtf8 | |
15 | , Codec(..) | |
16 | , TextException(..) | |
17 | , utf8 | |
18 | ) where | |
19 | ||
20 | import Data.Bits ((.&.)) | |
21 | import Data.Char (ord) | |
22 | import Data.ByteString as B | |
23 | import Data.ByteString (ByteString) | |
24 | import Data.ByteString.Internal as B | |
25 | import Data.ByteString.Char8 as B8 | |
26 | import Data.Text (Text) | |
27 | import qualified Data.Text as T | |
28 | import qualified Data.Text.Encoding as TE | |
29 | import Data.Text.Encoding.Error () | |
30 | import GHC.Word (Word8, Word32) | |
31 | import qualified Data.Text.Array as A | |
32 | import Data.Word (Word8, Word16) | |
33 | import System.IO.Unsafe (unsafePerformIO) | |
34 | import qualified Control.Exception as Exc | |
35 | import Data.Bits ((.&.), (.|.), shiftL) | |
36 | import Data.Typeable | |
37 | import Control.Arrow (first) | |
38 | import Data.Maybe (catMaybes) | |
39 | import Pipes.Text.Internal | |
40 | ||
41 | -- | A specific character encoding. | |
42 | -- | |
43 | -- Since 0.3.0 | |
44 | data Codec = Codec | |
45 | { codecName :: Text | |
46 | , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) | |
47 | , codecDecode :: ByteString -> Decoding -- (Text, Either (TextException, ByteString) ByteString) | |
48 | } | |
49 | ||
50 | instance Show Codec where | |
51 | showsPrec d c = showParen (d > 10) $ | |
52 | showString "Codec " . shows (codecName c) | |
53 | ||
54 | data TextException = DecodeException Codec Word8 | |
55 | | EncodeException Codec Char | |
56 | | LengthExceeded Int | |
57 | | TextException Exc.SomeException | |
58 | deriving (Show, Typeable) | |
59 | instance Exc.Exception TextException | |
60 | ||
61 | ||
62 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) | |
63 | -> (ByteString -> Decoding) | |
64 | toDecoding op = loop B.empty where | |
65 | loop extra bs0 = case op (B.append extra bs0) of | |
66 | (txt, Right bs) -> Some txt bs (loop bs) | |
67 | (txt, Left (_,bs)) -> Other txt bs | |
68 | ||
69 | ||
70 | splitSlowly :: (ByteString -> Text) | |
71 | -> ByteString | |
72 | -> (Text, Either (TextException, ByteString) ByteString) | |
73 | splitSlowly dec bytes = valid where | |
74 | valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) | |
75 | splits 0 = [(B.empty, bytes)] | |
76 | splits n = B.splitAt n bytes : splits (n - 1) | |
77 | decFirst (a, b) = case tryEvaluate (dec a) of | |
78 | Left _ -> Nothing | |
79 | Right text -> let trouble = case tryEvaluate (dec b) of | |
80 | Left exc -> Left (TextException exc, b) | |
81 | Right _ -> Right B.empty | |
82 | in Just (text, trouble) -- this case shouldn't occur, | |
83 | -- since splitSlowly is only called | |
84 | -- when parsing failed somewhere | |
85 | ||
86 | utf8 :: Codec | |
87 | utf8 = Codec name enc (toDecoding dec) where | |
88 | name = T.pack "UTF-8" | |
89 | enc text = (TE.encodeUtf8 text, Nothing) | |
90 | dec bytes = case decodeSomeUtf8 bytes of | |
91 | (t,b) -> (t, Right b) | |
92 | ||
93 | -- -- Whether the given byte is a continuation byte. | |
94 | -- isContinuation byte = byte .&. 0xC0 == 0x80 | |
95 | -- | |
96 | -- -- The number of continuation bytes needed by the given | |
97 | -- -- non-continuation byte. Returns -1 for an illegal UTF-8 | |
98 | -- -- non-continuation byte and the whole split quickly must fail so | |
99 | -- -- as the input is passed to TE.decodeUtf8, which will issue a | |
100 | -- -- suitable error. | |
101 | -- required x0 | |
102 | -- | x0 .&. 0x80 == 0x00 = 0 | |
103 | -- | x0 .&. 0xE0 == 0xC0 = 1 | |
104 | -- | x0 .&. 0xF0 == 0xE0 = 2 | |
105 | -- | x0 .&. 0xF8 == 0xF0 = 3 | |
106 | -- | otherwise = -1 | |
107 | -- | |
108 | -- splitQuickly bytes | |
109 | -- | B.null l || req == -1 = Nothing | |
110 | -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) | |
111 | -- | otherwise = Just (TE.decodeUtf8 l', r') | |
112 | -- where | |
113 | -- (l, r) = B.spanEnd isContinuation bytes | |
114 | -- req = required (B.last l) | |
115 | -- l' = B.init l | |
116 | -- r' = B.cons (B.last l) r | |
117 | ||
118 | ||
119 | utf16_le :: Codec | |
120 | utf16_le = Codec name enc (toDecoding dec) where | |
121 | name = T.pack "UTF-16-LE" | |
122 | enc text = (TE.encodeUtf16LE text, Nothing) | |
123 | dec bytes = case splitQuickly bytes of | |
124 | Just (text, extra) -> (text, Right extra) | |
125 | Nothing -> splitSlowly TE.decodeUtf16LE bytes | |
126 | ||
127 | splitQuickly bytes = maybeDecode (loop 0) where | |
128 | maxN = B.length bytes | |
129 | ||
130 | loop n | n == maxN = decodeAll | |
131 | | (n + 1) == maxN = decodeTo n | |
132 | loop n = let | |
133 | req = utf16Required | |
134 | (B.index bytes n) | |
135 | (B.index bytes (n + 1)) | |
136 | decodeMore = loop $! n + req | |
137 | in if n + req > maxN | |
138 | then decodeTo n | |
139 | else decodeMore | |
140 | ||
141 | decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) | |
142 | decodeAll = (TE.decodeUtf16LE bytes, B.empty) | |
143 | ||
144 | utf16_be :: Codec | |
145 | utf16_be = Codec name enc (toDecoding dec) where | |
146 | name = T.pack "UTF-16-BE" | |
147 | enc text = (TE.encodeUtf16BE text, Nothing) | |
148 | dec bytes = case splitQuickly bytes of | |
149 | Just (text, extra) -> (text, Right extra) | |
150 | Nothing -> splitSlowly TE.decodeUtf16BE bytes | |
151 | ||
152 | splitQuickly bytes = maybeDecode (loop 0) where | |
153 | maxN = B.length bytes | |
154 | ||
155 | loop n | n == maxN = decodeAll | |
156 | | (n + 1) == maxN = decodeTo n | |
157 | loop n = let | |
158 | req = utf16Required | |
159 | (B.index bytes (n + 1)) | |
160 | (B.index bytes n) | |
161 | decodeMore = loop $! n + req | |
162 | in if n + req > maxN | |
163 | then decodeTo n | |
164 | else decodeMore | |
165 | ||
166 | decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) | |
167 | decodeAll = (TE.decodeUtf16BE bytes, B.empty) | |
168 | ||
169 | utf16Required :: Word8 -> Word8 -> Int | |
170 | utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where | |
171 | x :: Word16 | |
172 | x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 | |
173 | ||
174 | ||
175 | utf32_le :: Codec | |
176 | utf32_le = Codec name enc (toDecoding dec) where | |
177 | name = T.pack "UTF-32-LE" | |
178 | enc text = (TE.encodeUtf32LE text, Nothing) | |
179 | dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of | |
180 | Just (text, extra) -> (text, Right extra) | |
181 | Nothing -> splitSlowly TE.decodeUtf32LE bs | |
182 | ||
183 | ||
184 | utf32_be :: Codec | |
185 | utf32_be = Codec name enc (toDecoding dec) where | |
186 | name = T.pack "UTF-32-BE" | |
187 | enc text = (TE.encodeUtf32BE text, Nothing) | |
188 | dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of | |
189 | Just (text, extra) -> (text, Right extra) | |
190 | Nothing -> splitSlowly TE.decodeUtf32BE bs | |
191 | ||
192 | utf32SplitBytes :: (ByteString -> Text) | |
193 | -> ByteString | |
194 | -> Maybe (Text, ByteString) | |
195 | utf32SplitBytes dec bytes = split where | |
196 | split = maybeDecode (dec toDecode, extra) | |
197 | len = B.length bytes | |
198 | lenExtra = mod len 4 | |
199 | ||
200 | lenToDecode = len - lenExtra | |
201 | (toDecode, extra) = if lenExtra == 0 | |
202 | then (bytes, B.empty) | |
203 | else B.splitAt lenToDecode bytes | |
204 | ||
205 | ascii :: Codec | |
206 | ascii = Codec name enc (toDecoding dec) where | |
207 | name = T.pack "ASCII" | |
208 | enc text = (bytes, extra) where | |
209 | (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text | |
210 | bytes = B8.pack (T.unpack safe) | |
211 | extra = if T.null unsafe | |
212 | then Nothing | |
213 | else Just (EncodeException ascii (T.head unsafe), unsafe) | |
214 | ||
215 | dec bytes = (text, extra) where | |
216 | (safe, unsafe) = B.span (<= 0x7F) bytes | |
217 | text = T.pack (B8.unpack safe) | |
218 | extra = if B.null unsafe | |
219 | then Right B.empty | |
220 | else Left (DecodeException ascii (B.head unsafe), unsafe) | |
221 | ||
222 | iso8859_1 :: Codec | |
223 | iso8859_1 = Codec name enc (toDecoding dec) where | |
224 | name = T.pack "ISO-8859-1" | |
225 | enc text = (bytes, extra) where | |
226 | (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text | |
227 | bytes = B8.pack (T.unpack safe) | |
228 | extra = if T.null unsafe | |
229 | then Nothing | |
230 | else Just (EncodeException iso8859_1 (T.head unsafe), unsafe) | |
231 | ||
232 | dec bytes = (T.pack (B8.unpack bytes), Right B.empty) | |
233 | ||
234 | tryEvaluate :: a -> Either Exc.SomeException a | |
235 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate | |
236 | ||
237 | maybeDecode :: (a, b) -> Maybe (a, b) | |
238 | maybeDecode (a, b) = case tryEvaluate a of | |
239 | Left _ -> Nothing | |
240 | Right _ -> Just (a, b) |