]>
Commit | Line | Data |
---|---|---|
409759e8 | 1 | |
2 | {-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-} | |
3 | -- | | |
4 | -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin | |
5 | -- License: MIT | |
4da91c37 | 6 | -- This Parts of this code were taken from enumerator and conduits, and adapted for pipes |
b23136b1 | 7 | |
8 | -- This module follows the model of the enumerator and conduits libraries, and defines | |
9 | -- 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and | |
10 | -- iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so | |
11 | -- special functions appear in @Pipes.Text@ | |
12 | ||
409759e8 | 13 | |
14 | module Pipes.Text.Internal.Codec | |
b23136b1 | 15 | ( Codec(..) |
409759e8 | 16 | , TextException(..) |
17 | , utf8 | |
18 | , utf16_le | |
19 | , utf16_be | |
20 | , utf32_le | |
21 | , utf32_be | |
22 | ) where | |
23 | ||
24 | import Data.Bits ((.&.)) | |
25 | import Data.Char (ord) | |
26 | import Data.ByteString as B | |
27 | import Data.ByteString (ByteString) | |
28 | import Data.ByteString.Internal as B | |
29 | import Data.ByteString.Char8 as B8 | |
30 | import Data.Text (Text) | |
31 | import qualified Data.Text as T | |
32 | import qualified Data.Text.Encoding as TE | |
33 | import Data.Text.Encoding.Error () | |
34 | import GHC.Word (Word8, Word32) | |
35 | import qualified Data.Text.Array as A | |
36 | import Data.Word (Word8, Word16) | |
37 | import System.IO.Unsafe (unsafePerformIO) | |
38 | import qualified Control.Exception as Exc | |
39 | import Data.Bits ((.&.), (.|.), shiftL) | |
40 | import Data.Typeable | |
41 | import Control.Arrow (first) | |
42 | import Data.Maybe (catMaybes) | |
43 | import Pipes.Text.Internal.Decoding | |
44 | import Pipes | |
45 | -- | A specific character encoding. | |
4da91c37 | 46 | |
409759e8 | 47 | data Codec = Codec |
48 | { codecName :: Text | |
49 | , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) | |
4da91c37 | 50 | , codecDecode :: ByteString -> Decoding |
409759e8 | 51 | } |
52 | ||
53 | instance Show Codec where | |
54 | showsPrec d c = showParen (d > 10) $ | |
55 | showString "Codec " . shows (codecName c) | |
56 | ||
57 | data TextException = DecodeException Codec Word8 | |
58 | | EncodeException Codec Char | |
59 | | LengthExceeded Int | |
60 | | TextException Exc.SomeException | |
61 | deriving (Show, Typeable) | |
62 | instance Exc.Exception TextException | |
63 | ||
64 | ||
65 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) | |
66 | -> (ByteString -> Decoding) | |
67 | toDecoding op = loop B.empty where | |
68 | loop !extra bs0 = case op (B.append extra bs0) of | |
69 | (txt, Right bs) -> Some txt bs (loop bs) | |
70 | (txt, Left (_,bs)) -> Other txt bs | |
71 | -- To do: toDecoding should be inlined in each of the 'Codec' definitions | |
72 | -- or else Codec changed to the conduit/enumerator definition. We have | |
73 | -- altered it to use 'streamDecodeUtf8' | |
74 | ||
75 | splitSlowly :: (ByteString -> Text) | |
76 | -> ByteString | |
77 | -> (Text, Either (TextException, ByteString) ByteString) | |
78 | splitSlowly dec bytes = valid where | |
79 | valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) | |
80 | splits 0 = [(B.empty, bytes)] | |
81 | splits n = B.splitAt n bytes : splits (n - 1) | |
82 | decFirst (a, b) = case tryEvaluate (dec a) of | |
83 | Left _ -> Nothing | |
84 | Right text -> let trouble = case tryEvaluate (dec b) of | |
85 | Left exc -> Left (TextException exc, b) | |
86 | Right _ -> Right B.empty | |
87 | in Just (text, trouble) -- this case shouldn't occur, | |
88 | -- since splitSlowly is only called | |
89 | -- when parsing failed somewhere | |
90 | ||
91 | utf8 :: Codec | |
92 | utf8 = Codec name enc (toDecoding dec) where | |
93 | name = T.pack "UTF-8" | |
94 | enc text = (TE.encodeUtf8 text, Nothing) | |
95 | dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b) | |
96 | ||
97 | -- -- Whether the given byte is a continuation byte. | |
98 | -- isContinuation byte = byte .&. 0xC0 == 0x80 | |
99 | -- | |
100 | -- -- The number of continuation bytes needed by the given | |
101 | -- -- non-continuation byte. Returns -1 for an illegal UTF-8 | |
102 | -- -- non-continuation byte and the whole split quickly must fail so | |
103 | -- -- as the input is passed to TE.decodeUtf8, which will issue a | |
104 | -- -- suitable error. | |
105 | -- required x0 | |
106 | -- | x0 .&. 0x80 == 0x00 = 0 | |
107 | -- | x0 .&. 0xE0 == 0xC0 = 1 | |
108 | -- | x0 .&. 0xF0 == 0xE0 = 2 | |
109 | -- | x0 .&. 0xF8 == 0xF0 = 3 | |
110 | -- | otherwise = -1 | |
111 | -- | |
112 | -- splitQuickly bytes | |
113 | -- | B.null l || req == -1 = Nothing | |
114 | -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) | |
115 | -- | otherwise = Just (TE.decodeUtf8 l', r') | |
116 | -- where | |
117 | -- (l, r) = B.spanEnd isContinuation bytes | |
118 | -- req = required (B.last l) | |
119 | -- l' = B.init l | |
120 | -- r' = B.cons (B.last l) r | |
121 | ||
122 | ||
123 | utf16_le :: Codec | |
124 | utf16_le = Codec name enc (toDecoding dec) where | |
125 | name = T.pack "UTF-16-LE" | |
126 | enc text = (TE.encodeUtf16LE text, Nothing) | |
127 | dec bytes = case splitQuickly bytes of | |
128 | Just (text, extra) -> (text, Right extra) | |
129 | Nothing -> splitSlowly TE.decodeUtf16LE bytes | |
130 | ||
131 | splitQuickly bytes = maybeDecode (loop 0) where | |
132 | maxN = B.length bytes | |
133 | ||
134 | loop n | n == maxN = decodeAll | |
135 | | (n + 1) == maxN = decodeTo n | |
136 | loop n = let | |
137 | req = utf16Required | |
138 | (B.index bytes n) | |
139 | (B.index bytes (n + 1)) | |
140 | decodeMore = loop $! n + req | |
141 | in if n + req > maxN | |
142 | then decodeTo n | |
143 | else decodeMore | |
144 | ||
145 | decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) | |
146 | decodeAll = (TE.decodeUtf16LE bytes, B.empty) | |
147 | ||
148 | utf16_be :: Codec | |
149 | utf16_be = Codec name enc (toDecoding dec) where | |
150 | name = T.pack "UTF-16-BE" | |
151 | enc text = (TE.encodeUtf16BE text, Nothing) | |
152 | dec bytes = case splitQuickly bytes of | |
153 | Just (text, extra) -> (text, Right extra) | |
154 | Nothing -> splitSlowly TE.decodeUtf16BE bytes | |
155 | ||
156 | splitQuickly bytes = maybeDecode (loop 0) where | |
157 | maxN = B.length bytes | |
158 | ||
159 | loop n | n == maxN = decodeAll | |
160 | | (n + 1) == maxN = decodeTo n | |
161 | loop n = let | |
162 | req = utf16Required | |
163 | (B.index bytes (n + 1)) | |
164 | (B.index bytes n) | |
165 | decodeMore = loop $! n + req | |
166 | in if n + req > maxN | |
167 | then decodeTo n | |
168 | else decodeMore | |
169 | ||
170 | decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) | |
171 | decodeAll = (TE.decodeUtf16BE bytes, B.empty) | |
172 | ||
173 | utf16Required :: Word8 -> Word8 -> Int | |
174 | utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where | |
175 | x :: Word16 | |
176 | x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 | |
177 | ||
178 | ||
179 | utf32_le :: Codec | |
180 | utf32_le = Codec name enc (toDecoding dec) where | |
181 | name = T.pack "UTF-32-LE" | |
182 | enc text = (TE.encodeUtf32LE text, Nothing) | |
183 | dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of | |
184 | Just (text, extra) -> (text, Right extra) | |
185 | Nothing -> splitSlowly TE.decodeUtf32LE bs | |
186 | ||
187 | ||
188 | utf32_be :: Codec | |
189 | utf32_be = Codec name enc (toDecoding dec) where | |
190 | name = T.pack "UTF-32-BE" | |
191 | enc text = (TE.encodeUtf32BE text, Nothing) | |
192 | dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of | |
193 | Just (text, extra) -> (text, Right extra) | |
194 | Nothing -> splitSlowly TE.decodeUtf32BE bs | |
195 | ||
196 | utf32SplitBytes :: (ByteString -> Text) | |
197 | -> ByteString | |
198 | -> Maybe (Text, ByteString) | |
199 | utf32SplitBytes dec bytes = split where | |
200 | split = maybeDecode (dec toDecode, extra) | |
201 | len = B.length bytes | |
202 | lenExtra = mod len 4 | |
203 | ||
204 | lenToDecode = len - lenExtra | |
205 | (toDecode, extra) = if lenExtra == 0 | |
206 | then (bytes, B.empty) | |
207 | else B.splitAt lenToDecode bytes | |
208 | ||
209 | ||
210 | tryEvaluate :: a -> Either Exc.SomeException a | |
211 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate | |
212 | ||
213 | maybeDecode :: (a, b) -> Maybe (a, b) | |
214 | maybeDecode (a, b) = case tryEvaluate a of | |
215 | Left _ -> Nothing | |
216 | Right _ -> Just (a, b) |