]>
Commit | Line | Data |
---|---|---|
409759e8 | 1 | |
2 | {-# LANGUAGE DeriveDataTypeable, RankNTypes, BangPatterns #-} | |
3 | -- | | |
4 | -- Copyright: 2014 Michael Thompson, 2011 Michael Snoyman, 2010-2011 John Millikin | |
5 | -- License: MIT | |
4da91c37 | 6 | -- This Parts of this code were taken from enumerator and conduits, and adapted for pipes |
7 | {- | This module follows the model of the enumerator and conduits libraries, and defines | |
8 | 'Codec' s for various encodings. Note that we do not export a 'Codec' for ascii and | |
9 | iso8859_1. A 'Lens' in the sense of the pipes library cannot be defined for these, so | |
10 | special functions appear in @Pipes.Text@ | |
11 | -} | |
409759e8 | 12 | |
13 | module Pipes.Text.Internal.Codec | |
14 | ( Decoding(..) | |
15 | , streamDecodeUtf8 | |
16 | , decodeSomeUtf8 | |
17 | , Codec(..) | |
18 | , TextException(..) | |
19 | , utf8 | |
20 | , utf16_le | |
21 | , utf16_be | |
22 | , utf32_le | |
23 | , utf32_be | |
24 | ) where | |
25 | ||
26 | import Data.Bits ((.&.)) | |
27 | import Data.Char (ord) | |
28 | import Data.ByteString as B | |
29 | import Data.ByteString (ByteString) | |
30 | import Data.ByteString.Internal as B | |
31 | import Data.ByteString.Char8 as B8 | |
32 | import Data.Text (Text) | |
33 | import qualified Data.Text as T | |
34 | import qualified Data.Text.Encoding as TE | |
35 | import Data.Text.Encoding.Error () | |
36 | import GHC.Word (Word8, Word32) | |
37 | import qualified Data.Text.Array as A | |
38 | import Data.Word (Word8, Word16) | |
39 | import System.IO.Unsafe (unsafePerformIO) | |
40 | import qualified Control.Exception as Exc | |
41 | import Data.Bits ((.&.), (.|.), shiftL) | |
42 | import Data.Typeable | |
43 | import Control.Arrow (first) | |
44 | import Data.Maybe (catMaybes) | |
45 | import Pipes.Text.Internal.Decoding | |
46 | import Pipes | |
47 | -- | A specific character encoding. | |
4da91c37 | 48 | |
409759e8 | 49 | data Codec = Codec |
50 | { codecName :: Text | |
51 | , codecEncode :: Text -> (ByteString, Maybe (TextException, Text)) | |
4da91c37 | 52 | , codecDecode :: ByteString -> Decoding |
409759e8 | 53 | } |
54 | ||
55 | instance Show Codec where | |
56 | showsPrec d c = showParen (d > 10) $ | |
57 | showString "Codec " . shows (codecName c) | |
58 | ||
59 | data TextException = DecodeException Codec Word8 | |
60 | | EncodeException Codec Char | |
61 | | LengthExceeded Int | |
62 | | TextException Exc.SomeException | |
63 | deriving (Show, Typeable) | |
64 | instance Exc.Exception TextException | |
65 | ||
66 | ||
67 | toDecoding :: (ByteString -> (Text, Either (TextException, ByteString) ByteString)) | |
68 | -> (ByteString -> Decoding) | |
69 | toDecoding op = loop B.empty where | |
70 | loop !extra bs0 = case op (B.append extra bs0) of | |
71 | (txt, Right bs) -> Some txt bs (loop bs) | |
72 | (txt, Left (_,bs)) -> Other txt bs | |
73 | -- To do: toDecoding should be inlined in each of the 'Codec' definitions | |
74 | -- or else Codec changed to the conduit/enumerator definition. We have | |
75 | -- altered it to use 'streamDecodeUtf8' | |
76 | ||
77 | splitSlowly :: (ByteString -> Text) | |
78 | -> ByteString | |
79 | -> (Text, Either (TextException, ByteString) ByteString) | |
80 | splitSlowly dec bytes = valid where | |
81 | valid:_ = catMaybes $ Prelude.map decFirst $ splits (B.length bytes) | |
82 | splits 0 = [(B.empty, bytes)] | |
83 | splits n = B.splitAt n bytes : splits (n - 1) | |
84 | decFirst (a, b) = case tryEvaluate (dec a) of | |
85 | Left _ -> Nothing | |
86 | Right text -> let trouble = case tryEvaluate (dec b) of | |
87 | Left exc -> Left (TextException exc, b) | |
88 | Right _ -> Right B.empty | |
89 | in Just (text, trouble) -- this case shouldn't occur, | |
90 | -- since splitSlowly is only called | |
91 | -- when parsing failed somewhere | |
92 | ||
93 | utf8 :: Codec | |
94 | utf8 = Codec name enc (toDecoding dec) where | |
95 | name = T.pack "UTF-8" | |
96 | enc text = (TE.encodeUtf8 text, Nothing) | |
97 | dec bytes = case decodeSomeUtf8 bytes of (t,b) -> (t, Right b) | |
98 | ||
99 | -- -- Whether the given byte is a continuation byte. | |
100 | -- isContinuation byte = byte .&. 0xC0 == 0x80 | |
101 | -- | |
102 | -- -- The number of continuation bytes needed by the given | |
103 | -- -- non-continuation byte. Returns -1 for an illegal UTF-8 | |
104 | -- -- non-continuation byte and the whole split quickly must fail so | |
105 | -- -- as the input is passed to TE.decodeUtf8, which will issue a | |
106 | -- -- suitable error. | |
107 | -- required x0 | |
108 | -- | x0 .&. 0x80 == 0x00 = 0 | |
109 | -- | x0 .&. 0xE0 == 0xC0 = 1 | |
110 | -- | x0 .&. 0xF0 == 0xE0 = 2 | |
111 | -- | x0 .&. 0xF8 == 0xF0 = 3 | |
112 | -- | otherwise = -1 | |
113 | -- | |
114 | -- splitQuickly bytes | |
115 | -- | B.null l || req == -1 = Nothing | |
116 | -- | req == B.length r = Just (TE.decodeUtf8 bytes, B.empty) | |
117 | -- | otherwise = Just (TE.decodeUtf8 l', r') | |
118 | -- where | |
119 | -- (l, r) = B.spanEnd isContinuation bytes | |
120 | -- req = required (B.last l) | |
121 | -- l' = B.init l | |
122 | -- r' = B.cons (B.last l) r | |
123 | ||
124 | ||
125 | utf16_le :: Codec | |
126 | utf16_le = Codec name enc (toDecoding dec) where | |
127 | name = T.pack "UTF-16-LE" | |
128 | enc text = (TE.encodeUtf16LE text, Nothing) | |
129 | dec bytes = case splitQuickly bytes of | |
130 | Just (text, extra) -> (text, Right extra) | |
131 | Nothing -> splitSlowly TE.decodeUtf16LE bytes | |
132 | ||
133 | splitQuickly bytes = maybeDecode (loop 0) where | |
134 | maxN = B.length bytes | |
135 | ||
136 | loop n | n == maxN = decodeAll | |
137 | | (n + 1) == maxN = decodeTo n | |
138 | loop n = let | |
139 | req = utf16Required | |
140 | (B.index bytes n) | |
141 | (B.index bytes (n + 1)) | |
142 | decodeMore = loop $! n + req | |
143 | in if n + req > maxN | |
144 | then decodeTo n | |
145 | else decodeMore | |
146 | ||
147 | decodeTo n = first TE.decodeUtf16LE (B.splitAt n bytes) | |
148 | decodeAll = (TE.decodeUtf16LE bytes, B.empty) | |
149 | ||
150 | utf16_be :: Codec | |
151 | utf16_be = Codec name enc (toDecoding dec) where | |
152 | name = T.pack "UTF-16-BE" | |
153 | enc text = (TE.encodeUtf16BE text, Nothing) | |
154 | dec bytes = case splitQuickly bytes of | |
155 | Just (text, extra) -> (text, Right extra) | |
156 | Nothing -> splitSlowly TE.decodeUtf16BE bytes | |
157 | ||
158 | splitQuickly bytes = maybeDecode (loop 0) where | |
159 | maxN = B.length bytes | |
160 | ||
161 | loop n | n == maxN = decodeAll | |
162 | | (n + 1) == maxN = decodeTo n | |
163 | loop n = let | |
164 | req = utf16Required | |
165 | (B.index bytes (n + 1)) | |
166 | (B.index bytes n) | |
167 | decodeMore = loop $! n + req | |
168 | in if n + req > maxN | |
169 | then decodeTo n | |
170 | else decodeMore | |
171 | ||
172 | decodeTo n = first TE.decodeUtf16BE (B.splitAt n bytes) | |
173 | decodeAll = (TE.decodeUtf16BE bytes, B.empty) | |
174 | ||
175 | utf16Required :: Word8 -> Word8 -> Int | |
176 | utf16Required x0 x1 = if x >= 0xD800 && x <= 0xDBFF then 4 else 2 where | |
177 | x :: Word16 | |
178 | x = (fromIntegral x1 `shiftL` 8) .|. fromIntegral x0 | |
179 | ||
180 | ||
181 | utf32_le :: Codec | |
182 | utf32_le = Codec name enc (toDecoding dec) where | |
183 | name = T.pack "UTF-32-LE" | |
184 | enc text = (TE.encodeUtf32LE text, Nothing) | |
185 | dec bs = case utf32SplitBytes TE.decodeUtf32LE bs of | |
186 | Just (text, extra) -> (text, Right extra) | |
187 | Nothing -> splitSlowly TE.decodeUtf32LE bs | |
188 | ||
189 | ||
190 | utf32_be :: Codec | |
191 | utf32_be = Codec name enc (toDecoding dec) where | |
192 | name = T.pack "UTF-32-BE" | |
193 | enc text = (TE.encodeUtf32BE text, Nothing) | |
194 | dec bs = case utf32SplitBytes TE.decodeUtf32BE bs of | |
195 | Just (text, extra) -> (text, Right extra) | |
196 | Nothing -> splitSlowly TE.decodeUtf32BE bs | |
197 | ||
198 | utf32SplitBytes :: (ByteString -> Text) | |
199 | -> ByteString | |
200 | -> Maybe (Text, ByteString) | |
201 | utf32SplitBytes dec bytes = split where | |
202 | split = maybeDecode (dec toDecode, extra) | |
203 | len = B.length bytes | |
204 | lenExtra = mod len 4 | |
205 | ||
206 | lenToDecode = len - lenExtra | |
207 | (toDecode, extra) = if lenExtra == 0 | |
208 | then (bytes, B.empty) | |
209 | else B.splitAt lenToDecode bytes | |
210 | ||
211 | ||
212 | tryEvaluate :: a -> Either Exc.SomeException a | |
213 | tryEvaluate = unsafePerformIO . Exc.try . Exc.evaluate | |
214 | ||
215 | maybeDecode :: (a, b) -> Maybe (a, b) | |
216 | maybeDecode (a, b) = case tryEvaluate a of | |
217 | Left _ -> Nothing | |
218 | Right _ -> Just (a, b) |