]>
Commit | Line | Data |
---|---|---|
c680a8e1 RS |
1 | // Copyright 2010 The Go Authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style | |
3 | // license that can be found in the LICENSE file. | |
4 | ||
5 | package html | |
6 | ||
7 | import ( | |
8 | "bytes" | |
9 | "strings" | |
10 | "unicode/utf8" | |
11 | ) | |
12 | ||
// replacementTable maps the numeric character references 0x80 through 0x9F,
// which old documents wrote assuming Windows-1252 encoding, to the code
// points they should be replaced with. Index it with (value - 0x80).
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
//
// Two other special cases are not part of the table:
// 0x00->'\uFFFD' is handled programmatically.
// 0x0D->'\u000D' is a no-op.
var replacementTable = [...]rune{
	// 0x80 - 0x87
	'\u20AC', '\u0081', '\u201A', '\u0192', '\u201E', '\u2026', '\u2020', '\u2021',
	// 0x88 - 0x8F
	'\u02C6', '\u2030', '\u0160', '\u2039', '\u0152', '\u008D', '\u017D', '\u008F',
	// 0x90 - 0x97
	'\u0090', '\u2018', '\u2019', '\u201C', '\u201D', '\u2022', '\u2013', '\u2014',
	// 0x98 - 0x9F
	'\u02DC', '\u2122', '\u0161', '\u203A', '\u0153', '\u009D', '\u017E', '\u0178',
}
52 | ||
53 | // unescapeEntity reads an entity like "<" from b[src:] and writes the | |
54 | // corresponding "<" to b[dst:], returning the incremented dst and src cursors. | |
55 | // Precondition: b[src] == '&' && dst <= src. | |
56 | // attribute should be true if parsing an attribute value. | |
57 | func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { | |
58 | // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference | |
59 | ||
60 | // i starts at 1 because we already know that s[0] == '&'. | |
61 | i, s := 1, b[src:] | |
62 | ||
63 | if len(s) <= 1 { | |
64 | b[dst] = b[src] | |
65 | return dst + 1, src + 1 | |
66 | } | |
67 | ||
68 | if s[i] == '#' { | |
69 | if len(s) <= 3 { // We need to have at least "&#.". | |
70 | b[dst] = b[src] | |
71 | return dst + 1, src + 1 | |
72 | } | |
73 | i++ | |
74 | c := s[i] | |
75 | hex := false | |
76 | if c == 'x' || c == 'X' { | |
77 | hex = true | |
78 | i++ | |
79 | } | |
80 | ||
81 | x := '\x00' | |
82 | for i < len(s) { | |
83 | c = s[i] | |
84 | i++ | |
85 | if hex { | |
86 | if '0' <= c && c <= '9' { | |
87 | x = 16*x + rune(c) - '0' | |
88 | continue | |
89 | } else if 'a' <= c && c <= 'f' { | |
90 | x = 16*x + rune(c) - 'a' + 10 | |
91 | continue | |
92 | } else if 'A' <= c && c <= 'F' { | |
93 | x = 16*x + rune(c) - 'A' + 10 | |
94 | continue | |
95 | } | |
96 | } else if '0' <= c && c <= '9' { | |
97 | x = 10*x + rune(c) - '0' | |
98 | continue | |
99 | } | |
100 | if c != ';' { | |
101 | i-- | |
102 | } | |
103 | break | |
104 | } | |
105 | ||
106 | if i <= 3 { // No characters matched. | |
107 | b[dst] = b[src] | |
108 | return dst + 1, src + 1 | |
109 | } | |
110 | ||
111 | if 0x80 <= x && x <= 0x9F { | |
112 | // Replace characters from Windows-1252 with UTF-8 equivalents. | |
113 | x = replacementTable[x-0x80] | |
114 | } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { | |
115 | // Replace invalid characters with the replacement character. | |
116 | x = '\uFFFD' | |
117 | } | |
118 | ||
119 | return dst + utf8.EncodeRune(b[dst:], x), src + i | |
120 | } | |
121 | ||
122 | // Consume the maximum number of characters possible, with the | |
123 | // consumed characters matching one of the named references. | |
124 | ||
125 | for i < len(s) { | |
126 | c := s[i] | |
127 | i++ | |
128 | // Lower-cased characters are more common in entities, so we check for them first. | |
129 | if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' { | |
130 | continue | |
131 | } | |
132 | if c != ';' { | |
133 | i-- | |
134 | } | |
135 | break | |
136 | } | |
137 | ||
138 | entityName := string(s[1:i]) | |
139 | if entityName == "" { | |
140 | // No-op. | |
141 | } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' { | |
142 | // No-op. | |
143 | } else if x := entity[entityName]; x != 0 { | |
144 | return dst + utf8.EncodeRune(b[dst:], x), src + i | |
145 | } else if x := entity2[entityName]; x[0] != 0 { | |
146 | dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) | |
147 | return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i | |
148 | } else if !attribute { | |
149 | maxLen := len(entityName) - 1 | |
150 | if maxLen > longestEntityWithoutSemicolon { | |
151 | maxLen = longestEntityWithoutSemicolon | |
152 | } | |
153 | for j := maxLen; j > 1; j-- { | |
154 | if x := entity[entityName[:j]]; x != 0 { | |
155 | return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 | |
156 | } | |
157 | } | |
158 | } | |
159 | ||
160 | dst1, src1 = dst+i, src+i | |
161 | copy(b[dst:dst1], b[src:src1]) | |
162 | return dst1, src1 | |
163 | } | |
164 | ||
165 | // unescape unescapes b's entities in-place, so that "a<b" becomes "a<b". | |
166 | // attribute should be true if parsing an attribute value. | |
167 | func unescape(b []byte, attribute bool) []byte { | |
168 | for i, c := range b { | |
169 | if c == '&' { | |
170 | dst, src := unescapeEntity(b, i, i, attribute) | |
171 | for src < len(b) { | |
172 | c := b[src] | |
173 | if c == '&' { | |
174 | dst, src = unescapeEntity(b, dst, src, attribute) | |
175 | } else { | |
176 | b[dst] = c | |
177 | dst, src = dst+1, src+1 | |
178 | } | |
179 | } | |
180 | return b[0:dst] | |
181 | } | |
182 | } | |
183 | return b | |
184 | } | |
185 | ||
// lower converts the ASCII upper-case bytes A-Z in b to lower case, in place,
// so that "aBc" becomes "abc". All other bytes are left alone. It returns b.
func lower(b []byte) []byte {
	const caseDelta = 'a' - 'A'
	for i := 0; i < len(b); i++ {
		if ch := b[i]; ch >= 'A' && ch <= 'Z' {
			b[i] = ch + caseDelta
		}
	}
	return b
}
195 | ||
196 | const escapedChars = "&'<>\"\r" | |
197 | ||
198 | func escape(w writer, s string) error { | |
199 | i := strings.IndexAny(s, escapedChars) | |
200 | for i != -1 { | |
201 | if _, err := w.WriteString(s[:i]); err != nil { | |
202 | return err | |
203 | } | |
204 | var esc string | |
205 | switch s[i] { | |
206 | case '&': | |
207 | esc = "&" | |
208 | case '\'': | |
209 | // "'" is shorter than "'" and apos was not in HTML until HTML5. | |
210 | esc = "'" | |
211 | case '<': | |
212 | esc = "<" | |
213 | case '>': | |
214 | esc = ">" | |
215 | case '"': | |
216 | // """ is shorter than """. | |
217 | esc = """ | |
218 | case '\r': | |
219 | esc = " " | |
220 | default: | |
221 | panic("unrecognized escape character") | |
222 | } | |
223 | s = s[i+1:] | |
224 | if _, err := w.WriteString(esc); err != nil { | |
225 | return err | |
226 | } | |
227 | i = strings.IndexAny(s, escapedChars) | |
228 | } | |
229 | _, err := w.WriteString(s) | |
230 | return err | |
231 | } | |
232 | ||
233 | // EscapeString escapes special characters like "<" to become "<". It | |
234 | // escapes only five such characters: <, >, &, ' and ". | |
235 | // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't | |
236 | // always true. | |
237 | func EscapeString(s string) string { | |
238 | if strings.IndexAny(s, escapedChars) == -1 { | |
239 | return s | |
240 | } | |
241 | var buf bytes.Buffer | |
242 | escape(&buf, s) | |
243 | return buf.String() | |
244 | } | |
245 | ||
246 | // UnescapeString unescapes entities like "<" to become "<". It unescapes a | |
247 | // larger range of entities than EscapeString escapes. For example, "á" | |
248 | // unescapes to "รก", as does "á" and "&xE1;". | |
249 | // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't | |
250 | // always true. | |
251 | func UnescapeString(s string) string { | |
252 | for _, c := range s { | |
253 | if c == '&' { | |
254 | return string(unescape([]byte(s), false)) | |
255 | } | |
256 | } | |
257 | return s | |
258 | } |