1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
13 // These replacements permit compatibility with old numeric entities that
14 // assumed Windows-1252 encoding.
15 // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
16 var replacementTable = [...]rune{
17 '\u20AC', // First entry is what 0x80 should be replaced with.
48 '\u0178', // Last entry is 0x9F.
49 // 0x00->'\uFFFD' is handled programmatically.
50 // 0x0D->'\u000D' is a no-op.
53 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
54 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
55 // Precondition: b[src] == '&' && dst <= src.
56 // attribute should be true if parsing an attribute value.
57 func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
58 // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
60 // i starts at 1 because we already know that s[0] == '&'.
65 return dst + 1, src + 1
69 if len(s) <= 3 { // We need to have at least "&#.".
71 return dst + 1, src + 1
76 if c == 'x' || c == 'X' {
86 if '0' <= c && c <= '9' {
87 x = 16*x + rune(c) - '0'
89 } else if 'a' <= c && c <= 'f' {
90 x = 16*x + rune(c) - 'a' + 10
92 } else if 'A' <= c && c <= 'F' {
93 x = 16*x + rune(c) - 'A' + 10
96 } else if '0' <= c && c <= '9' {
97 x = 10*x + rune(c) - '0'
106 if i <= 3 { // No characters matched.
108 return dst + 1, src + 1
111 if 0x80 <= x && x <= 0x9F {
112 // Replace characters from Windows-1252 with UTF-8 equivalents.
113 x = replacementTable[x-0x80]
114 } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
115 // Replace invalid characters with the replacement character.
119 return dst + utf8.EncodeRune(b[dst:], x), src + i
122 // Consume the maximum number of characters possible, with the
123 // consumed characters matching one of the named references.
128 // Lower-cased characters are more common in entities, so we check for them first.
129 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
138 entityName := string(s[1:i])
139 if entityName == "" {
141 } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
143 } else if x := entity[entityName]; x != 0 {
144 return dst + utf8.EncodeRune(b[dst:], x), src + i
145 } else if x := entity2[entityName]; x[0] != 0 {
146 dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
147 return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
148 } else if !attribute {
149 maxLen := len(entityName) - 1
150 if maxLen > longestEntityWithoutSemicolon {
151 maxLen = longestEntityWithoutSemicolon
153 for j := maxLen; j > 1; j-- {
154 if x := entity[entityName[:j]]; x != 0 {
155 return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
160 dst1, src1 = dst+i, src+i
161 copy(b[dst:dst1], b[src:src1])
165 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
166 // attribute should be true if parsing an attribute value.
167 func unescape(b []byte, attribute bool) []byte {
168 for i, c := range b {
170 dst, src := unescapeEntity(b, i, i, attribute)
174 dst, src = unescapeEntity(b, dst, src, attribute)
177 dst, src = dst+1, src+1
186 // lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
187 func lower(b []byte) []byte {
188 for i, c := range b {
189 if 'A' <= c && c <= 'Z' {
196 const escapedChars = "&'<>\"\r"
198 func escape(w writer, s string) error {
199 i := strings.IndexAny(s, escapedChars)
201 if _, err := w.WriteString(s[:i]); err != nil {
209 // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
216 // "&#34;" is shorter than "&quot;".
221 panic("unrecognized escape character")
224 if _, err := w.WriteString(esc); err != nil {
227 i = strings.IndexAny(s, escapedChars)
229 _, err := w.WriteString(s)
233 // EscapeString escapes special characters like "<" to become "&lt;". It
234 // escapes only five such characters: <, >, &, ' and ".
235 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
237 func EscapeString(s string) string {
238 if strings.IndexAny(s, escapedChars) == -1 {
246 // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
247 // larger range of entities than EscapeString escapes. For example, "&aacute;"
248 // unescapes to "á", as does "&#225;" and "&#xE1;".
249 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
251 func UnescapeString(s string) string {
252 for _, c := range s {
254 return string(unescape([]byte(s), false))