]>
Commit | Line | Data |
---|---|---|
d4949327 NL |
1 | <?php\r |
2 | \r | |
/**
 * Validates a font family list according to CSS spec
 *
 * The input is treated as a comma-separated list of font names. Each name
 * is checked on its own; names that fail are dropped, and the surviving
 * names are re-emitted as a normalized, comma-separated list.
 */
class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
{

    /**
     * Set of bytes allowed inside a font name that is not purely
     * alphanumeric: underscore, dash, space, ASCII letters, ASCII digits
     * and every byte in the range 0x80-0xFF. Used as a strspn() mask, so
     * each allowed byte should appear exactly once.
     *
     * @var string
     */
    protected $mask = null;

    /**
     * Builds the strspn() mask of bytes permitted in a font name.
     */
    public function __construct()
    {
        // Build the ASCII part with range() rather than with loops of the
        // form `for ($c = 'a'; $c <= 'z'; $c++)`. PHP's string increment
        // turns 'z' into 'aa', and 'aa' <= 'z' still holds, so such a loop
        // only stops after 'yz' and would stuff over a thousand redundant
        // bytes into the mask. The set of allowed bytes is the same either
        // way; the mask is just kept minimal here.
        $mask = '_- '
            . implode('', range('a', 'z'))
            . implode('', range('A', 'Z'))
            . implode('', range('0', '9'));

        // special bytes used by UTF-8
        // We don't bother excluding invalid bytes in this range,
        // because our restriction of well-formed UTF-8 will
        // prevent these from ever occurring.
        $mask .= implode('', array_map('chr', range(0x80, 0xFF)));

        $this->mask = $mask;

        /*
            PHP's internal strcspn implementation is
            O(length of string * length of mask), making it inefficient
            for large masks.  However, it's still faster than
            preg_match 8)
              for (p = s1;;) {
                spanp = s2;
                do {
                  if (*spanp == c || p == s1_end) {
                    return p - s1;
                  }
                } while (spanp++ < (s2_end - 1));
                c = *++p;
              }
         */
        // possible optimization: invert the mask.
    }

    /**
     * Filters a CSS font-family value down to the font names considered
     * safe, and returns them as a normalized list.
     *
     * The 'CSS.AllowedFonts' configuration value is consulted: when it is
     * null every well-formed name is eligible, otherwise a name must be
     * present as a key of that value.
     *
     * @param string $string Raw font-family value (comma-separated names)
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string Cleaned list joined with ', ', or false when no
     *                     font name survives
     */
    public function validate($string, $config, $context)
    {
        static $generic_names = array(
            'serif' => true,
            'sans-serif' => true,
            'monospace' => true,
            'fantasy' => true,
            'cursive' => true
        );
        $allowed_fonts = $config->get('CSS.AllowedFonts');

        // assume that no font names contain commas in them
        $fonts = explode(',', $string);
        $final = '';
        foreach ($fonts as $font) {
            $font = trim($font);
            if ($font === '') {
                continue;
            }
            // match a generic name; these are emitted unquoted
            if (isset($generic_names[$font])) {
                if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
                    $final .= $font . ', ';
                }
                continue;
            }
            // match a quoted name: strip the quotes, rejecting empty
            // quoted strings and names whose closing quote does not match
            // the opening one
            if ($font[0] === '"' || $font[0] === "'") {
                $length = strlen($font);
                if ($length <= 2) {
                    continue;
                }
                $quote = $font[0];
                if ($font[$length - 1] !== $quote) {
                    continue;
                }
                $font = substr($font, 1, $length - 2);
            }

            // expandCSSEscape() is not defined in this class (presumably
            // inherited from HTMLPurifier_AttrDef); it is expected to
            // resolve CSS backslash escapes.
            $font = $this->expandCSSEscape($font);

            // $font is a pure representation of the font name

            if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
                continue;
            }

            if (ctype_alnum($font) && $font !== '') {
                // very simple font, allow it in unharmed
                $final .= $font . ', ';
                continue;
            }

            // bugger out on whitespace.  form feed (0C) really
            // shouldn't show up regardless
            $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);

            // Here, there are various classes of characters which need
            // to be treated differently:
            //  - Alphanumeric characters are essentially safe.  We
            //    handled these above.
            //  - Spaces require quoting, though most parsers will do
            //    the right thing if there aren't any characters that
            //    can be misinterpreted
            //  - Dashes rarely occur, but they are fairly unproblematic
            //    for parsing/rendering purposes.
            //  The above characters cover the majority of Western font
            //  names.
            //  - Arbitrary Unicode characters not in ASCII.  Because
            //    most parsers give little thought to Unicode, treatment
            //    of these codepoints is basically uniform, even for
            //    punctuation-like codepoints.  These characters can
            //    show up in non-Western pages and are supported by most
            //    major browsers, for example: "MS 明朝" is a
            //    legitimate font-name
            //    <http://ja.wikipedia.org/wiki/MS_明朝>.  See
            //    the CSS3 spec for more examples:
            //    <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
            //    You can see live samples of these on the Internet:
            //    <http://www.google.co.jp/search?q=font-family+MS+明朝|ゴシック>
            //    However, most of these fonts have ASCII equivalents:
            //    for example, 'MS Mincho', and it's considered
            //    professional to use ASCII font names instead of
            //    Unicode font names.  Thanks Takeshi Terada for
            //    providing this information.
            //  The following characters, to my knowledge, have not been
            //  used to name font names.
            //  - Single quote.  While theoretically you might find a
            //    font name that has a single quote in its name (serving
            //    as an apostrophe, e.g. Dave's Scribble), I haven't
            //    been able to find any actual examples of this.
            //    Internet Explorer's cssText translation (which I
            //    believe is invoked by innerHTML) normalizes any
            //    quoting to single quotes, and fails to escape single
            //    quotes.  (Note that this is not IE's behavior for all
            //    CSS properties, just some sort of special casing for
            //    font-family).  So a single quote *cannot* be used
            //    safely in the font-family context if there will be an
            //    innerHTML/cssText translation.  Note that Firefox 3.x
            //    does this too.
            //  - Double quote.  In IE, these get normalized to
            //    single-quotes, no matter what the encoding.  (Fun
            //    fact, in IE8, the 'content' CSS property gained
            //    support, where they special cased to preserve encoded
            //    double quotes, but still translate unadorned double
            //    quotes into single quotes.)  So, because their
            //    fixpoint behavior is identical to single quotes, they
            //    cannot be allowed either.  Firefox 3.x displays
            //    single-quote style behavior.
            //  - Backslashes are reduced by one (so \\ -> \) every
            //    iteration, so they cannot be used safely.  This shows
            //    up in IE7, IE8 and FF3
            //  - Semicolons, commas and backticks are handled properly.
            //  - The rest of the ASCII punctuation is handled properly.
            // We haven't checked what browsers do to unadorned
            // versions, but this is not important as long as the
            // browser doesn't /remove/ surrounding quotes (as IE does
            // for HTML).
            //
            // With these results in hand, we conclude that there are
            // various levels of safety:
            //  - Paranoid: alphanumeric, spaces and dashes(?)
            //  - International: Paranoid + non-ASCII Unicode
            //  - Edgy: Everything except quotes, backslashes
            //  - NoJS: Standards compliance, e.g. sod IE. Note that
            //    with some judicious character escaping (since certain
            //    types of escaping don't work) this is theoretically
            //    OK as long as innerHTML/cssText is not called.
            // We believe that international is a reasonable default
            // (that we will implement now), and once we do more
            // extensive research, we may feel comfortable with dropping
            // it down to edgy.

            // International (plus underscores): alphanumeric, spaces,
            // dashes, underscores and non-ASCII Unicode.  Use of
            // str(c)spn assumes that the string was already well formed
            // Unicode (which of course it is).
            if (strspn($font, $this->mask) !== strlen($font)) {
                continue;
            }

            // Historical:
            // In the absence of innerHTML/cssText, these ugly
            // transforms don't pose a security risk (as \\ and \"
            // might--these escapes are not supported by most browsers).
            // We could try to be clever and use single-quote wrapping
            // when there is a double quote present, but I have chosen
            // not to implement that.  (NOTE: you can reduce the amount
            // of escapes by one depending on what quoting style you use)
            // $font = str_replace('\\', '\\5C ', $font);
            // $font = str_replace('"',  '\\22 ', $font);
            // $font = str_replace("'",  '\\27 ', $font);

            // font possibly with spaces, requires quoting
            $final .= "'$font', ";
        }
        // drop the trailing ', ' separator left by the last entry
        $final = rtrim($final, ', ');
        if ($final === '') {
            return false;
        }
        return $final;
    }

}
218 | \r | |
219 | // vim: et sw=4 sts=4\r |