[github/wallabag/wallabag.git] / inc / 3rdparty / htmlpurifier / HTMLPurifier / AttrDef / CSS / FontFamily.php

<?php\r
\r
/**
 * Validates a font family list according to CSS spec.
 *
 * The comma-separated value is split into individual names; each name is
 * unquoted, has its CSS escapes expanded, is checked against the optional
 * CSS.AllowedFonts configuration and against a conservative character
 * whitelist, and the surviving names are serialized back into a list.
 */
class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
{

    /**
     * Whitelist of bytes allowed in a (non-generic) font name, in the form
     * expected by strspn(): underscore, dash, space, ASCII letters, ASCII
     * digits and every byte from 0x80 to 0xFF.
     * @var string
     */
    protected $mask = null;

    /**
     * Builds the strspn() character mask used by validate().
     */
    public function __construct()
    {
        // Each allowed byte appears exactly once.  Do NOT build the letter
        // ranges with "for ($c = 'a'; $c <= 'z'; $c++)": PHP's string
        // increment turns 'z' into 'aa', which still compares <= 'z', so
        // such a loop keeps going through 'yz' and bloats the mask with
        // well over a thousand redundant letters per range.
        $this->mask = '_- '
            . implode('', range('a', 'z'))
            . implode('', range('A', 'Z'))
            . implode('', range('0', '9'))
            // special bytes used by UTF-8.
            // We don't bother excluding invalid bytes in this range,
            // because our restriction of well-formed UTF-8 will
            // prevent these from ever occurring.
            . implode('', array_map('chr', range(0x80, 0xFF)));

        /*
            PHP's internal strcspn implementation is
            O(length of string * length of mask), making it inefficient
            for large masks.  However, it's still faster than
            preg_match 8)
          for (p = s1;;) {
            spanp = s2;
            do {
              if (*spanp == c || p == s1_end) {
                return p - s1;
              }
            } while (spanp++ < (s2_end - 1));
            c = *++p;
          }
         */
        // possible optimization: invert the mask.
    }

    /**
     * Validates and normalizes a CSS font-family value.
     *
     * @param string $string Comma-separated list of font names
     * @param HTMLPurifier_Config $config Read for the CSS.AllowedFonts directive
     * @param HTMLPurifier_Context $context Not used by this method
     * @return bool|string The cleaned, comma-separated font list, or false
     *         when no font name survives validation
     */
    public function validate($string, $config, $context)
    {
        static $generic_names = array(
            'serif' => true,
            'sans-serif' => true,
            'monospace' => true,
            'fantasy' => true,
            'cursive' => true
        );
        // null means "no restriction"; otherwise a lookup keyed by font name
        $allowed_fonts = $config->get('CSS.AllowedFonts');

        // assume that no font names contain commas in them
        $fonts = explode(',', $string);
        $final = '';
        foreach ($fonts as $font) {
            $font = trim($font);
            if ($font === '') {
                continue;
            }
            // match a generic name; these are emitted unquoted
            if (isset($generic_names[$font])) {
                if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
                    $final .= $font . ', ';
                }
                continue;
            }
            // match a quoted name: it must be non-empty and be closed by
            // the same quote character that opened it
            if ($font[0] === '"' || $font[0] === "'") {
                $length = strlen($font);
                if ($length <= 2) {
                    continue;
                }
                $quote = $font[0];
                if ($font[$length - 1] !== $quote) {
                    continue;
                }
                $font = substr($font, 1, $length - 2);
            }

            // expandCSSEscape() is not defined in this class; presumably it
            // is inherited from HTMLPurifier_AttrDef and decodes CSS
            // backslash escapes.
            $font = $this->expandCSSEscape($font);

            // $font is a pure representation of the font name

            if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
                continue;
            }

            if (ctype_alnum($font) && $font !== '') {
                // very simple font, allow it in unharmed
                $final .= $font . ', ';
                continue;
            }

            // bugger out on whitespace.  form feed (0C) really
            // shouldn't show up regardless
            $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);

            // Here, there are various classes of characters which need
            // to be treated differently:
            //  - Alphanumeric characters are essentially safe.  We
            //    handled these above.
            //  - Spaces require quoting, though most parsers will do
            //    the right thing if there aren't any characters that
            //    can be misinterpreted
            //  - Dashes rarely occur, but they are fairly unproblematic
            //    for parsing/rendering purposes.
            //  The above characters cover the majority of Western font
            //  names.
            //  - Arbitrary Unicode characters not in ASCII.  Because
            //    most parsers give little thought to Unicode, treatment
            //    of these codepoints is basically uniform, even for
            //    punctuation-like codepoints.  These characters can
            //    show up in non-Western pages and are supported by most
            //    major browsers, for example: "ＭＳ 明朝" is a
            //    legitimate font-name
            //    <http://ja.wikipedia.org/wiki/MS_明朝>.  See
            //    the CSS3 spec for more examples:
            //    <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
            //    You can see live samples of these on the Internet:
            //    <http://www.google.co.jp/search?q=font-family+ＭＳ+明朝|ゴシック>
            //    However, most of these fonts have ASCII equivalents:
            //    for example, 'MS Mincho', and it's considered
            //    professional to use ASCII font names instead of
            //    Unicode font names.  Thanks Takeshi Terada for
            //    providing this information.
            //  The following characters, to my knowledge, have not been
            //  used to name font names.
            //  - Single quote.  While theoretically you might find a
            //    font name that has a single quote in its name (serving
            //    as an apostrophe, e.g. Dave's Scribble), I haven't
            //    been able to find any actual examples of this.
            //    Internet Explorer's cssText translation (which I
            //    believe is invoked by innerHTML) normalizes any
            //    quoting to single quotes, and fails to escape single
            //    quotes.  (Note that this is not IE's behavior for all
            //    CSS properties, just some sort of special casing for
            //    font-family).  So a single quote *cannot* be used
            //    safely in the font-family context if there will be an
            //    innerHTML/cssText translation.  Note that Firefox 3.x
            //    does this too.
            //  - Double quote.  In IE, these get normalized to
            //    single-quotes, no matter what the encoding.  (Fun
            //    fact, in IE8, the 'content' CSS property gained
            //    support, where they special cased to preserve encoded
            //    double quotes, but still translate unadorned double
            //    quotes into single quotes.)  So, because their
            //    fixpoint behavior is identical to single quotes, they
            //    cannot be allowed either.  Firefox 3.x displays
            //    single-quote style behavior.
            //  - Backslashes are reduced by one (so \\ -> \) every
            //    iteration, so they cannot be used safely.  This shows
            //    up in IE7, IE8 and FF3
            //  - Semicolons, commas and backticks are handled properly.
            //  - The rest of the ASCII punctuation is handled properly.
            // We haven't checked what browsers do to unadorned
            // versions, but this is not important as long as the
            // browser doesn't /remove/ surrounding quotes (as IE does
            // for HTML).
            //
            // With these results in hand, we conclude that there are
            // various levels of safety:
            //  - Paranoid: alphanumeric, spaces and dashes(?)
            //  - International: Paranoid + non-ASCII Unicode
            //  - Edgy: Everything except quotes, backslashes
            //  - NoJS: Standards compliance, e.g. sod IE. Note that
            //    with some judicious character escaping (since certain
            //    types of escaping don't work) this is theoretically
            //    OK as long as innerHTML/cssText is not called.
            // We believe that international is a reasonable default
            // (that we will implement now), and once we do more
            // extensive research, we may feel comfortable with dropping
            // it down to edgy.

            // International (plus underscores): alphanumeric, spaces,
            // dashes, underscores and Unicode.  Use of
            // str(c)spn assumes that the string was already well formed
            // Unicode (which of course it is).
            if (strspn($font, $this->mask) !== strlen($font)) {
                // at least one byte is outside the whitelist: drop the font
                continue;
            }

            // Historical:
            // In the absence of innerHTML/cssText, these ugly
            // transforms don't pose a security risk (as \\ and \"
            // might--these escapes are not supported by most browsers).
            // We could try to be clever and use single-quote wrapping
            // when there is a double quote present, but I have chosen
            // not to implement that.  (NOTE: you can reduce the amount
            // of escapes by one depending on what quoting style you use)
            // $font = str_replace('\\', '\\5C ', $font);
            // $font = str_replace('"',  '\\22 ', $font);
            // $font = str_replace("'",  '\\27 ', $font);

            // font possibly with spaces, requires quoting
            $final .= "'$font', ";
        }
        // strip the trailing ', ' separator left by the loop
        $final = rtrim($final, ', ');
        if ($final === '') {
            return false;
        }
        return $final;
    }

}
\r
// vim: et sw=4 sts=4\r
Commit	Line	Data
d4949327 NL	1	<?php\r
	2	\r
	3	/**\r
	4	* Validates a font family list according to CSS spec\r
	5	*/\r
	6	class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef\r
	7	{\r
	8	\r
	9	protected $mask = null;\r
	10	\r
	11	public function __construct()\r
	12	{\r
	13	$this->mask = '_- ';\r
	14	for ($c = 'a'; $c <= 'z'; $c++) {\r
	15	$this->mask .= $c;\r
	16	}\r
	17	for ($c = 'A'; $c <= 'Z'; $c++) {\r
	18	$this->mask .= $c;\r
	19	}\r
	20	for ($c = '0'; $c <= '9'; $c++) {\r
	21	$this->mask .= $c;\r
	22	} // cast-y, but should be fine\r
	23	// special bytes used by UTF-8\r
	24	for ($i = 0x80; $i <= 0xFF; $i++) {\r
	25	// We don't bother excluding invalid bytes in this range,\r
	26	// because the our restriction of well-formed UTF-8 will\r
	27	// prevent these from ever occurring.\r
	28	$this->mask .= chr($i);\r
	29	}\r
	30	\r
	31	/*\r
	32	PHP's internal strcspn implementation is\r
	33	O(length of string * length of mask), making it inefficient\r
	34	for large masks. However, it's still faster than\r
	35	preg_match 8)\r
	36	for (p = s1;;) {\r
	37	spanp = s2;\r
	38	do {\r
	39	if (*spanp == c \|\| p == s1_end) {\r
	40	return p - s1;\r
	41	}\r
	42	} while (spanp++ < (s2_end - 1));\r
	43	c = *++p;\r
	44	}\r
	45	*/\r
	46	// possible optimization: invert the mask.\r
	47	}\r
	48	\r
	49	/**\r
	50	* @param string $string\r
	51	* @param HTMLPurifier_Config $config\r
	52	* @param HTMLPurifier_Context $context\r
	53	* @return bool\|string\r
	54	*/\r
	55	public function validate($string, $config, $context)\r
	56	{\r
	57	static $generic_names = array(\r
	58	'serif' => true,\r
	59	'sans-serif' => true,\r
	60	'monospace' => true,\r
	61	'fantasy' => true,\r
	62	'cursive' => true\r
	63	);\r
	64	$allowed_fonts = $config->get('CSS.AllowedFonts');\r
65	\r
66	// assume that no font names contain commas in them\r
67	$fonts = explode(',', $string);\r
68	$final = '';\r
69	foreach ($fonts as $font) {\r
70	$font = trim($font);\r
71	if ($font === '') {\r
72	continue;\r
73	}\r
74	// match a generic name\r
75	if (isset($generic_names[$font])) {\r
76	if ($allowed_fonts === null \|\| isset($allowed_fonts[$font])) {\r
77	$final .= $font . ', ';\r
78	}\r
79	continue;\r
80	}\r
81	// match a quoted name\r
82	if ($font[0] === '"' \|\| $font[0] === "'") {\r
83	$length = strlen($font);\r
84	if ($length <= 2) {\r
85	continue;\r
86	}\r
87	$quote = $font[0];\r
88	if ($font[$length - 1] !== $quote) {\r
89	continue;\r
90	}\r
91	$font = substr($font, 1, $length - 2);\r
92	}\r
93	\r
94	$font = $this->expandCSSEscape($font);\r
95	\r
96	// $font is a pure representation of the font name\r
97	\r
98	if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {\r
99	continue;\r
100	}\r
101	\r
102	if (ctype_alnum($font) && $font !== '') {\r
103	// very simple font, allow it in unharmed\r
104	$final .= $font . ', ';\r
105	continue;\r
106	}\r
107	\r
108	// bugger out on whitespace. form feed (0C) really\r
109	// shouldn't show up regardless\r
110	$font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);\r
111	\r
112	// Here, there are various classes of characters which need\r
113	// to be treated differently:\r
114	// - Alphanumeric characters are essentially safe. We\r
115	// handled these above.\r
116	// - Spaces require quoting, though most parsers will do\r
117	// the right thing if there aren't any characters that\r
118	// can be misinterpreted\r
119	// - Dashes rarely occur, but they fairly unproblematic\r
120	// for parsing/rendering purposes.\r
121	// The above characters cover the majority of Western font\r
122	// names.\r
123	// - Arbitrary Unicode characters not in ASCII. Because\r
124	// most parsers give little thought to Unicode, treatment\r
125	// of these codepoints is basically uniform, even for\r
126	// punctuation-like codepoints. These characters can\r
127	// show up in non-Western pages and are supported by most\r
128	// major browsers, for example: "ＭＳ明朝" is a\r
129	// legitimate font-name\r
130	// <http://ja.wikipedia.org/wiki/MS_明朝>. See\r
131	// the CSS3 spec for more examples:\r
132	// <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>\r
133	// You can see live samples of these on the Internet:\r
134	// <http://www.google.co.jp/search?q=font-family+ＭＳ+明朝\|ゴシック>\r
135	// However, most of these fonts have ASCII equivalents:\r
136	// for example, 'MS Mincho', and it's considered\r
137	// professional to use ASCII font names instead of\r
138	// Unicode font names. Thanks Takeshi Terada for\r
139	// providing this information.\r
140	// The following characters, to my knowledge, have not been\r
141	// used to name font names.\r
142	// - Single quote. While theoretically you might find a\r
143	// font name that has a single quote in its name (serving\r
144	// as an apostrophe, e.g. Dave's Scribble), I haven't\r
145	// been able to find any actual examples of this.\r
146	// Internet Explorer's cssText translation (which I\r
147	// believe is invoked by innerHTML) normalizes any\r
148	// quoting to single quotes, and fails to escape single\r
149	// quotes. (Note that this is not IE's behavior for all\r
150	// CSS properties, just some sort of special casing for\r
151	// font-family). So a single quote cannot be used\r
152	// safely in the font-family context if there will be an\r
153	// innerHTML/cssText translation. Note that Firefox 3.x\r
154	// does this too.\r
155	// - Double quote. In IE, these get normalized to\r
156	// single-quotes, no matter what the encoding. (Fun\r
157	// fact, in IE8, the 'content' CSS property gained\r
158	// support, where they special cased to preserve encoded\r
159	// double quotes, but still translate unadorned double\r
160	// quotes into single quotes.) So, because their\r
161	// fixpoint behavior is identical to single quotes, they\r
162	// cannot be allowed either. Firefox 3.x displays\r
163	// single-quote style behavior.\r
164	// - Backslashes are reduced by one (so \\ -> \) every\r
165	// iteration, so they cannot be used safely. This shows\r
166	// up in IE7, IE8 and FF3\r
167	// - Semicolons, commas and backticks are handled properly.\r
168	// - The rest of the ASCII punctuation is handled properly.\r
169	// We haven't checked what browsers do to unadorned\r
170	// versions, but this is not important as long as the\r
171	// browser doesn't /remove/ surrounding quotes (as IE does\r
172	// for HTML).\r
173	//\r
174	// With these results in hand, we conclude that there are\r
175	// various levels of safety:\r
176	// - Paranoid: alphanumeric, spaces and dashes(?)\r
177	// - International: Paranoid + non-ASCII Unicode\r
178	// - Edgy: Everything except quotes, backslashes\r
179	// - NoJS: Standards compliance, e.g. sod IE. Note that\r
180	// with some judicious character escaping (since certain\r
181	// types of escaping doesn't work) this is theoretically\r
182	// OK as long as innerHTML/cssText is not called.\r
183	// We believe that international is a reasonable default\r
184	// (that we will implement now), and once we do more\r
185	// extensive research, we may feel comfortable with dropping\r
186	// it down to edgy.\r
187	\r
188	// Edgy: alphanumeric, spaces, dashes, underscores and Unicode. Use of\r
189	// str(c)spn assumes that the string was already well formed\r
190	// Unicode (which of course it is).\r
191	if (strspn($font, $this->mask) !== strlen($font)) {\r
192	continue;\r
193	}\r
194	\r
195	// Historical:\r
196	// In the absence of innerHTML/cssText, these ugly\r
197	// transforms don't pose a security risk (as \\ and \"\r
198	// might--these escapes are not supported by most browsers).\r
199	// We could try to be clever and use single-quote wrapping\r
200	// when there is a double quote present, but I have choosen\r
201	// not to implement that. (NOTE: you can reduce the amount\r
202	// of escapes by one depending on what quoting style you use)\r
203	// $font = str_replace('\\', '\\5C ', $font);\r
204	// $font = str_replace('"', '\\22 ', $font);\r
205	// $font = str_replace("'", '\\27 ', $font);\r
206	\r
207	// font possibly with spaces, requires quoting\r
208	$final .= "'$font', ";\r
209	}\r
210	$final = rtrim($final, ', ');\r
211	if ($final === '') {\r
212	return false;\r
213	}\r
214	return $final;\r
215	}\r
216	\r
217	}\r
218	\r
219	// vim: et sw=4 sts=4\r