]>
Commit | Line | Data |
---|---|---|
42c80841 NL |
1 | <?php |
2 | ||
3 | // warning: this file is encoded in UTF-8! | |
4 | ||
class HTML5_Data
{

    /**
     * Numeric character reference dereference table: maps the number
     * written in a numeric character reference to the Unicode codepoint
     * that should actually be produced for it. (The property name is
     * historical; "real codepoint" here means "replacement codepoint".)
     *
     * Values are Unicode codepoints, not UTF-8 bytes. Possible future
     * optimizations: store UTF-8 byte strings directly, or move the
     * table out into a serialized data file.
     */
    protected static $realCodepointTable = array(
        0x00 => 0xFFFD, // REPLACEMENT CHARACTER
        0x0D => 0x000A, // LINE FEED (LF)
        0x80 => 0x20AC, // EURO SIGN ('€')
        0x81 => 0x0081, // <control>
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
        0x86 => 0x2020, // DAGGER ('†')
        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
        0x89 => 0x2030, // PER MILLE SIGN ('‰')
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
        0x8D => 0x008D, // <control>
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
        0x8F => 0x008F, // <control>
        0x90 => 0x0090, // <control>
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
        0x95 => 0x2022, // BULLET ('•')
        0x96 => 0x2013, // EN DASH ('–')
        0x97 => 0x2014, // EM DASH ('—')
        0x98 => 0x02DC, // SMALL TILDE ('˜')
        0x99 => 0x2122, // TRADE MARK SIGN ('™')
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
        0x9D => 0x009D, // <control>
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
    );

    /**
     * Cache for the unserialized named character reference data;
     * filled on first use by getNamedCharacterReferences().
     */
    protected static $namedCharacterReferences;

    protected static $namedCharacterReferenceMaxLength;

    /**
     * Looks up the replacement codepoint for a numeric character
     * reference value.
     *
     * @param mixed $ref Value to look up in the dereference table.
     * @return int|false The mapped Unicode codepoint, or false when the
     *         table has no entry for $ref.
     */
    public static function getRealCodepoint($ref) {
        return isset(self::$realCodepointTable[$ref])
            ? self::$realCodepointTable[$ref]
            : false;
    }

    /**
     * Returns the named character reference data, loading it on demand.
     *
     * The data is read from 'named-character-references.ser' in the same
     * directory as this file and unserialized. The result is kept in a
     * static property; while that property is falsy (including after a
     * failed load) the next call attempts the load again.
     *
     * @return mixed Whatever the serialized file decodes to.
     */
    public static function getNamedCharacterReferences() {
        if (self::$namedCharacterReferences) {
            return self::$namedCharacterReferences;
        }

        $serializedPath = dirname(__FILE__) . '/named-character-references.ser';
        self::$namedCharacterReferences = unserialize(file_get_contents($serializedPath));

        return self::$namedCharacterReferences;
    }

    /**
     * Encodes a Unicode codepoint as a UTF-8 byte sequence.
     *
     * No range validation is performed: the value is bit-sliced into a
     * one- to four-byte sequence purely according to its magnitude, so
     * the caller is responsible for passing a valid codepoint.
     *
     * @note Derived from HTML Purifier, which in turn took it from
     *       Feyd (public domain).
     *
     * @param int $code Codepoint to encode.
     * @return string The encoded bytes.
     */
    public static function utf8chr($code) {
        // One byte: plain ASCII.
        if ($code < 0x80) {
            return chr($code);
        }

        // Every multi-byte form ends with a continuation byte holding
        // the lowest six bits.
        $last = chr(($code & 0x3F) | 0x80);

        // Two bytes: 110xxxxx 10xxxxxx
        if ($code < 0x800) {
            return chr((($code & 0x7FF) >> 6) | 0xC0) . $last;
        }

        // Three- and four-byte forms share the continuation byte that
        // carries bits 6..11.
        $middle = chr((($code & 0xFC0) >> 6) | 0x80);

        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        if ($code < 0x10000) {
            return chr((($code >> 12) & 0x0F) | 0xE0) . $middle . $last;
        }

        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        $lead = chr((($code >> 18) & 0x07) | 0xF0);
        $second = chr((($code >> 12) & 0x3F) | 0x80);

        return $lead . $second . $middle . $last;
    }

}