diff options
author | tcitworld <thomas.citharet@gmail.com> | 2014-01-04 12:30:31 -0800 |
---|---|---|
committer | tcitworld <thomas.citharet@gmail.com> | 2014-01-04 12:30:31 -0800 |
commit | 7f667839764621b5aa01c9db8ce5dde2a29ef18f (patch) | |
tree | 93d8241ee81c87e18494325ae02f0589a8e328a2 /inc/3rdparty/libraries/html5/Data.php | |
parent | a84f77d6ba15a64ff00453f5d5190c021ce460ed (diff) | |
parent | 2abcccb37180c17318f5226f5d4bc28f30b621ea (diff) | |
download | wallabag-7f667839764621b5aa01c9db8ce5dde2a29ef18f.tar.gz wallabag-7f667839764621b5aa01c9db8ce5dde2a29ef18f.tar.zst wallabag-7f667839764621b5aa01c9db8ce5dde2a29ef18f.zip |
Merge pull request #1 from inthepoche/dev
Dev
Diffstat (limited to 'inc/3rdparty/libraries/html5/Data.php')
-rw-r--r-- | inc/3rdparty/libraries/html5/Data.php | 114 |
1 files changed, 114 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/html5/Data.php b/inc/3rdparty/libraries/html5/Data.php new file mode 100644 index 00000000..497345f4 --- /dev/null +++ b/inc/3rdparty/libraries/html5/Data.php | |||
@@ -0,0 +1,114 @@ | |||
1 | <?php | ||
2 | |||
3 | // warning: this file is encoded in UTF-8! | ||
4 | |||
/**
 * Static lookup data and helpers for character references:
 * a remapping table for numeric references, a lazily loaded table of
 * named references, and a codepoint-to-UTF-8 encoder.
 */
class HTML5_Data
{

    /**
     * Numeric character reference remapping table.
     *
     * Keys are the numeric value found in a character reference; values
     * are the Unicode codepoint that should be used in its place.
     *
     * Despite the property name, this is really the numeric entity
     * dereference table. Possible future improvements: load it from a
     * .ser file, or store UTF-8 byte strings rather than codepoints.
     */
    protected static $realCodepointTable = array(
        0x00 => 0xFFFD, // REPLACEMENT CHARACTER
        0x0D => 0x000A, // LINE FEED (LF)
        0x80 => 0x20AC, // EURO SIGN ('€')
        0x81 => 0x0081, // <control>
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
        0x86 => 0x2020, // DAGGER ('†')
        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
        0x89 => 0x2030, // PER MILLE SIGN ('‰')
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
        0x8D => 0x008D, // <control>
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
        0x8F => 0x008F, // <control>
        0x90 => 0x0090, // <control>
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
        0x95 => 0x2022, // BULLET ('•')
        0x96 => 0x2013, // EN DASH ('–')
        0x97 => 0x2014, // EM DASH ('—')
        0x98 => 0x02DC, // SMALL TILDE ('˜')
        0x99 => 0x2122, // TRADE MARK SIGN ('™')
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
        0x9D => 0x009D, // <control>
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
    );

    /**
     * Cache for the unserialized named character reference data;
     * filled on first call to getNamedCharacterReferences().
     */
    protected static $namedCharacterReferences;

    /**
     * Not read or written anywhere in this class.
     */
    protected static $namedCharacterReferenceMaxLength;

    /**
     * Looks up the replacement codepoint for a numeric character
     * reference value.
     *
     * @param int $ref Numeric value taken from the character reference.
     * @return int|false The mapped codepoint, or false when the table
     *                   has no entry for $ref.
     */
    public static function getRealCodepoint($ref)
    {
        return isset(self::$realCodepointTable[$ref])
            ? self::$realCodepointTable[$ref]
            : false;
    }

    /**
     * Returns the named character reference data, reading and
     * unserializing 'named-character-references.ser' (located next to
     * this file) the first time it is needed.
     *
     * @return mixed Whatever unserialize() produced from the data file.
     */
    public static function getNamedCharacterReferences()
    {
        // Already loaded (and truthy): hand back the cached value.
        if (self::$namedCharacterReferences) {
            return self::$namedCharacterReferences;
        }

        $serialized = file_get_contents(
            dirname(__FILE__) . '/named-character-references.ser'
        );
        self::$namedCharacterReferences = unserialize($serialized);

        return self::$namedCharacterReferences;
    }

    /**
     * Encodes a Unicode codepoint as a UTF-8 byte sequence.
     *
     * No validation is performed: values above 0x10FFFF, negative
     * values and surrogates (0xD800-0xDFFF) are encoded as-is rather
     * than being replaced with U+FFFD.
     *
     * @note Adapted from HTML Purifier, which in turn took it from
     *       Feyd (public domain).
     *
     * @param int $code Unicode codepoint.
     * @return string One to four bytes.
     */
    public static function utf8chr($code)
    {
        // Single byte: plain ASCII.
        if ($code < 0x80) {
            return chr($code);
        }

        // Every multi-byte sequence ends with a continuation byte
        // carrying the lowest six bits.
        $last = chr(($code & 0x3F) | 0x80);

        // Two bytes: 110xxxxx 10xxxxxx
        if ($code < 0x800) {
            $lead = chr((($code & 0x7FF) >> 6) | 0xC0);
            return $lead . $last;
        }

        // Three and four byte forms share the second-to-last
        // continuation byte (bits 6-11).
        $penultimate = chr((($code & 0xFC0) >> 6) | 0x80);

        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        if ($code < 0x10000) {
            $lead = chr((($code >> 12) & 0x0F) | 0xE0);
            return $lead . $penultimate . $last;
        }

        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        $second = chr((($code >> 12) & 0x3F) | 0x80);
        $lead = chr((($code >> 18) & 0x07) | 0xF0);

        return $lead . $second . $penultimate . $last;
    }

}