]> git.immae.eu Git - github/wallabag/wallabag.git/blame - inc/3rdparty/libraries/html5/Data.php
htmlawed via composer
[github/wallabag/wallabag.git] / inc / 3rdparty / libraries / html5 / Data.php
CommitLineData
42c80841
NL
<?php

// warning: this file is encoded in UTF-8!

/**
 * Static lookup data and helpers for character references:
 * a remapping table for certain numeric character references, a lazily
 * loaded table of named character references, and a codepoint-to-UTF-8
 * encoder.
 */
class HTML5_Data
{

    // at some point this should be moved to a .ser file. Another
    // possible optimization is to give UTF-8 bytes, not Unicode
    // codepoints
    // XXX: Not quite sure why it's named this; this is
    // actually the numeric entity dereference table.
    /**
     * Maps the codepoint written in a numeric character reference to the
     * codepoint that should actually be used in its place.
     * Keys and values are both integer codepoints.
     */
    protected static $realCodepointTable = array(
        0x00 => 0xFFFD, // REPLACEMENT CHARACTER
        0x0D => 0x000A, // LINE FEED (LF)
        0x80 => 0x20AC, // EURO SIGN ('€')
        0x81 => 0x0081, // <control>
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
        0x86 => 0x2020, // DAGGER ('†')
        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
        0x89 => 0x2030, // PER MILLE SIGN ('‰')
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
        0x8D => 0x008D, // <control>
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
        0x8F => 0x008F, // <control>
        0x90 => 0x0090, // <control>
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
        0x95 => 0x2022, // BULLET ('•')
        0x96 => 0x2013, // EN DASH ('–')
        0x97 => 0x2014, // EM DASH ('—')
        0x98 => 0x02DC, // SMALL TILDE ('˜')
        0x99 => 0x2122, // TRADE MARK SIGN ('™')
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
        0x9D => 0x009D, // <control>
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
    );

    /**
     * Cache for the unserialized named character reference table;
     * unset until getNamedCharacterReferences() is first called.
     */
    protected static $namedCharacterReferences;

    /**
     * Not read or written anywhere in this class.
     * NOTE(review): presumably reserved for the longest reference name;
     * confirm against subclasses/callers before relying on it.
     */
    protected static $namedCharacterReferenceMaxLength;

    /**
     * Returns the "real" Unicode codepoint of a malformed character
     * reference.
     *
     * @param int $ref Codepoint as written in the numeric reference.
     * @return int|false The replacement codepoint from the table, or
     *                   false when the table has no entry for $ref.
     */
    public static function getRealCodepoint($ref) {
        if (!isset(self::$realCodepointTable[$ref])) return false;
        else return self::$realCodepointTable[$ref];
    }

    /**
     * Returns the named character reference table, loading it on first
     * use from 'named-character-references.ser' in this file's directory.
     *
     * The loaded value is cached in a static property. Because the cache
     * check is a truthiness test, a failed or empty load is not cached
     * and the file is read again on the next call.
     *
     * @return mixed Whatever unserialize() yields for the data file
     *               (false if the file cannot be read or decoded).
     */
    public static function getNamedCharacterReferences() {
        if (!self::$namedCharacterReferences) {
            self::$namedCharacterReferences = unserialize(
                file_get_contents(dirname(__FILE__) . '/named-character-references.ser'));
        }
        return self::$namedCharacterReferences;
    }

    /**
     * Converts a Unicode codepoint to sequence of UTF-8 bytes.
     * @note Shamelessly stolen from HTML Purifier, which is also
     *       shamelessly stolen from Feyd (which is in public domain).
     *
     * @param int $code Unicode codepoint.
     * @return string One to four bytes encoding $code.
     */
    public static function utf8chr($code) {
        /* We don't care: we live dangerously
         * if($code > 0x10FFFF or $code < 0x0 or
           ($code >= 0xD800 and $code <= 0xDFFF) ) {
            // bits are set outside the "valid" range as defined
            // by UNICODE 4.1.0
            return "\xEF\xBF\xBD";
        }*/
        // NOTE: because the check above is disabled, surrogates and
        // out-of-range values are encoded as-is rather than replaced.

        // $x is the last byte, $y/$z/$w the preceding bytes (if any);
        // a value of 0 means "byte not present".
        $x = $y = $z = $w = 0;
        if ($code < 0x80) {
            // regular ASCII character
            $x = $code;
        } else {
            // set up bits for UTF-8
            // lowest 6 bits always go into a 10xxxxxx continuation byte
            $x = ($code & 0x3F) | 0x80;
            if ($code < 0x800) {
                // two-byte form: 110xxxxx lead byte
                $y = (($code & 0x7FF) >> 6) | 0xC0;
            } else {
                // next 6 bits become a second continuation byte
                $y = (($code & 0xFC0) >> 6) | 0x80;
                if($code < 0x10000) {
                    // three-byte form: 1110xxxx lead byte
                    $z = (($code >> 12) & 0x0F) | 0xE0;
                } else {
                    // four-byte form: third continuation byte plus
                    // 11110xxx lead byte
                    $z = (($code >> 12) & 0x3F) | 0x80;
                    $w = (($code >> 18) & 0x07) | 0xF0;
                }
            }
        }
        // set up the actual character
        $ret = '';
        if($w) $ret .= chr($w);
        if($z) $ret .= chr($z);
        if($y) $ret .= chr($y);
        $ret .= chr($x);

        return $ret;
    }

}