]>
git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/Encoding.php
3 * @author "Sebastián Grignoli" <grignoli@framework2.com.ar>
6 * @link http://www.framework2.com.ar/dzone/forceUTF8-es/
7 * @example http://www.framework2.com.ar/dzone/forceUTF8-es/
/**
 * Windows-1252 bytes 0x80-0x9F, keyed by their decimal byte value, mapped to
 * the UTF-8 encoding of the character each byte stands for.
 *
 * The five positions Windows-1252 leaves undefined (129, 141, 143, 144, 157)
 * have no entry, so an isset() lookup on them fails and the caller can fall
 * back to a plain ISO 8859-1 conversion.
 */
protected static $win1252ToUtf8 = array(
    128 => "\xe2\x82\xac", // U+20AC euro sign

    130 => "\xe2\x80\x9a", // U+201A single low-9 quotation mark
    131 => "\xc6\x92",     // U+0192 latin small letter f with hook
    132 => "\xe2\x80\x9e", // U+201E double low-9 quotation mark
    133 => "\xe2\x80\xa6", // U+2026 horizontal ellipsis
    134 => "\xe2\x80\xa0", // U+2020 dagger
    135 => "\xe2\x80\xa1", // U+2021 double dagger
    136 => "\xcb\x86",     // U+02C6 modifier letter circumflex accent
    137 => "\xe2\x80\xb0", // U+2030 per mille sign
    138 => "\xc5\xa0",     // U+0160 latin capital letter s with caron
    139 => "\xe2\x80\xb9", // U+2039 single left-pointing angle quotation mark
    140 => "\xc5\x92",     // U+0152 latin capital ligature oe

    142 => "\xc5\xbd",     // U+017D latin capital letter z with caron

    145 => "\xe2\x80\x98", // U+2018 left single quotation mark
    146 => "\xe2\x80\x99", // U+2019 right single quotation mark
    147 => "\xe2\x80\x9c", // U+201C left double quotation mark
    148 => "\xe2\x80\x9d", // U+201D right double quotation mark
    149 => "\xe2\x80\xa2", // U+2022 bullet
    150 => "\xe2\x80\x93", // U+2013 en dash
    151 => "\xe2\x80\x94", // U+2014 em dash
    152 => "\xcb\x9c",     // U+02DC small tilde
    153 => "\xe2\x84\xa2", // U+2122 trade mark sign
    154 => "\xc5\xa1",     // U+0161 latin small letter s with caron
    155 => "\xe2\x80\xba", // U+203A single right-pointing angle quotation mark
    156 => "\xc5\x93",     // U+0153 latin small ligature oe

    158 => "\xc5\xbe",     // U+017E latin small letter z with caron
    159 => "\xc5\xb8",     // U+0178 latin capital letter y with diaeresis
);
/**
 * Repair table for text that was converted to UTF-8 as if it were
 * ISO 8859-1 although it really was Windows-1252.
 *
 * Keys are the UTF-8 encodings of the C1 code points U+0080-U+009F that such
 * a conversion produces; values are the UTF-8 encodings of the characters
 * Windows-1252 assigns to the corresponding bytes.
 */
protected static $brokenUtf8ToUtf8 = array(
    // 0x80 - 0x8F
    "\xc2\x80" => "\xe2\x82\xac", // U+20AC
    "\xc2\x82" => "\xe2\x80\x9a", // U+201A
    "\xc2\x83" => "\xc6\x92",     // U+0192
    "\xc2\x84" => "\xe2\x80\x9e", // U+201E
    "\xc2\x85" => "\xe2\x80\xa6", // U+2026
    "\xc2\x86" => "\xe2\x80\xa0", // U+2020
    "\xc2\x87" => "\xe2\x80\xa1", // U+2021
    "\xc2\x88" => "\xcb\x86",     // U+02C6
    "\xc2\x89" => "\xe2\x80\xb0", // U+2030
    "\xc2\x8a" => "\xc5\xa0",     // U+0160
    "\xc2\x8b" => "\xe2\x80\xb9", // U+2039
    "\xc2\x8c" => "\xc5\x92",     // U+0152
    "\xc2\x8e" => "\xc5\xbd",     // U+017D

    // 0x90 - 0x9F
    "\xc2\x91" => "\xe2\x80\x98", // U+2018
    "\xc2\x92" => "\xe2\x80\x99", // U+2019
    "\xc2\x93" => "\xe2\x80\x9c", // U+201C
    "\xc2\x94" => "\xe2\x80\x9d", // U+201D
    "\xc2\x95" => "\xe2\x80\xa2", // U+2022
    "\xc2\x96" => "\xe2\x80\x93", // U+2013
    "\xc2\x97" => "\xe2\x80\x94", // U+2014
    "\xc2\x98" => "\xcb\x9c",     // U+02DC
    "\xc2\x99" => "\xe2\x84\xa2", // U+2122
    "\xc2\x9a" => "\xc5\xa1",     // U+0161
    "\xc2\x9b" => "\xe2\x80\xba", // U+203A
    "\xc2\x9c" => "\xc5\x93",     // U+0153
    "\xc2\x9e" => "\xc5\xbe",     // U+017E
    "\xc2\x9f" => "\xc5\xb8"      // U+0178
);
/**
 * UTF-8 encodings of the characters Windows-1252 stores in bytes 0x80-0x9F,
 * mapped to that single Windows-1252 byte.
 *
 * Used with str_replace() ahead of utf8_decode() so that these characters,
 * which have no ISO 8859-1 equivalent, are turned into their Windows-1252
 * byte instead of being handed to utf8_decode() as multi-byte sequences.
 */
protected static $utf8ToWin1252 = array(
    "\xe2\x82\xac" => "\x80", // U+20AC euro sign

    "\xe2\x80\x9a" => "\x82", // U+201A single low-9 quotation mark
    "\xc6\x92"     => "\x83", // U+0192 latin small letter f with hook
    "\xe2\x80\x9e" => "\x84", // U+201E double low-9 quotation mark
    "\xe2\x80\xa6" => "\x85", // U+2026 horizontal ellipsis
    "\xe2\x80\xa0" => "\x86", // U+2020 dagger
    "\xe2\x80\xa1" => "\x87", // U+2021 double dagger
    "\xcb\x86"     => "\x88", // U+02C6 modifier letter circumflex accent
    "\xe2\x80\xb0" => "\x89", // U+2030 per mille sign
    "\xc5\xa0"     => "\x8a", // U+0160 latin capital letter s with caron
    "\xe2\x80\xb9" => "\x8b", // U+2039 single left-pointing angle quotation mark
    "\xc5\x92"     => "\x8c", // U+0152 latin capital ligature oe

    "\xc5\xbd"     => "\x8e", // U+017D latin capital letter z with caron

    "\xe2\x80\x98" => "\x91", // U+2018 left single quotation mark
    "\xe2\x80\x99" => "\x92", // U+2019 right single quotation mark
    "\xe2\x80\x9c" => "\x93", // U+201C left double quotation mark
    "\xe2\x80\x9d" => "\x94", // U+201D right double quotation mark
    "\xe2\x80\xa2" => "\x95", // U+2022 bullet
    "\xe2\x80\x93" => "\x96", // U+2013 en dash
    "\xe2\x80\x94" => "\x97", // U+2014 em dash
    "\xcb\x9c"     => "\x98", // U+02DC small tilde
    "\xe2\x84\xa2" => "\x99", // U+2122 trade mark sign
    "\xc5\xa1"     => "\x9a", // U+0161 latin small letter s with caron
    "\xe2\x80\xba" => "\x9b", // U+203A single right-pointing angle quotation mark
    "\xc5\x93"     => "\x9c", // U+0153 latin small ligature oe

    "\xc5\xbe"     => "\x9e", // U+017E latin small letter z with caron
    "\xc5\xb8"     => "\x9f", // U+0178 latin capital letter y with diaeresis
);
/**
 * Function Encoding::toUTF8
 *
 * Leaves byte sequences that already look like UTF-8 alone and converts
 * every other non-ASCII byte to UTF-8, assuming the original encoding is
 * either Windows-1252 or ISO 8859-1. Arrays are converted element by
 * element, recursively.
 *
 * Detection is heuristic, so Latin-1 text that happens to form a valid
 * UTF-8 sequence is left unchanged. That happens in these scenarios:
 *
 * 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
 *    are followed by any of these ("group B"):
 *    ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶•¸¹º»¼½¾¿
 *    For example: %ABREPRESENT%C9%BB. «REPRESENTÉ»
 *    The "«" (%AB) character will be converted, but the "É" followed by
 *    "»" (%C9%BB) is also a valid unicode character, and will be left
 *    unchanged.
 * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from
 *    group B,
 * 3) when any of these: ðñòó are followed by THREE chars from group B.
 *
 * @param string|array $text Any string, or an array of strings.
 * @return string|array The same data, UTF-8 encoded.
 */
static function toUTF8($text)
{
    if (is_array($text)) {
        foreach ($text as $k => $v) {
            $text[$k] = self::toUTF8($v);
        }
        return $text;
    }

    if (!is_string($text)) {
        // NOTE(review): how non-string, non-array input was handled is not
        // visible in the listing this was reviewed from; passing it through
        // untouched is the conservative choice - confirm against callers.
        return $text;
    }

    $max = strlen($text);
    $buf = "";

    for ($i = 0; $i < $max; $i++) {
        $c1 = $text[$i];
        $ord = ord($c1);

        if ($ord < 0x80) {
            // Plain ASCII: it doesn't need conversion.
            $buf .= $c1;
            continue;
        }

        // The byte read as ISO 8859-1, i.e. code point U+0080-U+00FF
        // encoded as a two-byte UTF-8 sequence.
        $asLatin1 = chr(0xc0 | ($ord >> 6)) . chr(0x80 | ($ord & 0x3f));

        if ($ord < 0xc0) {
            // 0x80-0xBF can never start a UTF-8 sequence, so it needs
            // conversion: prefer the Windows-1252 meaning when there is one.
            $buf .= isset(self::$win1252ToUtf8[$ord])
                ? self::$win1252ToUtf8[$ord]
                : $asLatin1;
            continue;
        }

        // Number of continuation bytes a UTF-8 lead byte of this value
        // announces; 0xF8-0xFF never start a sequence.
        if ($ord <= 0xdf) {
            $trail = 1;
        } elseif ($ord <= 0xef) {
            $trail = 2;
        } elseif ($ord <= 0xf7) {
            $trail = 3;
        } else {
            $trail = 0;
        }

        // It is "almost surely UTF-8 already" only when every announced
        // continuation byte exists and lies in 0x80-0xBF.
        $isUtf8 = $trail > 0 && $i + $trail < $max;
        for ($j = 1; $isUtf8 && $j <= $trail; $j++) {
            $isUtf8 = (ord($text[$i + $j]) & 0xc0) === 0x80;
        }

        if ($isUtf8) {
            // Copy the whole sequence (all four bytes for a 4-byte lead)
            // and skip past its continuation bytes.
            $buf .= substr($text, $i, $trail + 1);
            $i += $trail;
        } else {
            // Not valid UTF-8. Convert the lead byte on its own; the bytes
            // after it are examined by the following iterations.
            $buf .= $asLatin1;
        }
    }

    return $buf;
}
// Converts $text towards Windows-1252: arrays are processed element by
// element through this same method, strings through the pipeline below.
// NOTE(review): the return for the array branch and the handling of other
// types are not visible in this listing - confirm before relying on them.
210 static function toWin1252($text) {
211 if(is_array($text)) {
// Recurse into every element, keeping the original keys.
212 foreach($text as $k => $v) {
213 $text[$k] = self
::toWin1252($v);
216 } elseif(is_string($text)) {
// 1) self::toUTF8() normalises the input to UTF-8,
// 2) str_replace() swaps each UTF-8 sequence that is a key of
//    self::$utf8ToWin1252 for its single-byte value,
// 3) utf8_decode() converts what remains.
217 return utf8_decode(str_replace(array_keys(self
::$utf8ToWin1252), array_values(self
::$utf8ToWin1252), self
::toUTF8($text)));
/**
 * Alias of toWin1252(): hands $text to that method and returns its result
 * unchanged.
 *
 * @param mixed $text Value accepted by toWin1252().
 * @return mixed Whatever toWin1252() returns for $text.
 */
static function toISO8859($text)
{
    $converted = self::toWin1252($text);

    return $converted;
}
/**
 * Alias of toWin1252(): hands $text to that method and returns its result
 * unchanged.
 *
 * @param mixed $text Value accepted by toWin1252().
 * @return mixed Whatever toWin1252() returns for $text.
 */
static function toLatin1($text)
{
    $converted = self::toWin1252($text);

    return $converted;
}
// Repeatedly pushes $text through a "decode, then re-encode" pass; arrays
// are handled element by element through this same method.
// NOTE(review): the assignments to $last, the loop's closing brace and the
// return statement are not visible in this listing - confirm against the
// full file.
231 static function fixUTF8($text){
232 if(is_array($text)) {
// Recurse into every element, keeping the original keys.
233 foreach($text as $k => $v) {
234 $text[$k] = self
::fixUTF8($v);
// Keep going while $text differs from $last (presumably the previous
// pass's result - verify).
// NOTE(review): `<>` is a loose comparison, so two different numeric-looking
// strings can compare equal; `!==` would compare the bytes.
240 while($last <> $text){
// One pass: swap the UTF-8 sequences that are keys of self::$utf8ToWin1252
// for their single-byte values, utf8_decode() the result, then re-encode
// it with self::toUTF8().
242 $text = self
::toUTF8(utf8_decode(str_replace(array_keys(self
::$utf8ToWin1252), array_values(self
::$utf8ToWin1252), $text)));
// The same pass once more (appears to run after the loop - verify).
244 $text = self
::toUTF8(utf8_decode(str_replace(array_keys(self
::$utf8ToWin1252), array_values(self
::$utf8ToWin1252), $text)));
/**
 * Repairs a UTF-8 string that was converted from Windows-1252 as if it were
 * ISO 8859-1 (i.e. the Windows-1252 characters 0x80 to 0x9F were ignored).
 *
 * Every occurrence of a key of self::$brokenUtf8ToUtf8 is replaced by the
 * matching value.
 *
 * See: http://en.wikipedia.org/wiki/Windows-1252
 *
 * @param string|array $text Subject handed to str_replace().
 * @return string|array The subject with the replacements applied.
 */
static function UTF8FixWin1252Chars($text)
{
    $broken = array_keys(self::$brokenUtf8ToUtf8);
    $fixed = array_values(self::$brokenUtf8ToUtf8);

    return str_replace($broken, $fixed, $text);
}
256 static function removeBOM($str=""){
257 if(substr($str, 0,3) == pack("CCC",0xef,0xbb,0xbf)) {
258 $str=substr($str, 3);