]> git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/Encoding.php
Merge pull request #67 from inthepoche/dev
[github/wallabag/wallabag.git] / inc / Encoding.php
1 <?php
2 /**
3 * @author "Sebastián Grignoli" <grignoli@framework2.com.ar>
4 * @package Encoding
5 * @version 1.1
6 * @link http://www.framework2.com.ar/dzone/forceUTF8-es/
7 * @example http://www.framework2.com.ar/dzone/forceUTF8-es/
8 */
9
class Encoding {

    /**
     * Windows-1252 byte value (0x80-0x9F) => UTF-8 encoding of the character
     * that Windows-1252 assigns to that byte. Byte values that are undefined
     * in Windows-1252 (129, 141, 143, 144, 157) are intentionally absent.
     */
    protected static $win1252ToUtf8 = array(
        128 => "\xe2\x82\xac",

        130 => "\xe2\x80\x9a",
        131 => "\xc6\x92",
        132 => "\xe2\x80\x9e",
        133 => "\xe2\x80\xa6",
        134 => "\xe2\x80\xa0",
        135 => "\xe2\x80\xa1",
        136 => "\xcb\x86",
        137 => "\xe2\x80\xb0",
        138 => "\xc5\xa0",
        139 => "\xe2\x80\xb9",
        140 => "\xc5\x92",

        142 => "\xc5\xbd",


        145 => "\xe2\x80\x98",
        146 => "\xe2\x80\x99",
        147 => "\xe2\x80\x9c",
        148 => "\xe2\x80\x9d",
        149 => "\xe2\x80\xa2",
        150 => "\xe2\x80\x93",
        151 => "\xe2\x80\x94",
        152 => "\xcb\x9c",
        153 => "\xe2\x84\xa2",
        154 => "\xc5\xa1",
        155 => "\xe2\x80\xba",
        156 => "\xc5\x93",

        158 => "\xc5\xbe",
        159 => "\xc5\xb8"
    );

    /**
     * "Broken" UTF-8 (a Windows-1252 byte 0x80-0x9F that was encoded as if it
     * were ISO-8859-1, i.e. "\xc2" followed by the raw byte) => the UTF-8
     * encoding of the character Windows-1252 assigns to that byte.
     */
    protected static $brokenUtf8ToUtf8 = array(
        "\xc2\x80" => "\xe2\x82\xac",

        "\xc2\x82" => "\xe2\x80\x9a",
        "\xc2\x83" => "\xc6\x92",
        "\xc2\x84" => "\xe2\x80\x9e",
        "\xc2\x85" => "\xe2\x80\xa6",
        "\xc2\x86" => "\xe2\x80\xa0",
        "\xc2\x87" => "\xe2\x80\xa1",
        "\xc2\x88" => "\xcb\x86",
        "\xc2\x89" => "\xe2\x80\xb0",
        "\xc2\x8a" => "\xc5\xa0",
        "\xc2\x8b" => "\xe2\x80\xb9",
        "\xc2\x8c" => "\xc5\x92",

        "\xc2\x8e" => "\xc5\xbd",


        "\xc2\x91" => "\xe2\x80\x98",
        "\xc2\x92" => "\xe2\x80\x99",
        "\xc2\x93" => "\xe2\x80\x9c",
        "\xc2\x94" => "\xe2\x80\x9d",
        "\xc2\x95" => "\xe2\x80\xa2",
        "\xc2\x96" => "\xe2\x80\x93",
        "\xc2\x97" => "\xe2\x80\x94",
        "\xc2\x98" => "\xcb\x9c",
        "\xc2\x99" => "\xe2\x84\xa2",
        "\xc2\x9a" => "\xc5\xa1",
        "\xc2\x9b" => "\xe2\x80\xba",
        "\xc2\x9c" => "\xc5\x93",

        "\xc2\x9e" => "\xc5\xbe",
        "\xc2\x9f" => "\xc5\xb8"
    );

    /**
     * UTF-8 encoding of a Windows-1252 special character => the single
     * Windows-1252 byte (0x80-0x9F) for it. Inverse of $win1252ToUtf8.
     */
    protected static $utf8ToWin1252 = array(
        "\xe2\x82\xac" => "\x80",

        "\xe2\x80\x9a" => "\x82",
        "\xc6\x92" => "\x83",
        "\xe2\x80\x9e" => "\x84",
        "\xe2\x80\xa6" => "\x85",
        "\xe2\x80\xa0" => "\x86",
        "\xe2\x80\xa1" => "\x87",
        "\xcb\x86" => "\x88",
        "\xe2\x80\xb0" => "\x89",
        "\xc5\xa0" => "\x8a",
        "\xe2\x80\xb9" => "\x8b",
        "\xc5\x92" => "\x8c",

        "\xc5\xbd" => "\x8e",


        "\xe2\x80\x98" => "\x91",
        "\xe2\x80\x99" => "\x92",
        "\xe2\x80\x9c" => "\x93",
        "\xe2\x80\x9d" => "\x94",
        "\xe2\x80\xa2" => "\x95",
        "\xe2\x80\x93" => "\x96",
        "\xe2\x80\x94" => "\x97",
        "\xcb\x9c" => "\x98",
        "\xe2\x84\xa2" => "\x99",
        "\xc5\xa1" => "\x9a",
        "\xe2\x80\xba" => "\x9b",
        "\xc5\x93" => "\x9c",

        "\xc5\xbe" => "\x9e",
        "\xc5\xb8" => "\x9f"
    );

    /**
     * Converts a string (or every element of an array, recursively) to UTF-8.
     *
     * Byte sequences that already look like UTF-8 are left alone; every other
     * byte >= 0x80 is assumed to be Windows-1252 / ISO 8859-1 and is converted.
     *
     * Because detection only looks at the lead byte and the continuation
     * bytes that follow it, the conversion is ambiguous in these cases:
     *
     * 1) a byte in 0xC0-0xDF followed by ONE byte in 0x80-0xBF.
     *    For example, Latin-1 "«REPRESENTÉ»" is %AB REPRESENT %C9 %BB. The "«"
     *    (%AB) is converted, but "É»" (%C9%BB) is also a well-formed two-byte
     *    UTF-8 sequence, so it is left unchanged;
     * 2) a byte in 0xE0-0xEF followed by TWO bytes in 0x80-0xBF;
     * 3) a byte in 0xF0-0xF7 followed by THREE bytes in 0x80-0xBF.
     *
     * @name toUTF8
     * @param mixed $text A string, an array of values to convert, or any other
     *                    value (returned unchanged).
     * @return mixed The same value with strings encoded as UTF-8.
     */
    public static function toUTF8($text){
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toUTF8($v);
            }
            return $text;
        }

        if (!is_string($text)) {
            return $text;
        }

        $max = strlen($text);
        $buf = "";

        for ($i = 0; $i < $max; $i++) {
            $c1 = $text[$i];
            $code = ord($c1);

            if ($code < 0x80) {
                // Plain ASCII: identical in UTF-8, nothing to convert.
                $buf .= $c1;
                continue;
            }

            if ($code < 0xc0) {
                // A continuation-range byte with no lead byte before it cannot
                // be UTF-8, so treat it as Windows-1252 / ISO 8859-1.
                if (isset(self::$win1252ToUtf8[$code])) {
                    $buf .= self::$win1252ToUtf8[$code];
                } else {
                    $buf .= self::latin1ByteToUtf8($c1);
                }
                continue;
            }

            // Lead-byte candidate: work out how many continuation bytes a
            // UTF-8 sequence starting with this byte would need.
            if ($code <= 0xdf) {
                $trailing = 1;
            } elseif ($code <= 0xef) {
                $trailing = 2;
            } elseif ($code <= 0xf7) {
                $trailing = 3;
            } else {
                $trailing = 0; // 0xF8-0xFF never starts a UTF-8 sequence.
            }

            if ($trailing > 0 && self::hasContinuationBytes($text, $i + 1, $trailing, $max)) {
                // Almost certainly UTF-8 already: copy the WHOLE sequence and
                // skip past all of its continuation bytes.
                $buf .= substr($text, $i, $trailing + 1);
                $i += $trailing;
            } else {
                // Not valid UTF-8: convert the single byte from ISO 8859-1.
                $buf .= self::latin1ByteToUtf8($c1);
            }
        }

        return $buf;
    }

    /**
     * Converts a string (or array of strings, recursively) to Windows-1252.
     *
     * The input is first normalised with toUTF8(), so it may be UTF-8,
     * Windows-1252, ISO 8859-1 or a mix of them.
     *
     * @param mixed $text A string, an array, or any other value (returned unchanged).
     * @return mixed The same value with strings encoded as Windows-1252.
     */
    public static function toWin1252($text) {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toWin1252($v);
            }
            return $text;
        }

        if (is_string($text)) {
            return self::decodeToWin1252(self::toUTF8($text));
        }

        return $text;
    }

    /**
     * Alias of toWin1252().
     *
     * @param mixed $text
     * @return mixed
     */
    public static function toISO8859($text) {
        return self::toWin1252($text);
    }

    /**
     * Alias of toWin1252().
     *
     * @param mixed $text
     * @return mixed
     */
    public static function toLatin1($text) {
        return self::toWin1252($text);
    }

    /**
     * Repairs text that was UTF-8 encoded more than once.
     *
     * Repeatedly decodes the text to Windows-1252 and re-normalises it with
     * toUTF8() until the result stops changing, then applies the same
     * round-trip one final time.
     *
     * @param mixed $text A string or an array of strings (processed recursively).
     * @return mixed The repaired string, or the array with every element repaired.
     */
    public static function fixUTF8($text){
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::fixUTF8($v);
            }
            return $text;
        }

        $last = "";
        while ($last !== $text) {
            $last = $text;
            $text = self::toUTF8(self::decodeToWin1252($text));
        }

        return self::toUTF8(self::decodeToWin1252($text));
    }

    /**
     * Fixes UTF-8 text that was converted from Windows-1252 as if it were
     * ISO 8859-1 (i.e. the Windows-1252 characters 0x80-0x9F were ignored).
     *
     * See: http://en.wikipedia.org/wiki/Windows-1252
     *
     * @param string|array $text Subject passed straight to str_replace().
     * @return string|array The subject with every "\xc2\x80".."\xc2\x9f" pair
     *                      listed in $brokenUtf8ToUtf8 replaced.
     */
    public static function UTF8FixWin1252Chars($text){
        return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
    }

    /**
     * Strips a leading UTF-8 byte order mark (EF BB BF), if present.
     *
     * @param string $str
     * @return string $str without the leading BOM.
     */
    public static function removeBOM($str=""){
        if (substr($str, 0, 3) === "\xef\xbb\xbf") {
            $str = substr($str, 3);
        }
        return $str;
    }

    /**
     * Tells whether $count UTF-8 continuation bytes (0x80-0xBF) start at
     * offset $start of $text. Returns false when the string ends too early.
     *
     * @param string $text
     * @param int $start Offset of the first expected continuation byte.
     * @param int $count Number of continuation bytes required.
     * @param int $length strlen($text), passed in to avoid recomputing it.
     * @return bool
     */
    private static function hasContinuationBytes($text, $start, $count, $length) {
        if ($start + $count > $length) {
            return false;
        }

        for ($j = $start; $j < $start + $count; $j++) {
            $byte = ord($text[$j]);
            if ($byte < 0x80 || $byte > 0xbf) {
                return false;
            }
        }

        return true;
    }

    /**
     * Encodes one ISO 8859-1 byte in the range 0x80-0xFF as its two-byte
     * UTF-8 sequence (110000xx 10xxxxxx).
     *
     * @param string $byte A single byte >= "\x80".
     * @return string Two bytes of UTF-8.
     */
    private static function latin1ByteToUtf8($byte) {
        $code = ord($byte);
        return chr(0xc0 | ($code >> 6)) . chr(0x80 | ($code & 0x3f));
    }

    /**
     * Replaces the UTF-8 forms of the Windows-1252 special characters with
     * their single-byte equivalents, then decodes the rest with utf8_decode().
     *
     * NOTE(review): utf8_decode() is deprecated as of PHP 8.2; replacing it
     * needs mbstring or iconv, which this file does not otherwise require.
     *
     * @param string $text UTF-8 text.
     * @return string
     */
    private static function decodeToWin1252($text) {
        $text = str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text);
        return utf8_decode($text);
    }
}