]> git.immae.eu Git - github/wallabag/wallabag.git/blame - inc/3rdparty/Encoding.php
Add SHAARLI support to view template
[github/wallabag/wallabag.git] / inc / 3rdparty / Encoding.php
CommitLineData
<?php
/**
 * @author "Sebastián Grignoli" <grignoli@framework2.com.ar>
 * @package Encoding
 * @version 1.1
 * @link http://www.framework2.com.ar/dzone/forceUTF8-es/
 * @example http://www.framework2.com.ar/dzone/forceUTF8-es/
 */

class Encoding {

  /**
   * Windows-1252 byte values (as integers) in the 0x80-0x9F range mapped to
   * the UTF-8 byte sequence of the character they represent.
   *
   * Values 129, 141, 143, 144 and 157 have no entry; toUTF8() converts those
   * bytes with the generic single-byte rule instead.
   */
  protected static $win1252ToUtf8 = array(
    128 => "\xe2\x82\xac",

    130 => "\xe2\x80\x9a",
    131 => "\xc6\x92",
    132 => "\xe2\x80\x9e",
    133 => "\xe2\x80\xa6",
    134 => "\xe2\x80\xa0",
    135 => "\xe2\x80\xa1",
    136 => "\xcb\x86",
    137 => "\xe2\x80\xb0",
    138 => "\xc5\xa0",
    139 => "\xe2\x80\xb9",
    140 => "\xc5\x92",

    142 => "\xc5\xbd",


    145 => "\xe2\x80\x98",
    146 => "\xe2\x80\x99",
    147 => "\xe2\x80\x9c",
    148 => "\xe2\x80\x9d",
    149 => "\xe2\x80\xa2",
    150 => "\xe2\x80\x93",
    151 => "\xe2\x80\x94",
    152 => "\xcb\x9c",
    153 => "\xe2\x84\xa2",
    154 => "\xc5\xa1",
    155 => "\xe2\x80\xba",
    156 => "\xc5\x93",

    158 => "\xc5\xbe",
    159 => "\xc5\xb8"
  );

  /**
   * Two-byte sequences "\xc2\x80".."\xc2\x9f" (what the generic single-byte
   * rule produces for bytes 0x80-0x9F) mapped to the UTF-8 sequence listed
   * for the same byte in $win1252ToUtf8. Used by UTF8FixWin1252Chars().
   */
  protected static $brokenUtf8ToUtf8 = array(
    "\xc2\x80" => "\xe2\x82\xac",

    "\xc2\x82" => "\xe2\x80\x9a",
    "\xc2\x83" => "\xc6\x92",
    "\xc2\x84" => "\xe2\x80\x9e",
    "\xc2\x85" => "\xe2\x80\xa6",
    "\xc2\x86" => "\xe2\x80\xa0",
    "\xc2\x87" => "\xe2\x80\xa1",
    "\xc2\x88" => "\xcb\x86",
    "\xc2\x89" => "\xe2\x80\xb0",
    "\xc2\x8a" => "\xc5\xa0",
    "\xc2\x8b" => "\xe2\x80\xb9",
    "\xc2\x8c" => "\xc5\x92",

    "\xc2\x8e" => "\xc5\xbd",


    "\xc2\x91" => "\xe2\x80\x98",
    "\xc2\x92" => "\xe2\x80\x99",
    "\xc2\x93" => "\xe2\x80\x9c",
    "\xc2\x94" => "\xe2\x80\x9d",
    "\xc2\x95" => "\xe2\x80\xa2",
    "\xc2\x96" => "\xe2\x80\x93",
    "\xc2\x97" => "\xe2\x80\x94",
    "\xc2\x98" => "\xcb\x9c",
    "\xc2\x99" => "\xe2\x84\xa2",
    "\xc2\x9a" => "\xc5\xa1",
    "\xc2\x9b" => "\xe2\x80\xba",
    "\xc2\x9c" => "\xc5\x93",

    "\xc2\x9e" => "\xc5\xbe",
    "\xc2\x9f" => "\xc5\xb8"
  );

  /**
   * Inverse of $win1252ToUtf8: UTF-8 byte sequences mapped to the single
   * Windows-1252 byte (0x80-0x9F) that represents the same character.
   */
  protected static $utf8ToWin1252 = array(
    "\xe2\x82\xac" => "\x80",

    "\xe2\x80\x9a" => "\x82",
    "\xc6\x92"     => "\x83",
    "\xe2\x80\x9e" => "\x84",
    "\xe2\x80\xa6" => "\x85",
    "\xe2\x80\xa0" => "\x86",
    "\xe2\x80\xa1" => "\x87",
    "\xcb\x86"     => "\x88",
    "\xe2\x80\xb0" => "\x89",
    "\xc5\xa0"     => "\x8a",
    "\xe2\x80\xb9" => "\x8b",
    "\xc5\x92"     => "\x8c",

    "\xc5\xbd"     => "\x8e",


    "\xe2\x80\x98" => "\x91",
    "\xe2\x80\x99" => "\x92",
    "\xe2\x80\x9c" => "\x93",
    "\xe2\x80\x9d" => "\x94",
    "\xe2\x80\xa2" => "\x95",
    "\xe2\x80\x93" => "\x96",
    "\xe2\x80\x94" => "\x97",
    "\xcb\x9c"     => "\x98",
    "\xe2\x84\xa2" => "\x99",
    "\xc5\xa1"     => "\x9a",
    "\xe2\x80\xba" => "\x9b",
    "\xc5\x93"     => "\x9c",

    "\xc5\xbe"     => "\x9e",
    "\xc5\xb8"     => "\x9f"
  );

  /**
   * Function Encoding::toUTF8
   *
   * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
   *
   * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1.
   *
   * It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
   *
   * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
   *    are followed by any of these:  ("group B")
   *                                    ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿
   *    For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
   *    The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
   *    is also a valid unicode character, and will be left unchanged.
   *
   * 2) when any of these: àáâãäåæçèéêëìíîï  are followed by TWO chars from group B,
   * 3) when any of these: ðñòó  are followed by THREE chars from group B.
   *
   * Arrays are converted element by element (recursively, keys untouched);
   * values that are neither arrays nor strings are returned unchanged.
   *
   * @name toUTF8
   * @param mixed $text Any string, or an array of values to convert.
   * @return mixed The same string (or array), UTF8 encoded
   */
  public static function toUTF8($text) {
    if (is_array($text)) {
      foreach ($text as $k => $v) {
        $text[$k] = self::toUTF8($v);
      }
      return $text;
    }

    if (!is_string($text)) {
      return $text;
    }

    $max = strlen($text);
    $buf = "";
    for ($i = 0; $i < $max; $i++) {
      $c1 = $text[$i];

      if ($c1 >= "\xc0") { // Should be converted to UTF8, if it's not UTF8 already
        // Look ahead up to three bytes; "\x00" stands in for "past the end"
        // and can never pass the continuation-byte test below.
        $c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
        $c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2];
        $c4 = $i + 3 >= $max ? "\x00" : $text[$i + 3];

        if ($c1 >= "\xc0" && $c1 <= "\xdf") { // looks like 2 bytes UTF8
          if (self::isContinuationByte($c2)) { // yeah, almost sure it's UTF8 already
            $buf .= $c1 . $c2;
            $i++;
          } else { // not valid UTF8. Convert it.
            $buf .= self::singleByteToUtf8($c1);
          }
        } elseif ($c1 >= "\xe0" && $c1 <= "\xef") { // looks like 3 bytes UTF8
          if (self::isContinuationByte($c2) && self::isContinuationByte($c3)) {
            $buf .= $c1 . $c2 . $c3;
            $i += 2;
          } else { // not valid UTF8. Convert it.
            $buf .= self::singleByteToUtf8($c1);
          }
        } elseif ($c1 >= "\xf0" && $c1 <= "\xf7") { // looks like 4 bytes UTF8
          if (self::isContinuationByte($c2) && self::isContinuationByte($c3) && self::isContinuationByte($c4)) {
            // Copy all FOUR bytes and skip the three continuation bytes;
            // otherwise the last byte would be re-read on the next iteration
            // and converted as if it were a stray Windows-1252 byte.
            $buf .= $c1 . $c2 . $c3 . $c4;
            $i += 3;
          } else { // not valid UTF8. Convert it.
            $buf .= self::singleByteToUtf8($c1);
          }
        } else { // doesn't look like UTF8, but should be converted
          $buf .= self::singleByteToUtf8($c1);
        }
      } elseif (($c1 & "\xc0") === "\x80") { // 0x80-0xBF on its own: needs conversion
        $code = ord($c1);
        if (isset(self::$win1252ToUtf8[$code])) { // found in Windows-1252 special cases
          $buf .= self::$win1252ToUtf8[$code];
        } else {
          $buf .= self::singleByteToUtf8($c1);
        }
      } else { // plain 7-bit byte: it doesn't need conversion
        $buf .= $c1;
      }
    }
    return $buf;
  }

  /**
   * Tells whether a one-byte string is in the UTF-8 continuation range 0x80-0xBF.
   *
   * @param string $byte A one-byte string.
   * @return bool
   */
  protected static function isContinuationByte($byte) {
    return $byte >= "\x80" && $byte <= "\xbf";
  }

  /**
   * Encodes one high byte (0x80-0xFF) as a two-byte UTF-8 sequence, treating
   * the byte value as the code point.
   *
   * @param string $byte A one-byte string.
   * @return string The two-byte UTF-8 sequence.
   */
  protected static function singleByteToUtf8($byte) {
    // Integer shift instead of "/ 64": the division yields a float, and
    // handing a fractional float to chr() is deprecated as of PHP 8.1.
    $lead  = chr(ord($byte) >> 6) | "\xc0";
    $trail = ($byte & "\x3f") | "\x80";
    return $lead . $trail;
  }

  /**
   * Converts a string (or every element of an array) to a single-byte
   * encoding: the text is first passed through toUTF8(), the sequences
   * listed in $utf8ToWin1252 are collapsed to their one-byte form, and the
   * result is run through utf8_decode().
   *
   * Values that are neither arrays nor strings are returned unchanged.
   *
   * NOTE(review): utf8_decode() is deprecated as of PHP 8.2 - confirm the
   * replacement strategy before upgrading.
   *
   * @param mixed $text
   * @return mixed
   */
  public static function toWin1252($text) {
    if (is_array($text)) {
      foreach ($text as $k => $v) {
        $text[$k] = self::toWin1252($v);
      }
      return $text;
    }

    if (is_string($text)) {
      return utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)));
    }

    return $text;
  }

  /**
   * Alias of toWin1252().
   *
   * @param mixed $text
   * @return mixed
   */
  public static function toISO8859($text) {
    return self::toWin1252($text);
  }

  /**
   * Alias of toWin1252().
   *
   * @param mixed $text
   * @return mixed
   */
  public static function toLatin1($text) {
    return self::toWin1252($text);
  }

  /**
   * Applies "collapse $utf8ToWin1252 sequences -> utf8_decode() -> toUTF8()"
   * at least once and then repeatedly until the result stops changing.
   * Judging by the name, this is meant to repair text that was UTF-8 encoded
   * more than once.
   *
   * Arrays are processed element by element (recursively).
   *
   * NOTE(review): utf8_decode() is deprecated as of PHP 8.2 - confirm the
   * replacement strategy before upgrading.
   *
   * @param mixed $text
   * @return mixed
   */
  public static function fixUTF8($text) {
    if (is_array($text)) {
      foreach ($text as $k => $v) {
        $text[$k] = self::fixUTF8($v);
      }
      return $text;
    }

    // do/while: the transformation always runs once, then again for as long
    // as the previous pass changed something.
    do {
      $last = $text;
      $text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text)));
    } while ($last !== $text);

    return $text;
  }

  /**
   * If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1
   * (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
   * See: http://en.wikipedia.org/wiki/Windows-1252
   *
   * @param string|array $text
   * @return string|array $text with every key of $brokenUtf8ToUtf8 replaced by its value.
   */
  public static function UTF8FixWin1252Chars($text) {
    return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
  }

  /**
   * Strips a leading UTF-8 byte order mark (EF BB BF) if present.
   *
   * @param string $str
   * @return string $str without the leading BOM; unchanged when there is none.
   */
  public static function removeBOM($str = "") {
    if (substr($str, 0, 3) === "\xef\xbb\xbf") {
      $str = substr($str, 3);
    }
    return $str;
  }
}