]>
Commit | Line | Data |
---|---|---|
1a268ba7 NL |
1 | <?php \r |
2 | /**\r | |
3 | * @author "Sebastián Grignoli" <grignoli@framework2.com.ar>\r | |
4 | * @package Encoding\r | |
5 | * @version 1.1\r | |
6 | * @link http://www.framework2.com.ar/dzone/forceUTF8-es/\r | |
7 | * @example http://www.framework2.com.ar/dzone/forceUTF8-es/\r | |
8 | */\r | |
9 | \r | |
/**
 * Helpers for normalising text between UTF-8 and Windows-1252 / ISO 8859-1.
 *
 * All conversions work on raw bytes and use only core string functions, so
 * no optional extension (mbstring, iconv, xml) is required.
 */
class Encoding
{
    /**
     * Windows-1252 bytes 0x80-0x9F, keyed by byte value, mapped to the UTF-8
     * encoding of the character each one represents. Byte values with no
     * entry here (129, 141, 143, 144, 157) are handled by toUTF8() with the
     * generic two-byte conversion instead.
     */
    protected static $win1252ToUtf8 = array(
        128 => "\xe2\x82\xac",

        130 => "\xe2\x80\x9a",
        131 => "\xc6\x92",
        132 => "\xe2\x80\x9e",
        133 => "\xe2\x80\xa6",
        134 => "\xe2\x80\xa0",
        135 => "\xe2\x80\xa1",
        136 => "\xcb\x86",
        137 => "\xe2\x80\xb0",
        138 => "\xc5\xa0",
        139 => "\xe2\x80\xb9",
        140 => "\xc5\x92",

        142 => "\xc5\xbd",


        145 => "\xe2\x80\x98",
        146 => "\xe2\x80\x99",
        147 => "\xe2\x80\x9c",
        148 => "\xe2\x80\x9d",
        149 => "\xe2\x80\xa2",
        150 => "\xe2\x80\x93",
        151 => "\xe2\x80\x94",
        152 => "\xcb\x9c",
        153 => "\xe2\x84\xa2",
        154 => "\xc5\xa1",
        155 => "\xe2\x80\xba",
        156 => "\xc5\x93",

        158 => "\xc5\xbe",
        159 => "\xc5\xb8"
    );

    /**
     * Two-byte UTF-8 sequences for code points U+0080-U+009F (what results
     * when Windows-1252 text is converted as if it were ISO 8859-1), mapped
     * to the UTF-8 encoding of the intended Windows-1252 character.
     */
    protected static $brokenUtf8ToUtf8 = array(
        "\xc2\x80" => "\xe2\x82\xac",

        "\xc2\x82" => "\xe2\x80\x9a",
        "\xc2\x83" => "\xc6\x92",
        "\xc2\x84" => "\xe2\x80\x9e",
        "\xc2\x85" => "\xe2\x80\xa6",
        "\xc2\x86" => "\xe2\x80\xa0",
        "\xc2\x87" => "\xe2\x80\xa1",
        "\xc2\x88" => "\xcb\x86",
        "\xc2\x89" => "\xe2\x80\xb0",
        "\xc2\x8a" => "\xc5\xa0",
        "\xc2\x8b" => "\xe2\x80\xb9",
        "\xc2\x8c" => "\xc5\x92",

        "\xc2\x8e" => "\xc5\xbd",


        "\xc2\x91" => "\xe2\x80\x98",
        "\xc2\x92" => "\xe2\x80\x99",
        "\xc2\x93" => "\xe2\x80\x9c",
        "\xc2\x94" => "\xe2\x80\x9d",
        "\xc2\x95" => "\xe2\x80\xa2",
        "\xc2\x96" => "\xe2\x80\x93",
        "\xc2\x97" => "\xe2\x80\x94",
        "\xc2\x98" => "\xcb\x9c",
        "\xc2\x99" => "\xe2\x84\xa2",
        "\xc2\x9a" => "\xc5\xa1",
        "\xc2\x9b" => "\xe2\x80\xba",
        "\xc2\x9c" => "\xc5\x93",

        "\xc2\x9e" => "\xc5\xbe",
        "\xc2\x9f" => "\xc5\xb8"
    );

    /**
     * UTF-8 encodings of the Windows-1252 specific characters, mapped to the
     * single Windows-1252 byte (0x80-0x9F) that represents each of them.
     */
    protected static $utf8ToWin1252 = array(
        "\xe2\x82\xac" => "\x80",

        "\xe2\x80\x9a" => "\x82",
        "\xc6\x92"     => "\x83",
        "\xe2\x80\x9e" => "\x84",
        "\xe2\x80\xa6" => "\x85",
        "\xe2\x80\xa0" => "\x86",
        "\xe2\x80\xa1" => "\x87",
        "\xcb\x86"     => "\x88",
        "\xe2\x80\xb0" => "\x89",
        "\xc5\xa0"     => "\x8a",
        "\xe2\x80\xb9" => "\x8b",
        "\xc5\x92"     => "\x8c",

        "\xc5\xbd"     => "\x8e",


        "\xe2\x80\x98" => "\x91",
        "\xe2\x80\x99" => "\x92",
        "\xe2\x80\x9c" => "\x93",
        "\xe2\x80\x9d" => "\x94",
        "\xe2\x80\xa2" => "\x95",
        "\xe2\x80\x93" => "\x96",
        "\xe2\x80\x94" => "\x97",
        "\xcb\x9c"     => "\x98",
        "\xe2\x84\xa2" => "\x99",
        "\xc5\xa1"     => "\x9a",
        "\xe2\x80\xba" => "\x9b",
        "\xc5\x93"     => "\x9c",

        "\xc5\xbe"     => "\x9e",
        "\xc5\xb8"     => "\x9f"
    );

    /**
     * Convert a string (or every element of an array, recursively) to UTF-8.
     *
     * Byte sequences that already have the shape of a UTF-8 character (a
     * lead byte followed by the right number of 0x80-0xBF bytes) are copied
     * unchanged. Every other byte >= 0x80 is treated as Windows-1252 /
     * ISO 8859-1 and re-encoded.
     *
     * Because detection is purely structural, single-byte text that happens
     * to look like UTF-8 is left alone. For example the ISO 8859-1 bytes
     * 0xC9 0xBB ("É" followed by "»") also form a well-shaped two-byte UTF-8
     * sequence and are therefore not converted. The same ambiguity exists
     * for a 0xE0-0xEF byte followed by two bytes in 0x80-0xBF, and for a
     * 0xF0-0xF7 byte followed by three such bytes.
     *
     * @param mixed $text String, array of values, or any other value.
     * @return mixed UTF-8 string; array with converted elements (keys are
     *               kept); any other type is returned unchanged.
     */
    public static function toUTF8($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toUTF8($v);
            }
            return $text;
        }

        if (!is_string($text)) {
            return $text;
        }

        $max = strlen($text);
        $buf = '';

        for ($i = 0; $i < $max; $i++) {
            $c1 = $text[$i];
            $o1 = ord($c1);

            if ($o1 < 0x80) {
                // Plain ASCII needs no conversion.
                $buf .= $c1;
                continue;
            }

            if ($o1 < 0xc0) {
                // A 0x80-0xBF byte outside a UTF-8 sequence: use the
                // Windows-1252 table when it has an entry, otherwise fall
                // back to the generic two-byte encoding.
                $buf .= isset(self::$win1252ToUtf8[$o1])
                    ? self::$win1252ToUtf8[$o1]
                    : self::singleByteToUtf8($c1);
                continue;
            }

            // Number of continuation bytes a UTF-8 lead byte of this value
            // announces; 0 means the byte cannot start a sequence.
            if ($o1 <= 0xdf) {
                $trail = 1;
            } elseif ($o1 <= 0xef) {
                $trail = 2;
            } elseif ($o1 <= 0xf7) {
                $trail = 3;
            } else {
                $trail = 0;
            }

            if ($trail > 0 && self::hasContinuationBytes($text, $i + 1, $trail, $max)) {
                // Already shaped like UTF-8: copy the whole sequence and
                // skip all of its continuation bytes (including the fourth
                // byte of a four-byte sequence).
                $buf .= substr($text, $i, $trail + 1);
                $i += $trail;
            } else {
                // Not UTF-8: re-encode the single byte.
                $buf .= self::singleByteToUtf8($c1);
            }
        }

        return $buf;
    }

    /**
     * Convert a string (or every element of an array, recursively) to
     * Windows-1252.
     *
     * The input is first normalised with toUTF8(). Characters that cannot be
     * represented in the target encoding become "?".
     *
     * @param mixed $text String, array of values, or any other value.
     * @return mixed Converted string or array; any other type is returned
     *               unchanged.
     */
    public static function toWin1252($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toWin1252($v);
            }
            return $text;
        }

        if (!is_string($text)) {
            return $text;
        }

        return self::utf8ToWin1252Bytes(self::toUTF8($text));
    }

    /**
     * Alias of toWin1252().
     *
     * @param mixed $text See toWin1252().
     * @return mixed See toWin1252().
     */
    public static function toISO8859($text)
    {
        return self::toWin1252($text);
    }

    /**
     * Alias of toWin1252().
     *
     * @param mixed $text See toWin1252().
     * @return mixed See toWin1252().
     */
    public static function toLatin1($text)
    {
        return self::toWin1252($text);
    }

    /**
     * Repair UTF-8 text that was encoded to UTF-8 more than once.
     *
     * The text is repeatedly decoded to Windows-1252 and passed through
     * toUTF8() until a pass no longer changes it, followed by one final
     * pass. Characters outside Windows-1252 do not survive the decoding step
     * and come back as "?".
     *
     * @param mixed $text String, array of values, or any other value.
     * @return mixed Repaired string or array; any other type is returned
     *               unchanged, consistent with toUTF8() and toWin1252().
     */
    public static function fixUTF8($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::fixUTF8($v);
            }
            return $text;
        }

        if (!is_string($text)) {
            return $text;
        }

        $last = '';
        while ($last !== $text) {
            $last = $text;
            $text = self::toUTF8(self::utf8ToWin1252Bytes($text));
        }

        return self::toUTF8(self::utf8ToWin1252Bytes($text));
    }

    /**
     * Fix UTF-8 text that was converted from Windows-1252 as if it had been
     * ISO 8859-1, i.e. whose 0x80-0x9F bytes were turned into the code
     * points U+0080-U+009F instead of the intended characters.
     *
     * See: http://en.wikipedia.org/wiki/Windows-1252
     *
     * @param mixed $text Subject passed to str_replace().
     * @return mixed The subject with every key of $brokenUtf8ToUtf8 replaced
     *               by its value.
     */
    public static function UTF8FixWin1252Chars($text)
    {
        return str_replace(
            array_keys(self::$brokenUtf8ToUtf8),
            array_values(self::$brokenUtf8ToUtf8),
            $text
        );
    }

    /**
     * Strip a leading UTF-8 byte order mark (EF BB BF), if present.
     *
     * @param string $str Input string.
     * @return string The input without a leading BOM.
     */
    public static function removeBOM($str = "")
    {
        if (substr($str, 0, 3) === "\xef\xbb\xbf") {
            $str = substr($str, 3);
        }
        return $str;
    }

    /**
     * Encode one byte in the range 0x80-0xFF as the two-byte UTF-8 sequence
     * for the code point with the same numeric value.
     *
     * @param string $byte Single-byte string.
     * @return string Two-byte UTF-8 sequence.
     */
    private static function singleByteToUtf8($byte)
    {
        $o = ord($byte);

        // Integer shift rather than division: chr() must receive an int.
        return chr(0xc0 | ($o >> 6)) . chr(0x80 | ($o & 0x3f));
    }

    /**
     * Tell whether $count bytes starting at $start all exist and all lie in
     * the UTF-8 continuation range 0x80-0xBF.
     *
     * @param string $text   Subject string.
     * @param int    $start  Offset of the first byte to inspect.
     * @param int    $count  Number of bytes to inspect.
     * @param int    $length strlen($text), passed in to avoid recomputing it.
     * @return bool
     */
    private static function hasContinuationBytes($text, $start, $count, $length)
    {
        if ($start + $count > $length) {
            return false;
        }

        for ($j = $start; $j < $start + $count; $j++) {
            if ((ord($text[$j]) & 0xc0) !== 0x80) {
                return false;
            }
        }

        return true;
    }

    /**
     * Decode UTF-8 to single-byte Windows-1252.
     *
     * The Windows-1252 specific characters are first rewritten as the
     * two-byte sequence for the code point equal to their target byte
     * (e.g. the euro sign becomes C2 80), so that the generic decoding step
     * below emits the right byte for them. Substituting the raw byte
     * directly would produce invalid UTF-8 that the decoder must reject.
     *
     * Decoding rules: ASCII is copied; a valid two-byte sequence for
     * U+0080-U+00FF becomes the byte of the same value; any other
     * well-shaped multi-byte sequence becomes a single "?"; a byte that does
     * not start a well-shaped sequence becomes "?" and decoding resumes at
     * the next byte.
     *
     * @param string $utf8 UTF-8 text.
     * @return string Windows-1252 bytes.
     */
    private static function utf8ToWin1252Bytes($utf8)
    {
        $pairs = array();
        foreach (self::$utf8ToWin1252 as $sequence => $byte) {
            $pairs[$sequence] = "\xc2" . $byte;
        }
        $utf8 = strtr($utf8, $pairs);

        $length = strlen($utf8);
        $out = '';
        $i = 0;

        while ($i < $length) {
            $o1 = ord($utf8[$i]);

            if ($o1 < 0x80) {
                $out .= $utf8[$i];
                $i++;
                continue;
            }

            // 0xC0/0xC1 (overlong) and 0xF5+ can never start valid UTF-8.
            if ($o1 >= 0xc2 && $o1 <= 0xdf) {
                $trail = 1;
            } elseif ($o1 >= 0xe0 && $o1 <= 0xef) {
                $trail = 2;
            } elseif ($o1 >= 0xf0 && $o1 <= 0xf4) {
                $trail = 3;
            } else {
                $trail = 0;
            }

            if ($trail === 0 || !self::hasContinuationBytes($utf8, $i + 1, $trail, $length)) {
                $out .= '?';
                $i++;
                continue;
            }

            if ($trail === 1 && $o1 <= 0xc3) {
                // Lead bytes C2 and C3 cover exactly U+0080-U+00FF.
                $out .= chr((($o1 & 0x03) << 6) | (ord($utf8[$i + 1]) & 0x3f));
            } else {
                $out .= '?';
            }
            $i += $trail + 1;
        }

        return $out;
    }
}