diff options
Diffstat (limited to 'inc/Encoding.php')
-rwxr-xr-x | inc/Encoding.php | 262 |
1 files changed, 262 insertions, 0 deletions
diff --git a/inc/Encoding.php b/inc/Encoding.php new file mode 100755 index 00000000..ac107af9 --- /dev/null +++ b/inc/Encoding.php | |||
@@ -0,0 +1,262 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * @author "Sebastián Grignoli" <grignoli@framework2.com.ar> | ||
4 | * @package Encoding | ||
5 | * @version 1.1 | ||
6 | * @link http://www.framework2.com.ar/dzone/forceUTF8-es/ | ||
7 | * @example http://www.framework2.com.ar/dzone/forceUTF8-es/ | ||
8 | */ | ||
9 | |||
/**
 * Heuristic converter between Windows-1252 / ISO-8859-1 and UTF-8.
 *
 * All methods are static. Strings are treated as raw byte sequences; arrays
 * are converted element by element (recursively) and any other value is
 * returned untouched.
 */
class Encoding
{
    /**
     * Windows-1252 byte value (0x80-0x9F) => UTF-8 encoding of the character
     * it stands for. Bytes with no Windows-1252 meaning (129, 141, 143, 144,
     * 157) are deliberately absent.
     */
    protected static $win1252ToUtf8 = array(
        128 => "\xe2\x82\xac",

        130 => "\xe2\x80\x9a",
        131 => "\xc6\x92",
        132 => "\xe2\x80\x9e",
        133 => "\xe2\x80\xa6",
        134 => "\xe2\x80\xa0",
        135 => "\xe2\x80\xa1",
        136 => "\xcb\x86",
        137 => "\xe2\x80\xb0",
        138 => "\xc5\xa0",
        139 => "\xe2\x80\xb9",
        140 => "\xc5\x92",

        142 => "\xc5\xbd",


        145 => "\xe2\x80\x98",
        146 => "\xe2\x80\x99",
        147 => "\xe2\x80\x9c",
        148 => "\xe2\x80\x9d",
        149 => "\xe2\x80\xa2",
        150 => "\xe2\x80\x93",
        151 => "\xe2\x80\x94",
        152 => "\xcb\x9c",
        153 => "\xe2\x84\xa2",
        154 => "\xc5\xa1",
        155 => "\xe2\x80\xba",
        156 => "\xc5\x93",

        158 => "\xc5\xbe",
        159 => "\xc5\xb8"
    );

    /**
     * UTF-8 encoding of U+0080..U+009F (what a Windows-1252 byte becomes when
     * it is converted as if it were ISO-8859-1) => UTF-8 encoding of the
     * character the Windows-1252 byte really stands for.
     */
    protected static $brokenUtf8ToUtf8 = array(
        "\xc2\x80" => "\xe2\x82\xac",

        "\xc2\x82" => "\xe2\x80\x9a",
        "\xc2\x83" => "\xc6\x92",
        "\xc2\x84" => "\xe2\x80\x9e",
        "\xc2\x85" => "\xe2\x80\xa6",
        "\xc2\x86" => "\xe2\x80\xa0",
        "\xc2\x87" => "\xe2\x80\xa1",
        "\xc2\x88" => "\xcb\x86",
        "\xc2\x89" => "\xe2\x80\xb0",
        "\xc2\x8a" => "\xc5\xa0",
        "\xc2\x8b" => "\xe2\x80\xb9",
        "\xc2\x8c" => "\xc5\x92",

        "\xc2\x8e" => "\xc5\xbd",


        "\xc2\x91" => "\xe2\x80\x98",
        "\xc2\x92" => "\xe2\x80\x99",
        "\xc2\x93" => "\xe2\x80\x9c",
        "\xc2\x94" => "\xe2\x80\x9d",
        "\xc2\x95" => "\xe2\x80\xa2",
        "\xc2\x96" => "\xe2\x80\x93",
        "\xc2\x97" => "\xe2\x80\x94",
        "\xc2\x98" => "\xcb\x9c",
        "\xc2\x99" => "\xe2\x84\xa2",
        "\xc2\x9a" => "\xc5\xa1",
        "\xc2\x9b" => "\xe2\x80\xba",
        "\xc2\x9c" => "\xc5\x93",

        "\xc2\x9e" => "\xc5\xbe",
        "\xc2\x9f" => "\xc5\xb8"
    );

    /**
     * UTF-8 encoding of a character => the single Windows-1252 byte
     * (0x80-0x9F) that represents it. Inverse of $win1252ToUtf8.
     */
    protected static $utf8ToWin1252 = array(
        "\xe2\x82\xac" => "\x80",

        "\xe2\x80\x9a" => "\x82",
        "\xc6\x92"     => "\x83",
        "\xe2\x80\x9e" => "\x84",
        "\xe2\x80\xa6" => "\x85",
        "\xe2\x80\xa0" => "\x86",
        "\xe2\x80\xa1" => "\x87",
        "\xcb\x86"     => "\x88",
        "\xe2\x80\xb0" => "\x89",
        "\xc5\xa0"     => "\x8a",
        "\xe2\x80\xb9" => "\x8b",
        "\xc5\x92"     => "\x8c",

        "\xc5\xbd"     => "\x8e",


        "\xe2\x80\x98" => "\x91",
        "\xe2\x80\x99" => "\x92",
        "\xe2\x80\x9c" => "\x93",
        "\xe2\x80\x9d" => "\x94",
        "\xe2\x80\xa2" => "\x95",
        "\xe2\x80\x93" => "\x96",
        "\xe2\x80\x94" => "\x97",
        "\xcb\x9c"     => "\x98",
        "\xe2\x84\xa2" => "\x99",
        "\xc5\xa1"     => "\x9a",
        "\xe2\x80\xba" => "\x9b",
        "\xc5\x93"     => "\x9c",

        "\xc5\xbe"     => "\x9e",
        "\xc5\xb8"     => "\x9f"
    );

    /**
     * Leaves UTF-8 sequences alone while converting every other high byte to
     * UTF-8, assuming the non-UTF-8 bytes are Windows-1252 or ISO-8859-1.
     *
     * A byte >= 0xC0 that is followed by the number of continuation bytes
     * (0x80-0xBF) its value announces is taken to be UTF-8 already and copied
     * verbatim. Because of that, conversion may be skipped in these cases:
     *
     * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
     *    are followed by any of these ("group B"):
     *                                    ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶•¸¹º»¼½¾¿
     *    For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
     *    The "«" (%AB) character will be converted, but the "É" followed by
     *    "»" (%C9%BB) is also a valid UTF-8 sequence and is left unchanged.
     *
     * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B,
     * 3) when any of these: ðñòó are followed by THREE chars from group B.
     *
     * @param mixed $text String to convert, or an array converted recursively.
     * @return mixed The UTF-8 string (or converted array); any other type is
     *               returned unchanged.
     */
    public static function toUTF8($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toUTF8($v);
            }
            return $text;
        }
        if (!is_string($text)) {
            return $text;
        }

        $max = strlen($text);
        $buf = '';
        for ($i = 0; $i < $max; $i++) {
            $byte = $text[$i];
            $code = ord($byte);

            if ($code < 0x80) { // plain ASCII never needs conversion
                $buf .= $byte;
                continue;
            }

            $trail = self::trailingByteCount($code);
            if ($trail > 0 && self::hasContinuationBytes($text, $i + 1, $trail)) {
                // Looks like UTF-8 already: copy the WHOLE sequence (lead byte
                // plus all of its continuation bytes) and step over it.
                $buf .= substr($text, $i, $trail + 1);
                $i += $trail;
            } elseif (isset(self::$win1252ToUtf8[$code])) {
                // Windows-1252 special case (only bytes 0x80-0x9F are listed).
                $buf .= self::$win1252ToUtf8[$code];
            } else {
                $buf .= self::latin1ByteToUtf8($code);
            }
        }
        return $buf;
    }

    /**
     * Converts text to Windows-1252. The input is first normalised with
     * toUTF8(), so it may be UTF-8, Windows-1252 or a mix of both.
     * Characters that Windows-1252 cannot represent become "?".
     *
     * @param mixed $text String to convert, or an array converted recursively.
     * @return mixed The Windows-1252 string (or converted array); any other
     *               type is returned unchanged.
     */
    public static function toWin1252($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toWin1252($v);
            }
            return $text;
        }
        if (!is_string($text)) {
            return $text;
        }
        return self::utf8ToWin1252Bytes(self::toUTF8($text));
    }

    /**
     * Alias of toWin1252().
     *
     * @param mixed $text
     * @return mixed
     */
    public static function toISO8859($text)
    {
        return self::toWin1252($text);
    }

    /**
     * Alias of toWin1252().
     *
     * @param mixed $text
     * @return mixed
     */
    public static function toLatin1($text)
    {
        return self::toWin1252($text);
    }

    /**
     * Repairs UTF-8 that was encoded to UTF-8 more than once: the text is
     * decoded one level and re-normalised with toUTF8() repeatedly until the
     * result stops changing.
     *
     * Characters that have no Windows-1252 representation are replaced by
     * "?" by the decoding step.
     *
     * @param mixed $text String to repair, or an array repaired recursively.
     * @return mixed The repaired string (or array); any other type is
     *               returned unchanged, as toUTF8() and toWin1252() do.
     */
    public static function fixUTF8($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::fixUTF8($v);
            }
            return $text;
        }
        if (!is_string($text)) {
            return $text;
        }

        // Once a pass returns its own input, a further pass cannot change
        // anything, so no extra trailing pass is needed.
        do {
            $previous = $text;
            $text = self::toUTF8(self::utf8ToWin1252Bytes($text));
        } while ($previous !== $text);

        return $text;
    }

    /**
     * Fixes a UTF-8 string that was converted from Windows-1252 as if it were
     * ISO-8859-1 (i.e. ignoring the Windows-1252 characters 0x80 to 0x9F).
     * See: http://en.wikipedia.org/wiki/Windows-1252
     *
     * @param mixed $text Subject passed straight to str_replace().
     * @return mixed Result of the replacement.
     */
    public static function UTF8FixWin1252Chars($text)
    {
        return str_replace(
            array_keys(self::$brokenUtf8ToUtf8),
            array_values(self::$brokenUtf8ToUtf8),
            $text
        );
    }

    /**
     * Strips a leading UTF-8 byte order mark (EF BB BF), if present.
     *
     * @param string $str
     * @return string $str without the BOM; unchanged when there is none.
     */
    public static function removeBOM($str = "")
    {
        if (strncmp($str, "\xef\xbb\xbf", 3) === 0) {
            // Cast: before PHP 7, substr() returned false instead of "" when
            // the string consisted of the BOM only.
            return (string) substr($str, 3);
        }
        return $str;
    }

    /**
     * Number of continuation bytes a UTF-8 lead byte announces.
     *
     * @param int $lead Byte value 0-255.
     * @return int 1 for 0xC0-0xDF, 2 for 0xE0-0xEF, 3 for 0xF0-0xF7, and 0
     *             for every value that cannot start a multi-byte sequence.
     */
    protected static function trailingByteCount($lead)
    {
        if ($lead < 0xc0 || $lead > 0xf7) {
            return 0;
        }
        if ($lead >= 0xf0) {
            return 3;
        }
        return $lead >= 0xe0 ? 2 : 1;
    }

    /**
     * Tells whether $count bytes starting at $offset all exist and are UTF-8
     * continuation bytes (bit pattern 10xxxxxx, i.e. 0x80-0xBF).
     *
     * @param string $bytes
     * @param int    $offset
     * @param int    $count
     * @return bool
     */
    protected static function hasContinuationBytes($bytes, $offset, $count)
    {
        if ($offset + $count > strlen($bytes)) {
            return false;
        }
        for ($n = 0; $n < $count; $n++) {
            if ((ord($bytes[$offset + $n]) & 0xc0) !== 0x80) {
                return false;
            }
        }
        return true;
    }

    /**
     * Encodes one byte value in the range 0x80-0xFF as the two-byte UTF-8
     * sequence of the code point with the same number (the ISO-8859-1 mapping).
     *
     * @param int $code Byte value 0x80-0xFF.
     * @return string Two-byte UTF-8 sequence.
     */
    protected static function latin1ByteToUtf8($code)
    {
        // Integer shift instead of chr(ord($c) / 64): no float reaches chr().
        return chr(0xc0 | ($code >> 6)) . chr(0x80 | ($code & 0x3f));
    }

    /**
     * Decodes UTF-8 into single Windows-1252 bytes in one pass.
     *
     * Sequences listed in $utf8ToWin1252 become their 0x80-0x9F byte, other
     * code points up to U+00FF become the byte with the same value, and
     * everything else (unrepresentable characters, stray high bytes) becomes
     * "?".
     *
     * This replaces the former str_replace() + utf8_decode() pair. That pair
     * inserted raw 0x80-0x9F bytes and then handed them to utf8_decode(),
     * which replaces bytes that are not valid UTF-8 with "?" and is
     * deprecated as of PHP 8.2.
     *
     * @param string $utf8
     * @return string
     */
    protected static function utf8ToWin1252Bytes($utf8)
    {
        $len = strlen($utf8);
        $out = '';
        for ($i = 0; $i < $len; $i++) {
            $byte = $utf8[$i];
            $lead = ord($byte);

            if ($lead < 0x80) {
                $out .= $byte;
                continue;
            }

            $trail = self::trailingByteCount($lead);
            if ($trail === 0 || !self::hasContinuationBytes($utf8, $i + 1, $trail)) {
                $out .= '?'; // stray byte, not part of a complete sequence
                continue;
            }

            $sequence = substr($utf8, $i, $trail + 1);
            $i += $trail;

            if (isset(self::$utf8ToWin1252[$sequence])) {
                $out .= self::$utf8ToWin1252[$sequence];
            } elseif ($lead === 0xc2 || $lead === 0xc3) {
                // U+0080..U+00FF: same value as the target byte.
                $out .= chr((($lead & 0x1f) << 6) | (ord($sequence[1]) & 0x3f));
            } else {
                $out .= '?'; // no Windows-1252 representation
            }
        }
        return $out;
    }
}