--- /dev/null
+<?php\r
+/**\r
+ * @author "Sebastián Grignoli" <grignoli@framework2.com.ar>\r
+ * @package Encoding\r
+ * @version 1.1\r
+ * @link http://www.framework2.com.ar/dzone/forceUTF8-es/\r
+ * @example http://www.framework2.com.ar/dzone/forceUTF8-es/\r
+ */\r
+\r
+/**\r
+ * Static helpers for repairing and converting text that mixes UTF-8 with\r
+ * Windows-1252 / ISO 8859-1 bytes.\r
+ *\r
+ * All public methods take the text as a plain PHP string (a byte sequence).\r
+ * toUTF8(), toWin1252() and fixUTF8() also accept arrays, which are converted\r
+ * element by element, and return any other type unchanged.\r
+ */\r
+class Encoding {\r
+\r
+  /**\r
+   * Windows-1252 byte value (0x80-0x9F) => UTF-8 byte sequence of the character it represents.\r
+   * 129, 141, 143, 144 and 157 have no entry because Windows-1252 leaves them undefined.\r
+   */\r
+  protected static $win1252ToUtf8 = array(\r
+    128 => "\xe2\x82\xac",\r
+    130 => "\xe2\x80\x9a",\r
+    131 => "\xc6\x92",\r
+    132 => "\xe2\x80\x9e",\r
+    133 => "\xe2\x80\xa6",\r
+    134 => "\xe2\x80\xa0",\r
+    135 => "\xe2\x80\xa1",\r
+    136 => "\xcb\x86",\r
+    137 => "\xe2\x80\xb0",\r
+    138 => "\xc5\xa0",\r
+    139 => "\xe2\x80\xb9",\r
+    140 => "\xc5\x92",\r
+    142 => "\xc5\xbd",\r
+    145 => "\xe2\x80\x98",\r
+    146 => "\xe2\x80\x99",\r
+    147 => "\xe2\x80\x9c",\r
+    148 => "\xe2\x80\x9d",\r
+    149 => "\xe2\x80\xa2",\r
+    150 => "\xe2\x80\x93",\r
+    151 => "\xe2\x80\x94",\r
+    152 => "\xcb\x9c",\r
+    153 => "\xe2\x84\xa2",\r
+    154 => "\xc5\xa1",\r
+    155 => "\xe2\x80\xba",\r
+    156 => "\xc5\x93",\r
+    158 => "\xc5\xbe",\r
+    159 => "\xc5\xb8"\r
+  );\r
+\r
+  /**\r
+   * UTF-8 encoding of a C1 code point (U+0080-U+009F) => UTF-8 encoding of the\r
+   * Windows-1252 character that has the same byte value. Used by UTF8FixWin1252Chars().\r
+   */\r
+  protected static $brokenUtf8ToUtf8 = array(\r
+    "\xc2\x80" => "\xe2\x82\xac",\r
+    "\xc2\x82" => "\xe2\x80\x9a",\r
+    "\xc2\x83" => "\xc6\x92",\r
+    "\xc2\x84" => "\xe2\x80\x9e",\r
+    "\xc2\x85" => "\xe2\x80\xa6",\r
+    "\xc2\x86" => "\xe2\x80\xa0",\r
+    "\xc2\x87" => "\xe2\x80\xa1",\r
+    "\xc2\x88" => "\xcb\x86",\r
+    "\xc2\x89" => "\xe2\x80\xb0",\r
+    "\xc2\x8a" => "\xc5\xa0",\r
+    "\xc2\x8b" => "\xe2\x80\xb9",\r
+    "\xc2\x8c" => "\xc5\x92",\r
+    "\xc2\x8e" => "\xc5\xbd",\r
+    "\xc2\x91" => "\xe2\x80\x98",\r
+    "\xc2\x92" => "\xe2\x80\x99",\r
+    "\xc2\x93" => "\xe2\x80\x9c",\r
+    "\xc2\x94" => "\xe2\x80\x9d",\r
+    "\xc2\x95" => "\xe2\x80\xa2",\r
+    "\xc2\x96" => "\xe2\x80\x93",\r
+    "\xc2\x97" => "\xe2\x80\x94",\r
+    "\xc2\x98" => "\xcb\x9c",\r
+    "\xc2\x99" => "\xe2\x84\xa2",\r
+    "\xc2\x9a" => "\xc5\xa1",\r
+    "\xc2\x9b" => "\xe2\x80\xba",\r
+    "\xc2\x9c" => "\xc5\x93",\r
+    "\xc2\x9e" => "\xc5\xbe",\r
+    "\xc2\x9f" => "\xc5\xb8"\r
+  );\r
+\r
+  /**\r
+   * UTF-8 byte sequence => single Windows-1252 byte (0x80-0x9F); the inverse of $win1252ToUtf8.\r
+   */\r
+  protected static $utf8ToWin1252 = array(\r
+    "\xe2\x82\xac" => "\x80",\r
+    "\xe2\x80\x9a" => "\x82",\r
+    "\xc6\x92"     => "\x83",\r
+    "\xe2\x80\x9e" => "\x84",\r
+    "\xe2\x80\xa6" => "\x85",\r
+    "\xe2\x80\xa0" => "\x86",\r
+    "\xe2\x80\xa1" => "\x87",\r
+    "\xcb\x86"     => "\x88",\r
+    "\xe2\x80\xb0" => "\x89",\r
+    "\xc5\xa0"     => "\x8a",\r
+    "\xe2\x80\xb9" => "\x8b",\r
+    "\xc5\x92"     => "\x8c",\r
+    "\xc5\xbd"     => "\x8e",\r
+    "\xe2\x80\x98" => "\x91",\r
+    "\xe2\x80\x99" => "\x92",\r
+    "\xe2\x80\x9c" => "\x93",\r
+    "\xe2\x80\x9d" => "\x94",\r
+    "\xe2\x80\xa2" => "\x95",\r
+    "\xe2\x80\x93" => "\x96",\r
+    "\xe2\x80\x94" => "\x97",\r
+    "\xcb\x9c"     => "\x98",\r
+    "\xe2\x84\xa2" => "\x99",\r
+    "\xc5\xa1"     => "\x9a",\r
+    "\xe2\x80\xba" => "\x9b",\r
+    "\xc5\x93"     => "\x9c",\r
+    "\xc5\xbe"     => "\x9e",\r
+    "\xc5\xb8"     => "\x9f"\r
+  );\r
+\r
+  /**\r
+   * Leaves byte sequences that look like UTF-8 alone and converts every other\r
+   * byte >= 0x80 to UTF-8, assuming it is Windows-1252 or ISO 8859-1.\r
+   *\r
+   * Because detection is purely structural, a conversion is skipped whenever\r
+   * Latin-1 text happens to look like a UTF-8 sequence:\r
+   *\r
+   * 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß\r
+   *    are followed by any of these ("group B"):\r
+   *    ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶•¸¹º»¼½¾¿\r
+   *    For example: %ABREPRESENT%C9%BB. «REPRESENTÉ»\r
+   *    The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)\r
+   *    is also a valid UTF-8 sequence, and will be left unchanged.\r
+   * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B,\r
+   * 3) when any of these: ðñòó are followed by THREE chars from group B.\r
+   *\r
+   * @param mixed $text A string, or an array whose values are converted recursively.\r
+   * @return mixed The UTF-8 string (or converted array); any other type is returned unchanged.\r
+   */\r
+  public static function toUTF8($text){\r
+    if(is_array($text)) {\r
+      foreach($text as $k => $v) {\r
+        $text[$k] = self::toUTF8($v);\r
+      }\r
+      return $text;\r
+    }\r
+    if(!is_string($text)) {\r
+      return $text;\r
+    }\r
+\r
+    $max = strlen($text);\r
+    $buf = "";\r
+    for($i = 0; $i < $max; $i++){\r
+      $c1 = $text[$i];\r
+      $code = ord($c1);\r
+\r
+      if($code < 0x80) { // plain ASCII, nothing to convert\r
+        $buf .= $c1;\r
+        continue;\r
+      }\r
+\r
+      if($code < 0xc0) { // 0x80-0xBF outside a sequence: a Windows-1252 / Latin-1 byte\r
+        $buf .= isset(self::$win1252ToUtf8[$code])\r
+          ? self::$win1252ToUtf8[$code]\r
+          : self::latin1ByteToUtf8($c1);\r
+        continue;\r
+      }\r
+\r
+      // 0xC0-0xFF: possible UTF-8 lead byte. Work out how many continuation bytes it announces.\r
+      if($code <= 0xdf) {\r
+        $extra = 1;\r
+      } elseif($code <= 0xef) {\r
+        $extra = 2;\r
+      } elseif($code <= 0xf7) {\r
+        $extra = 3;\r
+      } else {\r
+        $extra = 0; // 0xF8-0xFF never start a sequence\r
+      }\r
+\r
+      if($extra > 0 && self::hasContinuationBytes($text, $i + 1, $extra)) {\r
+        // Almost certainly UTF-8 already: copy the whole sequence and skip past it.\r
+        $buf .= substr($text, $i, $extra + 1);\r
+        $i += $extra;\r
+      } else {\r
+        // Not a complete sequence: treat the byte as Latin-1 and encode it.\r
+        $buf .= self::latin1ByteToUtf8($c1);\r
+      }\r
+    }\r
+    return $buf;\r
+  }\r
+\r
+  /**\r
+   * Converts text to single-byte Windows-1252.\r
+   *\r
+   * The input is normalised with toUTF8() first. Characters that have no\r
+   * Windows-1252 / ISO 8859-1 equivalent come out as "?" (utf8_decode() behaviour).\r
+   *\r
+   * @param mixed $text A string, or an array whose values are converted recursively.\r
+   * @return mixed The converted string or array; any other type is returned unchanged.\r
+   */\r
+  public static function toWin1252($text) {\r
+    if(is_array($text)) {\r
+      foreach($text as $k => $v) {\r
+        $text[$k] = self::toWin1252($v);\r
+      }\r
+      return $text;\r
+    }\r
+    if(!is_string($text)) {\r
+      return $text;\r
+    }\r
+    return self::utf8ToWin1252Bytes(self::toUTF8($text));\r
+  }\r
+\r
+  /**\r
+   * Alias of toWin1252(); the result may therefore contain Windows-1252 bytes in 0x80-0x9F.\r
+   *\r
+   * @param mixed $text\r
+   * @return mixed\r
+   */\r
+  public static function toISO8859($text) {\r
+    return self::toWin1252($text);\r
+  }\r
+\r
+  /**\r
+   * Alias of toWin1252(); the result may therefore contain Windows-1252 bytes in 0x80-0x9F.\r
+   *\r
+   * @param mixed $text\r
+   * @return mixed\r
+   */\r
+  public static function toLatin1($text) {\r
+    return self::toWin1252($text);\r
+  }\r
+\r
+  /**\r
+   * Repairs UTF-8 that was encoded to UTF-8 more than once.\r
+   *\r
+   * The text is decoded to Windows-1252 and re-encoded with toUTF8() repeatedly\r
+   * until a pass no longer changes it. Because every pass goes through a\r
+   * single-byte decode, characters outside Windows-1252 / ISO 8859-1 end up as "?".\r
+   *\r
+   * @param mixed $text A string, or an array whose values are fixed recursively.\r
+   * @return mixed The repaired string or array; any other type is returned unchanged.\r
+   */\r
+  public static function fixUTF8($text){\r
+    if(is_array($text)) {\r
+      foreach($text as $k => $v) {\r
+        $text[$k] = self::fixUTF8($v);\r
+      }\r
+      return $text;\r
+    }\r
+    if(!is_string($text)) {\r
+      return $text;\r
+    }\r
+\r
+    do {\r
+      $last = $text;\r
+      $text = self::toUTF8(self::utf8ToWin1252Bytes($text));\r
+    } while($last !== $text); // strict: both sides are strings, avoid numeric-string juggling\r
+\r
+    return $text;\r
+  }\r
+\r
+  /**\r
+   * Fixes UTF-8 that was produced by converting Windows-1252 text as if it were\r
+   * ISO 8859-1 (so bytes 0x80-0x9F became C1 control code points): each such\r
+   * sequence is replaced with the UTF-8 encoding of the intended character.\r
+   * See: http://en.wikipedia.org/wiki/Windows-1252\r
+   *\r
+   * @param string|array $text Passed straight to str_replace() as the subject.\r
+   * @return string|array\r
+   */\r
+  public static function UTF8FixWin1252Chars($text){\r
+    return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);\r
+  }\r
+\r
+  /**\r
+   * Strips a leading UTF-8 byte order mark (EF BB BF) if there is one.\r
+   *\r
+   * @param string $str\r
+   * @return string\r
+   */\r
+  public static function removeBOM($str=""){\r
+    if(substr($str, 0, 3) === "\xef\xbb\xbf") {\r
+      $str = substr($str, 3);\r
+    }\r
+    return $str;\r
+  }\r
+\r
+  /**\r
+   * Encodes one byte (expected to be in 0x80-0xFF) as the two-byte UTF-8\r
+   * sequence of the ISO 8859-1 character with the same value.\r
+   *\r
+   * @param string $byte A one-byte string.\r
+   * @return string Two bytes.\r
+   */\r
+  protected static function latin1ByteToUtf8($byte) {\r
+    $code = ord($byte);\r
+    return chr(0xc0 | ($code >> 6)) . chr(0x80 | ($code & 0x3f));\r
+  }\r
+\r
+  /**\r
+   * Tells whether $count bytes starting at offset $start all exist and all lie\r
+   * in the UTF-8 continuation range 0x80-0xBF.\r
+   *\r
+   * @param string $text\r
+   * @param int $start\r
+   * @param int $count\r
+   * @return bool\r
+   */\r
+  protected static function hasContinuationBytes($text, $start, $count) {\r
+    if($start + $count > strlen($text)) {\r
+      return false; // sequence is cut off by the end of the string\r
+    }\r
+    for($j = $start; $j < $start + $count; $j++) {\r
+      $code = ord($text[$j]);\r
+      if($code < 0x80 || $code > 0xbf) {\r
+        return false;\r
+      }\r
+    }\r
+    return true;\r
+  }\r
+\r
+  /**\r
+   * Decodes UTF-8 to Windows-1252 bytes.\r
+   *\r
+   * The sequences listed in $utf8ToWin1252 become their single 0x80-0x9F byte;\r
+   * everything between them goes through utf8_decode(). The special sequences\r
+   * are kept away from utf8_decode() on purpose: a bare 0x80-0x9F byte is not\r
+   * valid UTF-8 and utf8_decode() would replace it with "?".\r
+   *\r
+   * NOTE(review): utf8_decode() is deprecated as of PHP 8.2; swap it for an\r
+   * equivalent UTF-8 => ISO 8859-1 conversion when the supported PHP range allows.\r
+   *\r
+   * @param string $text\r
+   * @return string\r
+   */\r
+  protected static function utf8ToWin1252Bytes($text) {\r
+    // Byte-mode pattern (no /u) so that input that is not valid UTF-8 is still processed.\r
+    $pattern = '/(' . implode('|', array_map('preg_quote', array_keys(self::$utf8ToWin1252))) . ')/';\r
+    $pieces = preg_split($pattern, $text, -1, PREG_SPLIT_DELIM_CAPTURE);\r
+    if($pieces === false) {\r
+      return utf8_decode($text); // regex engine failure: fall back to the plain decode\r
+    }\r
+\r
+    $out = "";\r
+    foreach($pieces as $piece) {\r
+      // A piece equal to a table key can only be a captured delimiter.\r
+      $out .= isset(self::$utf8ToWin1252[$piece])\r
+        ? self::$utf8ToWin1252[$piece]\r
+        : utf8_decode($piece);\r
+    }\r
+    return $out;\r
+  }\r
+}
\ No newline at end of file