1 files changed, 262 insertions, 0 deletions
diff --git a/inc/Encoding.php b/inc/Encoding.php
new file mode 100755
index 00000000..ac107af9
--- /dev/null
+++ b/inc/Encoding.php
@@ -0,0 +1,262 @@
+<?php 
+/**
+ * @author   "Sebastián Grignoli" <grignoli@framework2.com.ar>
+ * @package  Encoding
+ * @version  1.1
+ * @link     http://www.framework2.com.ar/dzone/forceUTF8-es/
+ * @example  http://www.framework2.com.ar/dzone/forceUTF8-es/
+  */
+class Encoding {
+
+    /**
+     * Windows-1252 bytes from the 0x80-0x9F range (keyed by integer byte
+     * value) mapped to the UTF-8 encoding of the character Windows-1252
+     * assigns to them. Byte values absent from this table (129, 141, 143,
+     * 144, 157) fall back to the plain ISO 8859-1 conversion in toUTF8().
+     */
+    protected static $win1252ToUtf8 = array(
+        128 => "\xe2\x82\xac",
+        130 => "\xe2\x80\x9a",
+        131 => "\xc6\x92",
+        132 => "\xe2\x80\x9e",
+        133 => "\xe2\x80\xa6",
+        134 => "\xe2\x80\xa0",
+        135 => "\xe2\x80\xa1",
+        136 => "\xcb\x86",
+        137 => "\xe2\x80\xb0",
+        138 => "\xc5\xa0",
+        139 => "\xe2\x80\xb9",
+        140 => "\xc5\x92",
+        142 => "\xc5\xbd",
+        145 => "\xe2\x80\x98",
+        146 => "\xe2\x80\x99",
+        147 => "\xe2\x80\x9c",
+        148 => "\xe2\x80\x9d",
+        149 => "\xe2\x80\xa2",
+        150 => "\xe2\x80\x93",
+        151 => "\xe2\x80\x94",
+        152 => "\xcb\x9c",
+        153 => "\xe2\x84\xa2",
+        154 => "\xc5\xa1",
+        155 => "\xe2\x80\xba",
+        156 => "\xc5\x93",
+        158 => "\xc5\xbe",
+        159 => "\xc5\xb8"
+    );
+
+    /**
+     * Two-byte UTF-8 sequences "\xc2\x80".."\xc2\x9f" (what results when
+     * Windows-1252 bytes are converted as if they were ISO 8859-1) mapped to
+     * the UTF-8 encoding of the intended Windows-1252 character.
+     */
+    protected static $brokenUtf8ToUtf8 = array(
+        "\xc2\x80" => "\xe2\x82\xac",
+
+        "\xc2\x82" => "\xe2\x80\x9a",
+        "\xc2\x83" => "\xc6\x92",
+        "\xc2\x84" => "\xe2\x80\x9e",
+        "\xc2\x85" => "\xe2\x80\xa6",
+        "\xc2\x86" => "\xe2\x80\xa0",
+        "\xc2\x87" => "\xe2\x80\xa1",
+        "\xc2\x88" => "\xcb\x86",
+        "\xc2\x89" => "\xe2\x80\xb0",
+        "\xc2\x8a" => "\xc5\xa0",
+        "\xc2\x8b" => "\xe2\x80\xb9",
+        "\xc2\x8c" => "\xc5\x92",
+
+        "\xc2\x8e" => "\xc5\xbd",
+
+
+        "\xc2\x91" => "\xe2\x80\x98",
+        "\xc2\x92" => "\xe2\x80\x99",
+        "\xc2\x93" => "\xe2\x80\x9c",
+        "\xc2\x94" => "\xe2\x80\x9d",
+        "\xc2\x95" => "\xe2\x80\xa2",
+        "\xc2\x96" => "\xe2\x80\x93",
+        "\xc2\x97" => "\xe2\x80\x94",
+        "\xc2\x98" => "\xcb\x9c",
+        "\xc2\x99" => "\xe2\x84\xa2",
+        "\xc2\x9a" => "\xc5\xa1",
+        "\xc2\x9b" => "\xe2\x80\xba",
+        "\xc2\x9c" => "\xc5\x93",
+
+        "\xc2\x9e" => "\xc5\xbe",
+        "\xc2\x9f" => "\xc5\xb8"
+    );
+
+    /**
+     * Reverse of $win1252ToUtf8: UTF-8 sequences mapped to the single
+     * Windows-1252 byte (0x80-0x9F) that represents the same character.
+     */
+    protected static $utf8ToWin1252 = array(
+        "\xe2\x82\xac" => "\x80",
+
+        "\xe2\x80\x9a" => "\x82",
+        "\xc6\x92"     => "\x83",
+        "\xe2\x80\x9e" => "\x84",
+        "\xe2\x80\xa6" => "\x85",
+        "\xe2\x80\xa0" => "\x86",
+        "\xe2\x80\xa1" => "\x87",
+        "\xcb\x86"     => "\x88",
+        "\xe2\x80\xb0" => "\x89",
+        "\xc5\xa0"     => "\x8a",
+        "\xe2\x80\xb9" => "\x8b",
+        "\xc5\x92"     => "\x8c",
+
+        "\xc5\xbd"     => "\x8e",
+
+
+        "\xe2\x80\x98" => "\x91",
+        "\xe2\x80\x99" => "\x92",
+        "\xe2\x80\x9c" => "\x93",
+        "\xe2\x80\x9d" => "\x94",
+        "\xe2\x80\xa2" => "\x95",
+        "\xe2\x80\x93" => "\x96",
+        "\xe2\x80\x94" => "\x97",
+        "\xcb\x9c"     => "\x98",
+        "\xe2\x84\xa2" => "\x99",
+        "\xc5\xa1"     => "\x9a",
+        "\xe2\x80\xba" => "\x9b",
+        "\xc5\x93"     => "\x9c",
+
+        "\xc5\xbe"     => "\x9e",
+        "\xc5\xb8"     => "\x9f"
+    );
+
+    /**
+     * Function Encoding::toUTF8
+     *
+     * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
+     *
+     * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1.
+     *
+     * It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
+     *
+     * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
+     *    are followed by any of these:  ("group B")
+     *                                    ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶•¸¹º»¼½¾¿
+     * For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
+     * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
+     * is also a valid unicode character, and will be left unchanged.
+     *
+     * 2) when any of these: àáâãäåæçèéêëìíîï  are followed by TWO chars from group B,
+     * 3) when any of these: ðñòó  are followed by THREE chars from group B.
+     *
+     * Arrays are converted element by element (recursively, keys untouched);
+     * values that are neither arrays nor strings are returned unchanged.
+     *
+     * @name toUTF8
+     * @param mixed $text  Any string, or an array of values to convert.
+     * @return mixed  The same string (or array), UTF8 encoded
+     */
+    public static function toUTF8($text)
+    {
+        if (is_array($text)) {
+            foreach ($text as $k => $v) {
+                $text[$k] = self::toUTF8($v);
+            }
+            return $text;
+        }
+        if (!is_string($text)) {
+            return $text;
+        }
+
+        $max = strlen($text);
+        $buf = "";
+        for ($i = 0; $i < $max; $i++) {
+            // Bracket offsets: the {$i} form is a parse error since PHP 8.0.
+            $byte = ord($text[$i]);
+
+            if ($byte < 0x80) {
+                // Plain ASCII needs no conversion.
+                $buf .= $text[$i];
+                continue;
+            }
+
+            if ($byte < 0xC0) {
+                // 0x80-0xBF outside a multi-byte sequence: a Windows-1252
+                // special character if listed, otherwise an ISO 8859-1 byte.
+                $buf .= isset(self::$win1252ToUtf8[$byte])
+                    ? self::$win1252ToUtf8[$byte]
+                    : self::latin1ByteToUtf8($byte);
+                continue;
+            }
+
+            // 0xC0 and above: possibly the lead byte of a UTF-8 sequence.
+            if ($byte <= 0xDF) {
+                $trailing = 1;
+            } elseif ($byte <= 0xEF) {
+                $trailing = 2;
+            } elseif ($byte <= 0xF7) {
+                $trailing = 3;
+            } else {
+                $trailing = 0; // 0xF8-0xFF never starts a sequence handled here
+            }
+
+            if ($trailing > 0 && self::hasContinuationBytes($text, $i + 1, $trailing)) {
+                // Looks like UTF-8 already: copy the WHOLE sequence and skip
+                // all of its continuation bytes, including the fourth byte of
+                // a 4-byte sequence, so none of them is converted again.
+                $buf .= substr($text, $i, $trailing + 1);
+                $i += $trailing;
+            } else {
+                // Not valid UTF-8: treat the byte as ISO 8859-1.
+                $buf .= self::latin1ByteToUtf8($byte);
+            }
+        }
+        return $buf;
+    }
+
+    /**
+     * Converts text to Windows-1252: the input is first normalised with
+     * toUTF8(), then decoded to single bytes.
+     *
+     * Arrays are converted element by element; values that are neither
+     * arrays nor strings are returned unchanged.
+     *
+     * @param mixed $text  String or array of values to convert.
+     * @return mixed
+     */
+    public static function toWin1252($text)
+    {
+        if (is_array($text)) {
+            foreach ($text as $k => $v) {
+                $text[$k] = self::toWin1252($v);
+            }
+            return $text;
+        }
+        if (is_string($text)) {
+            return self::utf8ToWin1252Bytes(self::toUTF8($text));
+        }
+        return $text;
+    }
+
+    /**
+     * Alias of toWin1252().
+     *
+     * @param mixed $text
+     * @return mixed
+     */
+    public static function toISO8859($text)
+    {
+        return self::toWin1252($text);
+    }
+
+    /**
+     * Alias of toWin1252().
+     *
+     * @param mixed $text
+     * @return mixed
+     */
+    public static function toLatin1($text)
+    {
+        return self::toWin1252($text);
+    }
+
+    /**
+     * Repairs text that was UTF-8 encoded more than once by repeatedly
+     * decoding it to Windows-1252 and re-encoding it with toUTF8() until the
+     * result stops changing.
+     *
+     * Arrays are fixed element by element; values that are neither arrays
+     * nor strings are returned unchanged.
+     *
+     * @param mixed $text  String or array of values to repair.
+     * @return mixed
+     */
+    public static function fixUTF8($text)
+    {
+        if (is_array($text)) {
+            foreach ($text as $k => $v) {
+                $text[$k] = self::fixUTF8($v);
+            }
+            return $text;
+        }
+        if (!is_string($text)) {
+            // Same contract as toUTF8()/toWin1252(): do not coerce scalars.
+            return $text;
+        }
+
+        // Iterate to a fixed point. The strict comparison avoids PHP's
+        // numeric-string juggling ("1e3" == "1000") ending the loop early.
+        do {
+            $last = $text;
+            $text = self::toUTF8(self::utf8ToWin1252Bytes($text));
+        } while ($last !== $text);
+
+        return $text;
+    }
+
+    /**
+     * If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1
+     * (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
+     * See: http://en.wikipedia.org/wiki/Windows-1252
+     *
+     * @param string|array $text  Passed straight to str_replace() as the subject.
+     * @return string|array
+     */
+    public static function UTF8FixWin1252Chars($text)
+    {
+        return str_replace(
+            array_keys(self::$brokenUtf8ToUtf8),
+            array_values(self::$brokenUtf8ToUtf8),
+            $text
+        );
+    }
+
+    /**
+     * Strips a leading UTF-8 byte order mark (EF BB BF) if present.
+     *
+     * @param string $str
+     * @return string  $str without the BOM; unchanged if it has none.
+     */
+    public static function removeBOM($str = "")
+    {
+        if (substr($str, 0, 3) == "\xef\xbb\xbf") {
+            $str = substr($str, 3);
+        }
+        return $str;
+    }
+
+    /**
+     * Encodes one ISO 8859-1 byte value (0x80-0xFF) as a two-byte UTF-8
+     * sequence, using integer arithmetic only.
+     *
+     * @param int $byte  Byte value as returned by ord().
+     * @return string
+     */
+    private static function latin1ByteToUtf8($byte)
+    {
+        return chr(0xC0 | ($byte >> 6)) . chr(0x80 | ($byte & 0x3F));
+    }
+
+    /**
+     * Tells whether $count bytes starting at offset $start all exist and all
+     * have the UTF-8 continuation form 10xxxxxx.
+     *
+     * @param string $text
+     * @param int    $start
+     * @param int    $count
+     * @return bool
+     */
+    private static function hasContinuationBytes($text, $start, $count)
+    {
+        $end = $start + $count;
+        if ($end > strlen($text)) {
+            return false; // sequence is truncated by the end of the string
+        }
+        for ($pos = $start; $pos < $end; $pos++) {
+            if ((ord($text[$pos]) & 0xC0) !== 0x80) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Replaces the UTF-8 sequences listed in $utf8ToWin1252 with their
+     * single Windows-1252 byte, then decodes the rest with utf8_decode().
+     *
+     * NOTE(review): utf8_decode() is deprecated as of PHP 8.2, and this step
+     * relies on it passing the raw 0x80-0x9F bytes produced by str_replace()
+     * through unchanged -- confirm on the PHP version in use.
+     *
+     * @param string $utf8
+     * @return string
+     */
+    private static function utf8ToWin1252Bytes($utf8)
+    {
+        return utf8_decode(str_replace(
+            array_keys(self::$utf8ToWin1252),
+            array_values(self::$utf8ToWin1252),
+            $utf8
+        ));
+    }
+}
+\ No newline at end of file

diff --git a/inc/Encoding.php b/inc/Encoding.php new file mode 100755 index 00000000..ac107af9 --- /dev/null +++ b/inc/Encoding.php
@@ -0,0 +1,262 @@
	1	<?php
	2	/**
	3	* @author "Sebastián Grignoli" <grignoli@framework2.com.ar>
	4	* @package Encoding
	5	* @version 1.1
	6	* @link http://www.framework2.com.ar/dzone/forceUTF8-es/
	7	* @example http://www.framework2.com.ar/dzone/forceUTF8-es/
	8	*/
	9
	10	class Encoding {
	11
	12	protected static $win1252ToUtf8 = array(
	13	128 => "\xe2\x82\xac",
	14
	15	130 => "\xe2\x80\x9a",
	16	131 => "\xc6\x92",
	17	132 => "\xe2\x80\x9e",
	18	133 => "\xe2\x80\xa6",
	19	134 => "\xe2\x80\xa0",
	20	135 => "\xe2\x80\xa1",
	21	136 => "\xcb\x86",
	22	137 => "\xe2\x80\xb0",
	23	138 => "\xc5\xa0",
	24	139 => "\xe2\x80\xb9",
	25	140 => "\xc5\x92",
	26
	27	142 => "\xc5\xbd",
	28
	29
	30	145 => "\xe2\x80\x98",
	31	146 => "\xe2\x80\x99",
	32	147 => "\xe2\x80\x9c",
	33	148 => "\xe2\x80\x9d",
	34	149 => "\xe2\x80\xa2",
	35	150 => "\xe2\x80\x93",
	36	151 => "\xe2\x80\x94",
	37	152 => "\xcb\x9c",
	38	153 => "\xe2\x84\xa2",
	39	154 => "\xc5\xa1",
	40	155 => "\xe2\x80\xba",
	41	156 => "\xc5\x93",
	42
	43	158 => "\xc5\xbe",
	44	159 => "\xc5\xb8"
	45	);
	46
	47	protected static $brokenUtf8ToUtf8 = array(
	48	"\xc2\x80" => "\xe2\x82\xac",
	49
	50	"\xc2\x82" => "\xe2\x80\x9a",
	51	"\xc2\x83" => "\xc6\x92",
	52	"\xc2\x84" => "\xe2\x80\x9e",
	53	"\xc2\x85" => "\xe2\x80\xa6",
	54	"\xc2\x86" => "\xe2\x80\xa0",
	55	"\xc2\x87" => "\xe2\x80\xa1",
	56	"\xc2\x88" => "\xcb\x86",
	57	"\xc2\x89" => "\xe2\x80\xb0",
	58	"\xc2\x8a" => "\xc5\xa0",
	59	"\xc2\x8b" => "\xe2\x80\xb9",
	60	"\xc2\x8c" => "\xc5\x92",
	61
	62	"\xc2\x8e" => "\xc5\xbd",
	63
	64
	65	"\xc2\x91" => "\xe2\x80\x98",
	66	"\xc2\x92" => "\xe2\x80\x99",
	67	"\xc2\x93" => "\xe2\x80\x9c",
	68	"\xc2\x94" => "\xe2\x80\x9d",
	69	"\xc2\x95" => "\xe2\x80\xa2",
	70	"\xc2\x96" => "\xe2\x80\x93",
	71	"\xc2\x97" => "\xe2\x80\x94",
	72	"\xc2\x98" => "\xcb\x9c",
	73	"\xc2\x99" => "\xe2\x84\xa2",
	74	"\xc2\x9a" => "\xc5\xa1",
	75	"\xc2\x9b" => "\xe2\x80\xba",
	76	"\xc2\x9c" => "\xc5\x93",
	77
	78	"\xc2\x9e" => "\xc5\xbe",
	79	"\xc2\x9f" => "\xc5\xb8"
	80	);
	81
	82	protected static $utf8ToWin1252 = array(
	83	"\xe2\x82\xac" => "\x80",
	84
	85	"\xe2\x80\x9a" => "\x82",
	86	"\xc6\x92" => "\x83",
	87	"\xe2\x80\x9e" => "\x84",
	88	"\xe2\x80\xa6" => "\x85",
	89	"\xe2\x80\xa0" => "\x86",
	90	"\xe2\x80\xa1" => "\x87",
	91	"\xcb\x86" => "\x88",
	92	"\xe2\x80\xb0" => "\x89",
	93	"\xc5\xa0" => "\x8a",
	94	"\xe2\x80\xb9" => "\x8b",
	95	"\xc5\x92" => "\x8c",
	96
	97	"\xc5\xbd" => "\x8e",
	98
	99
	100	"\xe2\x80\x98" => "\x91",
	101	"\xe2\x80\x99" => "\x92",
	102	"\xe2\x80\x9c" => "\x93",
	103	"\xe2\x80\x9d" => "\x94",
	104	"\xe2\x80\xa2" => "\x95",
	105	"\xe2\x80\x93" => "\x96",
	106	"\xe2\x80\x94" => "\x97",
	107	"\xcb\x9c" => "\x98",
	108	"\xe2\x84\xa2" => "\x99",
	109	"\xc5\xa1" => "\x9a",
	110	"\xe2\x80\xba" => "\x9b",
	111	"\xc5\x93" => "\x9c",
	112
	113	"\xc5\xbe" => "\x9e",
	114	"\xc5\xb8" => "\x9f"
	115	);
	116
	117	static function toUTF8($text){
	118	/**
	119	* Function Encoding::toUTF8
	120	*
	121	* This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
	122	*
	123	* It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1.
	124	*
	125	* It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
	126	*
	127	* 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
	128	* are followed by any of these: ("group B")
	129	* ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶•¸¹º»¼½¾¿
	130	* For example: %ABREPRESENT%C9%BB. «REPRESENTÉ»
	131	* The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
	132	* is also a valid unicode character, and will be left unchanged.
	133	*
	134	* 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B,
	135	* 3) when any of these: ðñòó are followed by THREE chars from group B.
	136	*
	137	* @name toUTF8
	138	* @param string $text Any string.
	139	* @return string The same string, UTF8 encoded
	140	*
	141	*/
	142
	143	if(is_array($text))
	144	{
	145	foreach($text as $k => $v)
	146	{
	147	$text[$k] = self::toUTF8($v);
	148	}
	149	return $text;
	150	} elseif(is_string($text)) {
	151
	152	$max = strlen($text);
	153	$buf = "";
	154	for($i = 0; $i < $max; $i++){
	155	$c1 = $text{$i};
	156	if($c1>="\xc0"){ //Should be converted to UTF8, if it's not UTF8 already
	157	$c2 = $i+1 >= $max? "\x00" : $text{$i+1};
	158	$c3 = $i+2 >= $max? "\x00" : $text{$i+2};
	159	$c4 = $i+3 >= $max? "\x00" : $text{$i+3};
	160	if($c1 >= "\xc0" & $c1 <= "\xdf"){ //looks like 2 bytes UTF8
	161	if($c2 >= "\x80" && $c2 <= "\xbf"){ //yeah, almost sure it's UTF8 already
	162	$buf .= $c1 . $c2;
	163	$i++;
	164	} else { //not valid UTF8. Convert it.
	165	$cc1 = (chr(ord($c1) / 64) | "\xc0");
	166	$cc2 = ($c1 & "\x3f") | "\x80";
	167	$buf .= $cc1 . $cc2;
	168	}
	169	} elseif($c1 >= "\xe0" & $c1 <= "\xef"){ //looks like 3 bytes UTF8
	170	if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf"){ //yeah, almost sure it's UTF8 already
	171	$buf .= $c1 . $c2 . $c3;
	172	$i = $i + 2;
	173	} else { //not valid UTF8. Convert it.
	174	$cc1 = (chr(ord($c1) / 64) | "\xc0");
	175	$cc2 = ($c1 & "\x3f") | "\x80";
	176	$buf .= $cc1 . $cc2;
	177	}
	178	} elseif($c1 >= "\xf0" & $c1 <= "\xf7"){ //looks like 4 bytes UTF8
	179	if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf"){ //yeah, almost sure it's UTF8 already
	180	$buf .= $c1 . $c2 . $c3;
	181	$i = $i + 2;
	182	} else { //not valid UTF8. Convert it.
	183	$cc1 = (chr(ord($c1) / 64) | "\xc0");
	184	$cc2 = ($c1 & "\x3f") | "\x80";
	185	$buf .= $cc1 . $cc2;
	186	}
	187	} else { //doesn't look like UTF8, but should be converted
	188	$cc1 = (chr(ord($c1) / 64) | "\xc0");
	189	$cc2 = (($c1 & "\x3f") | "\x80");
	190	$buf .= $cc1 . $cc2;
	191	}
	192	} elseif(($c1 & "\xc0") == "\x80"){ // needs conversion
	193	if(isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases
	194	$buf .= self::$win1252ToUtf8[ord($c1)];
	195	} else {
	196	$cc1 = (chr(ord($c1) / 64) | "\xc0");
	197	$cc2 = (($c1 & "\x3f") | "\x80");
	198	$buf .= $cc1 . $cc2;
	199	}
	200	} else { // it doesn't need convesion
	201	$buf .= $c1;
	202	}
	203	}
	204	return $buf;
	205	} else {
	206	return $text;
	207	}
	208	}
	209
	210	static function toWin1252($text) {
	211	if(is_array($text)) {
	212	foreach($text as $k => $v) {
	213	$text[$k] = self::toWin1252($v);
	214	}
	215	return $text;
	216	} elseif(is_string($text)) {
	217	return utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)));
	218	} else {
	219	return $text;
	220	}
	221	}
	222
	223	static function toISO8859($text) {
	224	return self::toWin1252($text);
	225	}
	226
	227	static function toLatin1($text) {
	228	return self::toWin1252($text);
	229	}
	230
	231	static function fixUTF8($text){
	232	if(is_array($text)) {
	233	foreach($text as $k => $v) {
	234	$text[$k] = self::fixUTF8($v);
	235	}
	236	return $text;
	237	}
	238
	239	$last = "";
	240	while($last <> $text){
	241	$last = $text;
	242	$text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text)));
	243	}
	244	$text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text)));
	245	return $text;
	246	}
	247
	248	static function UTF8FixWin1252Chars($text){
	249	// If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1
	250	// (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
	251	// See: http://en.wikipedia.org/wiki/Windows-1252
	252
	253	return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
	254	}
	255
	256	static function removeBOM($str=""){
	257	if(substr($str, 0,3) == pack("CCC",0xef,0xbb,0xbf)) {
	258	$str=substr($str, 3);
	259	}
	260	return $str;
	261	}
	262	} \ No newline at end of file