inc/3rdparty/Encoding.php

   1 <?php
   2 /**
   3  * @author   "Sebastián Grignoli" <grignoli@framework2.com.ar>
   4  * @package  Encoding
   5  * @version  1.1
   6  * @link     http://www.framework2.com.ar/dzone/forceUTF8-es/
   7  * @example  http://www.framework2.com.ar/dzone/forceUTF8-es/
   8   */
   9
  /**
   * Heuristic conversion between UTF-8 and Windows-1252 / ISO 8859-1.
   *
   * All methods are static and byte-oriented. Strings are converted, arrays are
   * converted element by element (recursively), and any other value is returned
   * unchanged.
   */
  class Encoding {

    /**
     * Windows-1252 byte value (0x80-0x9F) => UTF-8 encoding of the character
     * Windows-1252 assigns to that byte. Byte values without an entry
     * (129, 141, 143, 144, 157) are converted like plain ISO 8859-1 bytes.
     */
    protected static $win1252ToUtf8 = array(
      128 => "\xe2\x82\xac",

      130 => "\xe2\x80\x9a",
      131 => "\xc6\x92",
      132 => "\xe2\x80\x9e",
      133 => "\xe2\x80\xa6",
      134 => "\xe2\x80\xa0",
      135 => "\xe2\x80\xa1",
      136 => "\xcb\x86",
      137 => "\xe2\x80\xb0",
      138 => "\xc5\xa0",
      139 => "\xe2\x80\xb9",
      140 => "\xc5\x92",

      142 => "\xc5\xbd",


      145 => "\xe2\x80\x98",
      146 => "\xe2\x80\x99",
      147 => "\xe2\x80\x9c",
      148 => "\xe2\x80\x9d",
      149 => "\xe2\x80\xa2",
      150 => "\xe2\x80\x93",
      151 => "\xe2\x80\x94",
      152 => "\xcb\x9c",
      153 => "\xe2\x84\xa2",
      154 => "\xc5\xa1",
      155 => "\xe2\x80\xba",
      156 => "\xc5\x93",

      158 => "\xc5\xbe",
      159 => "\xc5\xb8"
    );

    /**
     * UTF-8 encoding of a C1 code point (U+0080-U+009F) => UTF-8 encoding of the
     * character Windows-1252 assigns to the same byte value.
     * Used by UTF8FixWin1252Chars().
     */
    protected static $brokenUtf8ToUtf8 = array(
      "\xc2\x80" => "\xe2\x82\xac",

      "\xc2\x82" => "\xe2\x80\x9a",
      "\xc2\x83" => "\xc6\x92",
      "\xc2\x84" => "\xe2\x80\x9e",
      "\xc2\x85" => "\xe2\x80\xa6",
      "\xc2\x86" => "\xe2\x80\xa0",
      "\xc2\x87" => "\xe2\x80\xa1",
      "\xc2\x88" => "\xcb\x86",
      "\xc2\x89" => "\xe2\x80\xb0",
      "\xc2\x8a" => "\xc5\xa0",
      "\xc2\x8b" => "\xe2\x80\xb9",
      "\xc2\x8c" => "\xc5\x92",

      "\xc2\x8e" => "\xc5\xbd",


      "\xc2\x91" => "\xe2\x80\x98",
      "\xc2\x92" => "\xe2\x80\x99",
      "\xc2\x93" => "\xe2\x80\x9c",
      "\xc2\x94" => "\xe2\x80\x9d",
      "\xc2\x95" => "\xe2\x80\xa2",
      "\xc2\x96" => "\xe2\x80\x93",
      "\xc2\x97" => "\xe2\x80\x94",
      "\xc2\x98" => "\xcb\x9c",
      "\xc2\x99" => "\xe2\x84\xa2",
      "\xc2\x9a" => "\xc5\xa1",
      "\xc2\x9b" => "\xe2\x80\xba",
      "\xc2\x9c" => "\xc5\x93",

      "\xc2\x9e" => "\xc5\xbe",
      "\xc2\x9f" => "\xc5\xb8"
    );

    /**
     * UTF-8 byte sequence => single Windows-1252 byte (0x80-0x9F).
     * Inverse of $win1252ToUtf8.
     */
    protected static $utf8ToWin1252 = array(
      "\xe2\x82\xac" => "\x80",

      "\xe2\x80\x9a" => "\x82",
      "\xc6\x92"     => "\x83",
      "\xe2\x80\x9e" => "\x84",
      "\xe2\x80\xa6" => "\x85",
      "\xe2\x80\xa0" => "\x86",
      "\xe2\x80\xa1" => "\x87",
      "\xcb\x86"     => "\x88",
      "\xe2\x80\xb0" => "\x89",
      "\xc5\xa0"     => "\x8a",
      "\xe2\x80\xb9" => "\x8b",
      "\xc5\x92"     => "\x8c",

      "\xc5\xbd"     => "\x8e",


      "\xe2\x80\x98" => "\x91",
      "\xe2\x80\x99" => "\x92",
      "\xe2\x80\x9c" => "\x93",
      "\xe2\x80\x9d" => "\x94",
      "\xe2\x80\xa2" => "\x95",
      "\xe2\x80\x93" => "\x96",
      "\xe2\x80\x94" => "\x97",
      "\xcb\x9c"     => "\x98",
      "\xe2\x84\xa2" => "\x99",
      "\xc5\xa1"     => "\x9a",
      "\xe2\x80\xba" => "\x9b",
      "\xc5\x93"     => "\x9c",

      "\xc5\xbe"     => "\x9e",
      "\xc5\xb8"     => "\x9f"
    );

    /**
     * Function Encoding::toUTF8
     *
     * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
     *
     * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1.
     *
     * It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
     *
     * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
     *    are followed by any of these:  ("group B")
     *                                    ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶•¸¹º»¼½¾¿
     * For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
     * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
     * is also a valid unicode character, and will be left unchanged.
     *
     * 2) when any of these: àáâãäåæçèéêëìíîï  are followed by TWO chars from group B,
     * 3) when any of these: ðñòó  are followed by THREE chars from group B.
     *
     * Only the lead-byte / continuation-byte structure is checked; overlong or
     * otherwise ill-formed sequences with that structure are left untouched.
     *
     * @name toUTF8
     * @param mixed $text  Any string, or an array of values to convert recursively.
     * @return mixed  The same string, UTF8 encoded. Arrays come back with every
     *                element converted; non-string scalars are returned unchanged.
     */
    public static function toUTF8($text) {
      if (is_array($text)) {
        foreach ($text as $k => $v) {
          $text[$k] = self::toUTF8($v);
        }
        return $text;
      }
      if (!is_string($text)) {
        return $text;
      }

      $max = strlen($text);
      $buf = "";
      for ($i = 0; $i < $max; $i++) {
        $byte = ord($text[$i]);

        if ($byte < 0x80) { // plain ASCII, it doesn't need conversion
          $buf .= $text[$i];
          continue;
        }

        if ($byte < 0xc0) { // 0x80-0xBF outside of a UTF-8 sequence: needs conversion
          $buf .= isset(self::$win1252ToUtf8[$byte])
            ? self::$win1252ToUtf8[$byte] // found in Windows-1252 special cases
            : self::byteToUtf8($byte);
          continue;
        }

        // 0xC0 and above: keep it if it starts something that looks like UTF-8,
        // otherwise treat it as a single ISO 8859-1 byte and encode it.
        $trailing = self::trailingByteCount($byte);
        if ($trailing > 0 && self::hasContinuationBytes($text, $i + 1, $trailing)) {
          // Copy the WHOLE sequence (lead byte plus every continuation byte) and
          // step over all of it, so no continuation byte is ever re-read as a
          // stray Windows-1252 byte on the next iteration.
          $buf .= substr($text, $i, $trailing + 1);
          $i += $trailing;
        } else {
          $buf .= self::byteToUtf8($byte);
        }
      }
      return $buf;
    }

    /**
     * Converts a string (or array of strings) to Windows-1252.
     *
     * The input is first normalised with toUTF8(), then decoded to one byte per
     * character. Characters that Windows-1252 cannot represent become "?".
     *
     * @param mixed $text  String, array, or any other value.
     * @return mixed  Windows-1252 string, converted array, or the value unchanged
     *                when it is neither a string nor an array.
     */
    public static function toWin1252($text) {
      if (is_array($text)) {
        foreach ($text as $k => $v) {
          $text[$k] = self::toWin1252($v);
        }
        return $text;
      }
      if (!is_string($text)) {
        return $text;
      }
      return self::decodeUtf8ToWin1252(self::toUTF8($text));
    }

    /**
     * Alias of toWin1252(). The result may contain Windows-1252 bytes in the
     * 0x80-0x9F range, which ISO 8859-1 reserves for control characters.
     *
     * @param mixed $text
     * @return mixed
     */
    public static function toISO8859($text) {
      return self::toWin1252($text);
    }

    /**
     * Alias of toWin1252(); see toISO8859().
     *
     * @param mixed $text
     * @return mixed
     */
    public static function toLatin1($text) {
      return self::toWin1252($text);
    }

    /**
     * Repairs UTF-8 that was encoded to UTF-8 more than once ("Ã©" for "é").
     *
     * The text is decoded to Windows-1252 and re-encoded with toUTF8() until a
     * pass changes nothing. Characters outside Windows-1252 are replaced by "?"
     * in the process, so this is only suitable for Western European text.
     *
     * @param mixed $text  String, array, or any other value.
     * @return mixed  Repaired UTF-8 string, converted array, or the value
     *                unchanged when it is neither a string nor an array.
     */
    public static function fixUTF8($text) {
      if (is_array($text)) {
        foreach ($text as $k => $v) {
          $text[$k] = self::fixUTF8($v);
        }
        return $text;
      }
      if (!is_string($text)) {
        return $text;
      }

      // Strict comparison: stop only when a pass is byte-for-byte a no-op. Once
      // that fixed point is reached a further pass cannot change anything, so no
      // extra pass is made after the loop.
      do {
        $last = $text;
        $text = self::toUTF8(self::decodeUtf8ToWin1252($text));
      } while ($last !== $text);

      return $text;
    }

    /**
     * If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1
     * (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
     * See: http://en.wikipedia.org/wiki/Windows-1252
     *
     * @param mixed $text  Subject passed straight to str_replace().
     * @return mixed  The subject with every key of $brokenUtf8ToUtf8 replaced by its value.
     */
    public static function UTF8FixWin1252Chars($text) {
      return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
    }

    /**
     * Strips a leading UTF-8 byte order mark (EF BB BF), if present.
     *
     * @param string $str
     * @return string  $str without the BOM; unchanged when there is none.
     */
    public static function removeBOM($str = "") {
      if (substr($str, 0, 3) === "\xef\xbb\xbf") {
        $str = substr($str, 3);
      }
      return $str;
    }

    /**
     * Number of continuation bytes announced by a UTF-8 lead byte.
     *
     * @param int $byte  Byte value 0-255.
     * @return int  1 for 0xC0-0xDF, 2 for 0xE0-0xEF, 3 for 0xF0-0xF7, otherwise 0.
     */
    protected static function trailingByteCount($byte) {
      if ($byte >= 0xc0 && $byte <= 0xdf) {
        return 1;
      }
      if ($byte >= 0xe0 && $byte <= 0xef) {
        return 2;
      }
      if ($byte >= 0xf0 && $byte <= 0xf7) {
        return 3;
      }
      return 0;
    }

    /**
     * Tells whether $count bytes starting at $start all exist and are UTF-8
     * continuation bytes (0x80-0xBF).
     *
     * @param string $text
     * @param int    $start  Offset of the first byte to check.
     * @param int    $count  How many bytes to check.
     * @return bool
     */
    protected static function hasContinuationBytes($text, $start, $count) {
      for ($k = 0; $k < $count; $k++) {
        if (!isset($text[$start + $k]) || (ord($text[$start + $k]) & 0xc0) !== 0x80) {
          return false;
        }
      }
      return true;
    }

    /**
     * Encodes one byte (0x80-0xFF), read as an ISO 8859-1 character, as a
     * two-byte UTF-8 sequence. Integer shifts are used so no float ever reaches chr().
     *
     * @param int $byte  Byte value 0x80-0xFF.
     * @return string  Two-byte UTF-8 sequence.
     */
    protected static function byteToUtf8($byte) {
      return chr(0xc0 | ($byte >> 6)) . chr(0x80 | ($byte & 0x3f));
    }

    /**
     * Decodes UTF-8 to Windows-1252 in a single pass.
     *
     * This does not depend on utf8_decode(), which is deprecated since PHP 8.2
     * and is documented to turn invalid bytes into "?" (that would wipe out the
     * 0x80-0x9F bytes this class needs to produce).
     *
     * - sequences listed in $utf8ToWin1252 become their Windows-1252 byte;
     * - other two-byte sequences for U+0080-U+00FF become that single byte;
     * - any other well-structured sequence (code point above U+00FF, or an
     *   overlong two-byte form) becomes "?";
     * - ASCII and bytes that are not part of a well-structured sequence (stray
     *   continuation bytes, invalid lead bytes, truncated sequences) are copied
     *   unchanged, so fixUTF8() can hand them to toUTF8() for conversion.
     *
     * @param string $text  UTF-8 (possibly with stray single bytes).
     * @return string  Windows-1252 bytes.
     */
    protected static function decodeUtf8ToWin1252($text) {
      $length = strlen($text);
      $out = "";
      for ($i = 0; $i < $length; $i++) {
        $byte = ord($text[$i]);
        $trailing = self::trailingByteCount($byte);

        if ($trailing === 0 || !self::hasContinuationBytes($text, $i + 1, $trailing)) {
          $out .= $text[$i];
          continue;
        }

        $sequence = substr($text, $i, $trailing + 1);
        $i += $trailing;

        if (isset(self::$utf8ToWin1252[$sequence])) {
          $out .= self::$utf8ToWin1252[$sequence];
          continue;
        }

        // Only two-byte sequences can encode U+0080-U+00FF.
        $codePoint = -1;
        if ($trailing === 1) {
          $codePoint = (($byte & 0x1f) << 6) | (ord($sequence[1]) & 0x3f);
        }
        $out .= ($codePoint >= 0x80 && $codePoint <= 0xff) ? chr($codePoint) : "?";
      }
      return $out;
    }
  }