inc/3rdparty/libraries/language-detect/LanguageDetect.php

   1 <?php
   2
   3 /**
   4  * Detects the language of a given piece of text.
   5  *
   6  * Attempts to detect the language of a sample of text by correlating ranked
   7  * 3-gram frequencies to a table of 3-gram frequencies of known languages.
   8  *
   9  * Implements a version of a technique originally proposed by Cavnar & Trenkle
  10  * (1994): "N-Gram-Based Text Categorization"
  11  *
  12  * PHP version 5
  13  *
  14  * @category  Text
  15  * @package   Text_LanguageDetect
  16  * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
  17  * @copyright 2005-2006 Nicholas Pisarro
  18  * @license   http://www.debian.org/misc/bsd.license BSD
  19  * @version   SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $
  20  * @link      http://pear.php.net/package/Text_LanguageDetect/
  21  * @link      http://langdetect.blogspot.com/
  22  */
  23
  24 require_once 'LanguageDetect/Exception.php';
  25 require_once 'LanguageDetect/Parser.php';
  26 require_once 'LanguageDetect/ISO639.php';
  27
  28 /**
  29  * Language detection class
  30  *
   31  * Requires the language model database (lang.dat) that should have
  32  * accompanied this class definition in order to be instantiated.
  33  *
  34  * Example usage:
  35  *
  36  * <code>
  37  * require_once 'Text/LanguageDetect.php';
  38  *
  39  * $l = new Text_LanguageDetect;
  40  *
  41  * $stdin = fopen('php://stdin', 'r');
  42  *
  43  * echo "Supported languages:\n";
  44  *
  45  * try {
  46  *     $langs = $l->getLanguages();
  47  * } catch (Text_LanguageDetect_Exception $e) {
  48  *     die($e->getMessage());
  49  * }
  50  *
  51  * sort($langs);
  52  * echo join(', ', $langs);
  53  *
  54  * while ($line = fgets($stdin)) {
  55  *     print_r($l->detect($line, 4));
  56  * }
  57  * </code>
  58  *
  59  * @category  Text
  60  * @package   Text_LanguageDetect
  61  * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
  62  * @copyright 2005 Nicholas Pisarro
  63  * @license   http://www.debian.org/misc/bsd.license BSD
  64  * @version   Release: @package_version@
  65  * @link      http://pear.php.net/package/Text_LanguageDetect/
  66  * @todo      allow users to generate their own language models
  67  */
  68 class Text_LanguageDetect
  69 {
  70     /**
  71      * The filename that stores the trigram data for the detector
  72      *
  73      * If this value starts with a slash (/) or a dot (.) the value of
  74      * $this->_data_dir will be ignored
  75      *
  76      * @var      string
  77      * @access   private
  78      */
  79     var $_db_filename = 'lang.dat';
  80
  81     /**
  82      * The filename that stores the unicode block definitions
  83      *
  84      * If this value starts with a slash (/) or a dot (.) the value of
  85      * $this->_data_dir will be ignored
  86      *
  87      * @var string
  88      * @access private
  89      */
  90     var $_unicode_db_filename = 'unicode_blocks.dat';
  91
  92     /**
  93      * The data directory
  94      *
  95      * Should be set by PEAR installer
  96      *
  97      * @var      string
  98      * @access   private
  99      */
 100     var $_data_dir = '@data_dir@';
 101
 102     /**
 103      * The trigram data for comparison
 104      *
 105      * Will be loaded on start from $this->_db_filename
 106      *
 107      * @var      array
 108      * @access   private
 109      */
 110     var $_lang_db = array();
 111
 112     /**
 113      * stores the map of the trigram data to unicode characters
 114      *
 115      * @access private
 116      * @var array
 117      */
 118     var $_unicode_map;
 119
 120     /**
 121      * The size of the trigram data arrays
 122      *
 123      * @var      int
 124      * @access   private
 125      */
 126     var $_threshold = 300;
 127
 128     /**
 129      * the maximum possible score.
 130      *
 131      * needed for score normalization. Different depending on the
 132      * perl compatibility setting
 133      *
 134      * @access  private
 135      * @var     int
 136      * @see     setPerlCompatible()
 137      */
 138     var $_max_score = 0;
 139
 140     /**
 141      * Whether or not to simulate perl's Language::Guess exactly
 142      *
 143      * @access  private
 144      * @var     bool
 145      * @see     setPerlCompatible()
 146      */
 147     var $_perl_compatible = false;
 148
 149     /**
 150      * Whether to use the unicode block detection to speed up processing
 151      *
 152      * @access private
 153      * @var bool
 154      */
 155     var $_use_unicode_narrowing = true;
 156
 157     /**
 158      * stores the result of the clustering operation
 159      *
 160      * @access  private
 161      * @var     array
 162      * @see     clusterLanguages()
 163      */
 164     var $_clusters;
 165
 166     /**
 167      * Which type of "language names" are accepted and returned:
 168      *
 169      * 0 - language name ("english")
 170      * 2 - 2-letter ISO 639-1 code ("en")
 171      * 3 - 3-letter ISO 639-2 code ("eng")
 172      */
 173     var $_name_mode = 0;
 174
 175     /**
 176      * Constructor
 177      *
 178      * Will attempt to load the language database. If it fails, you will get
 179      * an exception.
 180      */
 181     function __construct()
 182     {
 183         $data = $this->_readdb($this->_db_filename);
 184         $this->_checkTrigram($data['trigram']);
 185         $this->_lang_db = $data['trigram'];
 186
 187         if (isset($data['trigram-unicodemap'])) {
 188             $this->_unicode_map = $data['trigram-unicodemap'];
 189         }
 190
 191         // Not yet implemented:
 192         if (isset($data['trigram-clusters'])) {
 193             $this->_clusters = $data['trigram-clusters'];
 194         }
 195     }
 196
 197     /**
 198      * Returns the path to the location of the database
 199      *
 200      * @param string $fname File name to load
 201      *
 202      * @return string expected path to the language model database
 203      * @access private
 204      */
 205     function _get_data_loc($fname)
 206     {
 207         return dirname(__FILE__).'/'.$fname;
 208     }
 209
 210     /**
 211      * Loads the language trigram database from filename
 212      *
 213      * Trigram datbase should be a serialize()'d array
 214      *
 215      * @param string $fname the filename where the data is stored
 216      *
 217      * @return array the language model data
 218      * @throws Text_LanguageDetect_Exception
 219      * @access private
 220      */
 221     function _readdb($fname)
 222     {
 223         // finds the correct data dir
 224         $fname = $this->_get_data_loc($fname);
 225
 226         // input check
 227         if (!file_exists($fname)) {
 228             throw new Text_LanguageDetect_Exception(
 229                 'Language database does not exist: ' . $fname,
 230                 Text_LanguageDetect_Exception::DB_NOT_FOUND
 231             );
 232         } elseif (!is_readable($fname)) {
 233             throw new Text_LanguageDetect_Exception(
 234                 'Language database is not readable: ' . $fname,
 235                 Text_LanguageDetect_Exception::DB_NOT_READABLE
 236             );
 237         }
 238
 239         return unserialize(file_get_contents($fname));
 240     }
 241
 242
 243     /**
 244      * Checks if this object is ready to detect languages
 245      *
 246      * @param array $trigram Trigram data from database
 247      *
 248      * @return void
 249      * @access private
 250      */
 251     function _checkTrigram($trigram)
 252     {
 253         if (!is_array($trigram)) {
 254             if (ini_get('magic_quotes_runtime')) {
 255                 throw new Text_LanguageDetect_Exception(
 256                     'Error loading database. Try turning magic_quotes_runtime off.',
 257                     Text_LanguageDetect_Exception::MAGIC_QUOTES
 258                 );
 259             }
 260             throw new Text_LanguageDetect_Exception(
 261                 'Language database is not an array.',
 262                 Text_LanguageDetect_Exception::DB_NOT_ARRAY
 263             );
 264         } elseif (empty($trigram)) {
 265             throw new Text_LanguageDetect_Exception(
 266                 'Language database has no elements.',
 267                 Text_LanguageDetect_Exception::DB_EMPTY
 268             );
 269         }
 270     }
 271
 272     /**
 273      * Omits languages
 274      *
 275      * Pass this function the name of or an array of names of
 276      * languages that you don't want considered
 277      *
 278      * If you're only expecting a limited set of languages, this can greatly
 279      * speed up processing
 280      *
 281      * @param mixed $omit_list    language name or array of names to omit
 282      * @param bool  $include_only if true will include (rather than
 283      *                            exclude) only those in the list
 284      *
 285      * @return int number of languages successfully deleted
 286      * @throws Text_LanguageDetect_Exception
 287      */
 288     public function omitLanguages($omit_list, $include_only = false)
 289     {
 290         $deleted = 0;
 291
 292         $omit_list = $this->_convertFromNameMode($omit_list);
 293
 294         if (!$include_only) {
 295             // deleting the given languages
 296             if (!is_array($omit_list)) {
 297                 $omit_list = strtolower($omit_list); // case desensitize
 298                 if (isset($this->_lang_db[$omit_list])) {
 299                     unset($this->_lang_db[$omit_list]);
 300                     $deleted++;
 301                 }
 302             } else {
 303                 foreach ($omit_list as $omit_lang) {
 304                     if (isset($this->_lang_db[$omit_lang])) {
 305                         unset($this->_lang_db[$omit_lang]);
 306                         $deleted++;
 307                     }
 308                 }
 309             }
 310
 311         } else {
 312             // deleting all except the given languages
 313             if (!is_array($omit_list)) {
 314                 $omit_list = array($omit_list);
 315             }
 316
 317             // case desensitize
 318             foreach ($omit_list as $key => $omit_lang) {
 319                 $omit_list[$key] = strtolower($omit_lang);
 320             }
 321
 322             foreach (array_keys($this->_lang_db) as $lang) {
 323                 if (!in_array($lang, $omit_list)) {
 324                     unset($this->_lang_db[$lang]);
 325                     $deleted++;
 326                 }
 327             }
 328         }
 329
 330         // reset the cluster cache if the number of languages changes
 331         // this will then have to be recalculated
 332         if (isset($this->_clusters) && $deleted > 0) {
 333             $this->_clusters = null;
 334         }
 335
 336         return $deleted;
 337     }
 338
 339
 340     /**
 341      * Returns the number of languages that this object can detect
 342      *
 343      * @access public
 344      * @return int            the number of languages
 345      * @throws   Text_LanguageDetect_Exception
 346      */
 347     function getLanguageCount()
 348     {
 349         return count($this->_lang_db);
 350     }
 351
 352     /**
 353      * Checks if the language with the given name exists in the database
 354      *
 355      * @param mixed $lang Language name or array of language names
 356      *
 357      * @return bool true if language model exists
 358      */
 359     public function languageExists($lang)
 360     {
 361         $lang = $this->_convertFromNameMode($lang);
 362
 363         if (is_string($lang)) {
 364             return isset($this->_lang_db[strtolower($lang)]);
 365
 366         } elseif (is_array($lang)) {
 367             foreach ($lang as $test_lang) {
 368                 if (!isset($this->_lang_db[strtolower($test_lang)])) {
 369                     return false;
 370                 }
 371             }
 372             return true;
 373
 374         } else {
 375             throw new Text_LanguageDetect_Exception(
 376                 'Unsupported parameter type passed to languageExists()',
 377                 Text_LanguageDetect_Exception::PARAM_TYPE
 378             );
 379         }
 380     }
 381
 382     /**
 383      * Returns the list of detectable languages
 384      *
 385      * @access public
 386      * @return array        the names of the languages known to this object<<<<<<<
 387      * @throws   Text_LanguageDetect_Exception
 388      */
 389     function getLanguages()
 390     {
 391         return $this->_convertToNameMode(
 392             array_keys($this->_lang_db)
 393         );
 394     }
 395
 396     /**
 397      * Make this object behave like Language::Guess
 398      *
 399      * @param bool $setting false to turn off perl compatibility
 400      *
 401      * @return void
 402      */
 403     public function setPerlCompatible($setting = true)
 404     {
 405         if (is_bool($setting)) { // input check
 406             $this->_perl_compatible = $setting;
 407
 408             if ($setting == true) {
 409                 $this->_max_score = $this->_threshold;
 410             } else {
 411                 $this->_max_score = 0;
 412             }
 413         }
 414
 415     }
 416
 417     /**
 418      * Sets the way how language names are accepted and returned.
 419      *
 420      * @param integer $name_mode One of the following modes:
 421      *                           0 - language name ("english")
 422      *                           2 - 2-letter ISO 639-1 code ("en")
 423      *                           3 - 3-letter ISO 639-2 code ("eng")
 424      *
 425      * @return void
 426      */
 427     function setNameMode($name_mode)
 428     {
 429         $this->_name_mode = $name_mode;
 430     }
 431
 432     /**
 433      * Whether to use unicode block ranges in detection
 434      *
 435      * Should speed up most detections if turned on (detault is on). In some
 436      * circumstances it may be slower, such as for large text samples (> 10K)
 437      * in languages that use latin scripts. In other cases it should speed up
 438      * detection noticeably.
 439      *
 440      * @param bool $setting false to turn off
 441      *
 442      * @return void
 443      */
 444     public function useUnicodeBlocks($setting = true)
 445     {
 446         if (is_bool($setting)) {
 447             $this->_use_unicode_narrowing = $setting;
 448         }
 449     }
 450
 451     /**
 452      * Converts a piece of text into trigrams
 453      *
 454      * @param string $text text to convert
 455      *
 456      * @return     array array of trigram frequencies
 457      * @access     private
 458      * @deprecated Superceded by the Text_LanguageDetect_Parser class
 459      */
 460     function _trigram($text)
 461     {
 462         $s = new Text_LanguageDetect_Parser($text);
 463         $s->prepareTrigram();
 464         $s->prepareUnicode(false);
 465         $s->setPadStart(!$this->_perl_compatible);
 466         $s->analyze();
 467         return $s->getTrigramFreqs();
 468     }
 469
 470     /**
 471      * Converts a set of trigrams from frequencies to ranks
 472      *
 473      * Thresholds (cuts off) the list at $this->_threshold
 474      *
 475      * @param array $arr array of trigram
 476      *
 477      * @return array ranks of trigrams
 478      * @access protected
 479      */
 480     function _arr_rank($arr)
 481     {
 482
 483         // sorts alphabetically first as a standard way of breaking rank ties
 484         $this->_bub_sort($arr);
 485
 486         // below might also work, but seemed to introduce errors in testing
 487         //ksort($arr);
 488         //asort($arr);
 489
 490         $rank = array();
 491
 492         $i = 0;
 493         foreach ($arr as $key => $value) {
 494             $rank[$key] = $i++;
 495
 496             // cut off at a standard threshold
 497             if ($i >= $this->_threshold) {
 498                 break;
 499             }
 500         }
 501
 502         return $rank;
 503     }
 504
 505     /**
 506      * Sorts an array by value breaking ties alphabetically
 507      *
 508      * @param array &$arr the array to sort
 509      *
 510      * @return void
 511      * @access private
 512      */
 513     function _bub_sort(&$arr)
 514     {
 515         // should do the same as this perl statement:
 516         // sort { $trigrams{$b} == $trigrams{$a}
 517         //   ?  $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }
 518
 519         // needs to sort by both key and value at once
 520         // using the key to break ties for the value
 521
 522         // converts array into an array of arrays of each key and value
 523         // may be a better way of doing this
 524         $combined = array();
 525
 526         foreach ($arr as $key => $value) {
 527             $combined[] = array($key, $value);
 528         }
 529
 530         usort($combined, array($this, '_sort_func'));
 531
 532         $replacement = array();
 533         foreach ($combined as $key => $value) {
 534             list($new_key, $new_value) = $value;
 535             $replacement[$new_key] = $new_value;
 536         }
 537
 538         $arr = $replacement;
 539     }
 540
 541     /**
 542      * Sort function used by bubble sort
 543      *
 544      * Callback function for usort().
 545      *
 546      * @param array $a first param passed by usort()
 547      * @param array $b second param passed by usort()
 548      *
 549      * @return int 1 if $a is greater, -1 if not
 550      * @see    _bub_sort()
 551      * @access private
 552      */
 553     function _sort_func($a, $b)
 554     {
 555         // each is actually a key/value pair, so that it can compare using both
 556         list($a_key, $a_value) = $a;
 557         list($b_key, $b_value) = $b;
 558
 559         if ($a_value == $b_value) {
 560             // if the values are the same, break ties using the key
 561             return strcmp($a_key, $b_key);
 562
 563         } else {
 564             // if not, just sort normally
 565             if ($a_value > $b_value) {
 566                 return -1;
 567             } else {
 568                 return 1;
 569             }
 570         }
 571
 572         // 0 should not be possible because keys must be unique
 573     }
 574
 575     /**
 576      * Calculates a linear rank-order distance statistic between two sets of
 577      * ranked trigrams
 578      *
 579      * Sums the differences in rank for each trigram. If the trigram does not
 580      * appear in both, consider it a difference of $this->_threshold.
 581      *
 582      * This distance measure was proposed by Cavnar & Trenkle (1994). Despite
 583      * its simplicity it has been shown to be highly accurate for language
 584      * identification tasks.
 585      *
 586      * @param array $arr1 the reference set of trigram ranks
 587      * @param array $arr2 the target set of trigram ranks
 588      *
 589      * @return int the sum of the differences between the ranks of
 590      *             the two trigram sets
 591      * @access private
 592      */
 593     function _distance($arr1, $arr2)
 594     {
 595         $sumdist = 0;
 596
 597         foreach ($arr2 as $key => $value) {
 598             if (isset($arr1[$key])) {
 599                 $distance = abs($value - $arr1[$key]);
 600             } else {
 601                 // $this->_threshold sets the maximum possible distance value
 602                 // for any one pair of trigrams
 603                 $distance = $this->_threshold;
 604             }
 605             $sumdist += $distance;
 606         }
 607
 608         return $sumdist;
 609
 610         // todo: there are other distance statistics to try, e.g. relative
 611         //       entropy, but they're probably more costly to compute
 612     }
 613
 614     /**
 615      * Normalizes the score returned by _distance()
 616      *
 617      * Different if perl compatible or not
 618      *
 619      * @param int $score      the score from _distance()
 620      * @param int $base_count the number of trigrams being considered
 621      *
 622      * @return float the normalized score
 623      * @see    _distance()
 624      * @access private
 625      */
 626     function _normalize_score($score, $base_count = null)
 627     {
 628         if ($base_count === null) {
 629             $base_count = $this->_threshold;
 630         }
 631
 632         if (!$this->_perl_compatible) {
 633             return 1 - ($score / $base_count / $this->_threshold);
 634         } else {
 635             return floor($score / $base_count);
 636         }
 637     }
 638
 639
 640     /**
 641      * Detects the closeness of a sample of text to the known languages
 642      *
 643      * Calculates the statistical difference between the text and
 644      * the trigrams for each language, normalizes the score then
 645      * returns results for all languages in sorted order
 646      *
 647      * If perl compatible, the score is 300-0, 0 being most similar.
 648      * Otherwise, it's 0-1 with 1 being most similar.
 649      *
 650      * The $sample text should be at least a few sentences in length;
 651      * should be ascii-7 or utf8 encoded, if another and the mbstring extension
 652      * is present it will try to detect and convert. However, experience has
 653      * shown that mb_detect_encoding() *does not work very well* with at least
 654      * some types of encoding.
 655      *
 656      * @param string $sample a sample of text to compare.
 657      * @param int    $limit  if specified, return an array of the most likely
 658      *                       $limit languages and their scores.
 659      *
 660      * @return mixed sorted array of language scores, blank array if no
 661      *               useable text was found
 662      * @see    _distance()
 663      * @throws Text_LanguageDetect_Exception
 664      */
 665     public function detect($sample, $limit = 0)
 666     {
 667         // input check
 668         if (!Text_LanguageDetect_Parser::validateString($sample)) {
 669             return array();
 670         }
 671
 672         // check char encoding
 673         // (only if mbstring extension is compiled and PHP > 4.0.6)
 674         if (function_exists('mb_detect_encoding')
 675             && function_exists('mb_convert_encoding')
 676         ) {
 677             // mb_detect_encoding isn't very reliable, to say the least
 678             // detection should still work with a sufficient sample
 679             //  of ascii characters
 680             $encoding = mb_detect_encoding($sample);
 681
 682             // mb_detect_encoding() will return FALSE if detection fails
 683             // don't attempt conversion if that's the case
 684             if ($encoding != 'ASCII' && $encoding != 'UTF-8'
 685                 && $encoding !== false
 686             ) {
 687                 // verify the encoding exists in mb_list_encodings
 688                 if (in_array($encoding, mb_list_encodings())) {
 689                     $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
 690                 }
 691             }
 692         }
 693
 694         $sample_obj = new Text_LanguageDetect_Parser($sample);
 695         $sample_obj->prepareTrigram();
 696         if ($this->_use_unicode_narrowing) {
 697             $sample_obj->prepareUnicode();
 698         }
 699         $sample_obj->setPadStart(!$this->_perl_compatible);
 700         $sample_obj->analyze();
 701
 702         $trigram_freqs =& $sample_obj->getTrigramRanks();
 703         $trigram_count = count($trigram_freqs);
 704
 705         if ($trigram_count == 0) {
 706             return array();
 707         }
 708
 709         $scores = array();
 710
 711         // use unicode block detection to narrow down the possibilities
 712         if ($this->_use_unicode_narrowing) {
 713             $blocks =& $sample_obj->getUnicodeBlocks();
 714
 715             if (is_array($blocks)) {
 716                 $present_blocks = array_keys($blocks);
 717             } else {
 718                 throw new Text_LanguageDetect_Exception(
 719                     'Error during block detection',
 720                     Text_LanguageDetect_Exception::BLOCK_DETECTION
 721                 );
 722             }
 723
 724             $possible_langs = array();
 725
 726             foreach ($present_blocks as $blockname) {
 727                 if (isset($this->_unicode_map[$blockname])) {
 728
 729                     $possible_langs = array_merge(
 730                         $possible_langs,
 731                         array_keys($this->_unicode_map[$blockname])
 732                     );
 733
 734                     // todo: faster way to do this?
 735                 }
 736             }
 737
 738             // could also try an intersect operation rather than a union
 739             // in other words, choose languages whose trigrams contain
 740             // ALL of the unicode blocks found in this sample
 741             // would improve speed but would be completely thrown off by an
 742             // unexpected character, like an umlaut appearing in english text
 743
 744             $possible_langs = array_intersect(
 745                 array_keys($this->_lang_db),
 746                 array_unique($possible_langs)
 747             );
 748
 749             // needs to intersect it with the keys of _lang_db in case
 750             // languages have been omitted
 751
 752         } else {
 753             // or just try 'em all
 754             $possible_langs = array_keys($this->_lang_db);
 755         }
 756
 757
 758         foreach ($possible_langs as $lang) {
 759             $scores[$lang] = $this->_normalize_score(
 760                 $this->_distance($this->_lang_db[$lang], $trigram_freqs),
 761                 $trigram_count
 762             );
 763         }
 764
 765         unset($sample_obj);
 766
 767         if ($this->_perl_compatible) {
 768             asort($scores);
 769         } else {
 770             arsort($scores);
 771         }
 772
 773         // todo: drop languages with a score of $this->_max_score?
 774
 775         // limit the number of returned scores
 776         if ($limit && is_numeric($limit)) {
 777             $limited_scores = array();
 778
 779             $i = 0;
 780             foreach ($scores as $key => $value) {
 781                 if ($i++ >= $limit) {
 782                     break;
 783                 }
 784
 785                 $limited_scores[$key] = $value;
 786             }
 787
 788             return $this->_convertToNameMode($limited_scores, true);
 789         } else {
 790             return $this->_convertToNameMode($scores, true);
 791         }
 792     }
 793
 794     /**
 795      * Returns only the most similar language to the text sample
 796      *
 797      * Calls $this->detect() and returns only the top result
 798      *
 799      * @param string $sample text to detect the language of
 800      *
 801      * @return string the name of the most likely language
 802      *                or null if no language is similar
 803      * @see    detect()
 804      * @throws Text_LanguageDetect_Exception
 805      */
 806     public function detectSimple($sample)
 807     {
 808         $scores = $this->detect($sample, 1);
 809
 810         // if top language has the maximum possible score,
 811         // then the top score will have been picked at random
 812         if (!is_array($scores) || empty($scores)
 813             || current($scores) == $this->_max_score
 814         ) {
 815             return null;
 816         } else {
 817             return key($scores);
 818         }
 819     }
 820
 821     /**
 822      * Returns an array containing the most similar language and a confidence
 823      * rating
 824      *
 825      * Confidence is a simple measure calculated from the similarity score
 826      * minus the similarity score from the next most similar language
 827      * divided by the highest possible score. Languages that have closely
 828      * related cousins (e.g. Norwegian and Danish) should generally have lower
 829      * confidence scores.
 830      *
 831      * The similarity score answers the question "How likely is the text the
 832      * returned language regardless of the other languages considered?" The
 833      * confidence score is one way of answering the question "how likely is the
 834      * text the detected language relative to the rest of the language model
 835      * set?"
 836      *
 837      * To see how similar languages are a priori, see languageSimilarity()
 838      *
 839      * @param string $sample text for which language will be detected
 840      *
 841      * @return array most similar language, score and confidence rating
 842      *               or null if no language is similar
 843      * @see    detect()
 844      * @throws Text_LanguageDetect_Exception
 845      */
 846     public function detectConfidence($sample)
 847     {
 848         $scores = $this->detect($sample, 2);
 849
 850         // if most similar language has the max score, it
 851         // will have been picked at random
 852         if (!is_array($scores) || empty($scores)
 853             || current($scores) == $this->_max_score
 854         ) {
 855             return null;
 856         }
 857
 858         $arr['language'] = key($scores);
 859         $arr['similarity'] = current($scores);
 860         if (next($scores) !== false) { // if false then no next element
 861             // the goal is to return a higher value if the distance between
 862             // the similarity of the first score and the second score is high
 863
 864             if ($this->_perl_compatible) {
 865                 $arr['confidence'] = (current($scores) - $arr['similarity'])
 866                     / $this->_max_score;
 867
 868             } else {
 869                 $arr['confidence'] = $arr['similarity'] - current($scores);
 870
 871             }
 872
 873         } else {
 874             $arr['confidence'] = null;
 875         }
 876
 877         return $arr;
 878     }
 879
 880     /**
 881      * Returns the distribution of unicode blocks in a given utf8 string
 882      *
 883      * For the block name of a single char, use unicodeBlockName()
 884      *
 885      * @param string $str          input string. Must be ascii or utf8
 886      * @param bool   $skip_symbols if true, skip ascii digits, symbols and
 887      *                             non-printing characters. Includes spaces,
 888      *                             newlines and common punctutation characters.
 889      *
 890      * @return array
 891      * @throws Text_LanguageDetect_Exception
 892      */
 893     public function detectUnicodeBlocks($str, $skip_symbols)
 894     {
 895         $skip_symbols = (bool)$skip_symbols;
 896         $str          = (string)$str;
 897
 898         $sample_obj = new Text_LanguageDetect_Parser($str);
 899         $sample_obj->prepareUnicode();
 900         $sample_obj->prepareTrigram(false);
 901         $sample_obj->setUnicodeSkipSymbols($skip_symbols);
 902         $sample_obj->analyze();
 903         $blocks = $sample_obj->getUnicodeBlocks();
 904         unset($sample_obj);
 905         return $blocks;
 906     }
 907
 908     /**
 909      * Returns the block name for a given unicode value
 910      *
 911      * If passed a string, will assume it is being passed a UTF8-formatted
 912      * character and will automatically convert. Otherwise it will assume it
 913      * is being passed a numeric unicode value.
 914      *
 915      * Make sure input is of the correct type!
 916      *
 917      * @param mixed $unicode unicode value or utf8 char
 918      *
 919      * @return mixed the block name string or false if not found
 920      * @throws Text_LanguageDetect_Exception
 921      */
 922     public function unicodeBlockName($unicode)
 923     {
 924         if (is_string($unicode)) {
 925             // assume it is being passed a utf8 char, so convert it
 926             if (self::utf8strlen($unicode) > 1) {
 927                 throw new Text_LanguageDetect_Exception(
 928                     'Pass a single char only to this method',
 929                     Text_LanguageDetect_Exception::PARAM_TYPE
 930                 );
 931             }
 932             $unicode = $this->_utf8char2unicode($unicode);
 933
 934         } elseif (!is_int($unicode)) {
 935             throw new Text_LanguageDetect_Exception(
 936                 'Input must be of type string or int.',
 937                 Text_LanguageDetect_Exception::PARAM_TYPE
 938             );
 939         }
 940
 941         $blocks = $this->_read_unicode_block_db();
 942
 943         $result = $this->_unicode_block_name($unicode, $blocks);
 944
 945         if ($result == -1) {
 946             return false;
 947         } else {
 948             return $result[2];
 949         }
 950     }
 951
 952     /**
 953      * Searches the unicode block database
 954      *
 955      * Returns the block name for a given unicode value. unicodeBlockName() is
 956      * the public interface for this function, which does input checks which
 957      * this function omits for speed.
 958      *
 959      * @param int   $unicode     the unicode value
 960      * @param array $blocks      the block database
 961      * @param int   $block_count the number of defined blocks in the database
 962      *
 963      * @return mixed Block name, -1 if it failed
 964      * @see    unicodeBlockName()
 965      * @access protected
 966      */
 967     function _unicode_block_name($unicode, $blocks, $block_count = -1)
 968     {
 969         // for a reference, see
 970         // http://www.unicode.org/Public/UNIDATA/Blocks.txt
 971
 972         // assume that ascii characters are the most common
 973         // so try it first for efficiency
 974         if ($unicode <= $blocks[0][1]) {
 975             return $blocks[0];
 976         }
 977
 978         // the optional $block_count param is for efficiency
 979         // so we this function doesn't have to run count() every time
 980         if ($block_count != -1) {
 981             $high = $block_count - 1;
 982         } else {
 983             $high = count($blocks) - 1;
 984         }
 985
 986         $low = 1; // start with 1 because ascii was 0
 987
 988         // your average binary search algorithm
 989         while ($low <= $high) {
 990             $mid = floor(($low + $high) / 2);
 991
 992             if ($unicode < $blocks[$mid][0]) {
 993                 // if it's lower than the lower bound
 994                 $high = $mid - 1;
 995
 996             } elseif ($unicode > $blocks[$mid][1]) {
 997                 // if it's higher than the upper bound
 998                 $low = $mid + 1;
 999
1000             } else {
1001                 // found it
1002                 return $blocks[$mid];
1003             }
1004         }
1005
1006         // failed to find the block
1007         return -1;
1008
1009         // todo: differentiate when it's out of range or when it falls
1010         //       into an unassigned range?
1011     }
1012
1013     /**
1014      * Brings up the unicode block database
1015      *
1016      * @return array the database of unicode block definitions
1017      * @throws Text_LanguageDetect_Exception
1018      * @access protected
1019      */
1020     function _read_unicode_block_db()
1021     {
1022         // since the unicode definitions are always going to be the same,
1023         // might as well share the memory for the db with all other instances
1024         // of this class
1025         static $data;
1026
1027         if (!isset($data)) {
1028             $data = $this->_readdb($this->_unicode_db_filename);
1029         }
1030
1031         return $data;
1032     }
1033
1034     /**
1035      * Calculate the similarities between the language models
1036      *
1037      * Use this function to see how similar languages are to each other.
1038      *
1039      * If passed 2 language names, will return just those languages compared.
1040      * If passed 1 language name, will return that language compared to
1041      * all others.
1042      * If passed none, will return an array of every language model compared
1043      * to every other one.
1044      *
1045      * @param string $lang1 the name of the first language to be compared
1046      * @param string $lang2 the name of the second language to be compared
1047      *
1048      * @return array scores of every language compared
1049      *               or the score of just the provided languages
1050      *               or null if one of the supplied languages does not exist
1051      * @throws Text_LanguageDetect_Exception
1052      */
1053     public function languageSimilarity($lang1 = null, $lang2 = null)
1054     {
1055         $lang1 = $this->_convertFromNameMode($lang1);
1056         $lang2 = $this->_convertFromNameMode($lang2);
1057         if ($lang1 != null) {
1058             $lang1 = strtolower($lang1);
1059
1060             // check if language model exists
1061             if (!isset($this->_lang_db[$lang1])) {
1062                 return null;
1063             }
1064
1065             if ($lang2 != null) {
1066                 if (!isset($this->_lang_db[$lang2])) {
1067                     // check if language model exists
1068                     return null;
1069                 }
1070
1071                 $lang2 = strtolower($lang2);
1072
1073                 // compare just these two languages
1074                 return $this->_normalize_score(
1075                     $this->_distance(
1076                         $this->_lang_db[$lang1],
1077                         $this->_lang_db[$lang2]
1078                     )
1079                 );
1080
1081             } else {
1082                 // compare just $lang1 to all languages
1083                 $return_arr = array();
1084                 foreach ($this->_lang_db as $key => $value) {
1085                     if ($key != $lang1) {
1086                         // don't compare a language to itself
1087                         $return_arr[$key] = $this->_normalize_score(
1088                             $this->_distance($this->_lang_db[$lang1], $value)
1089                         );
1090                     }
1091                 }
1092                 asort($return_arr);
1093
1094                 return $return_arr;
1095             }
1096
1097
1098         } else {
1099             // compare all languages to each other
1100             $return_arr = array();
1101             foreach (array_keys($this->_lang_db) as $lang1) {
1102                 foreach (array_keys($this->_lang_db) as $lang2) {
1103                     // skip comparing languages to themselves
1104                     if ($lang1 != $lang2) {
1105
1106                         if (isset($return_arr[$lang2][$lang1])) {
1107                             // don't re-calculate what's already been done
1108                             $return_arr[$lang1][$lang2]
1109                                 = $return_arr[$lang2][$lang1];
1110
1111                         } else {
1112                             // calculate
1113                             $return_arr[$lang1][$lang2]
1114                                 = $this->_normalize_score(
1115                                     $this->_distance(
1116                                         $this->_lang_db[$lang1],
1117                                         $this->_lang_db[$lang2]
1118                                     )
1119                                 );
1120
1121                         }
1122                     }
1123                 }
1124             }
1125             return $return_arr;
1126         }
1127     }
1128
1129     /**
1130      * Cluster known languages according to languageSimilarity()
1131      *
1132      * WARNING: this method is EXPERIMENTAL. It is not recommended for common
1133      * use, and it may disappear or its functionality may change in future
1134      * releases without notice.
1135      *
1136      * Uses a nearest neighbor technique to generate the maximum possible
1137      * number of dendograms from the similarity data.
1138      *
1139      * @access      public
1140      * @return      array language cluster data
1141      * @throws      Text_LanguageDetect_Exception
1142      * @see         languageSimilarity()
1143      * @deprecated  this function will eventually be removed and placed into
1144      *              the model generation class
1145      */
1146     function clusterLanguages()
1147     {
1148         // todo: set the maximum number of clusters
1149         // return cached result, if any
1150         if (isset($this->_clusters)) {
1151             return $this->_clusters;
1152         }
1153
1154         $langs = array_keys($this->_lang_db);
1155
1156         $arr = $this->languageSimilarity();
1157
1158         sort($langs);
1159
1160         foreach ($langs as $lang) {
1161             if (!isset($this->_lang_db[$lang])) {
1162                 throw new Text_LanguageDetect_Exception(
1163                     "missing $lang!",
1164                     Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
1165                 );
1166             }
1167         }
1168
1169         // http://www.psychstat.missouristate.edu/multibook/mlt04m.html
1170         foreach ($langs as $old_key => $lang1) {
1171             $langs[$lang1] = $lang1;
1172             unset($langs[$old_key]);
1173         }
1174
1175         $result_data = $really_map = array();
1176
1177         $i = 0;
1178         while (count($langs) > 2 && $i++ < 200) {
1179             $highest_score = -1;
1180             $highest_key1 = '';
1181             $highest_key2 = '';
1182             foreach ($langs as $lang1) {
1183                 foreach ($langs as $lang2) {
1184                     if ($lang1 != $lang2
1185                         && $arr[$lang1][$lang2] > $highest_score
1186                     ) {
1187                         $highest_score = $arr[$lang1][$lang2];
1188                         $highest_key1 = $lang1;
1189                         $highest_key2 = $lang2;
1190                     }
1191                 }
1192             }
1193
1194             if (!$highest_key1) {
1195                 // should not ever happen
1196                 throw new Text_LanguageDetect_Exception(
1197                     "no highest key? (step: $i)",
1198                     Text_LanguageDetect_Exception::NO_HIGHEST_KEY
1199                 );
1200             }
1201
1202             if ($highest_score == 0) {
1203                 // languages are perfectly dissimilar
1204                 break;
1205             }
1206
1207             // $highest_key1 and $highest_key2 are most similar
1208             $sum1 = array_sum($arr[$highest_key1]);
1209             $sum2 = array_sum($arr[$highest_key2]);
1210
1211             // use the score for the one that is most similar to the rest of
1212             // the field as the score for the group
1213             // todo: could try averaging or "centroid" method instead
1214             // seems like that might make more sense
1215             // actually nearest neighbor may be better for binary searching
1216
1217
1218             // for "Complete Linkage"/"furthest neighbor"
1219             // sign should be <
1220             // for "Single Linkage"/"nearest neighbor" method
1221             // should should be >
1222             // results seem to be pretty much the same with either method
1223
1224             // figure out which to delete and which to replace
1225             if ($sum1 > $sum2) {
1226                 $replaceme = $highest_key1;
1227                 $deleteme = $highest_key2;
1228             } else {
1229                 $replaceme = $highest_key2;
1230                 $deleteme = $highest_key1;
1231             }
1232
1233             $newkey = $replaceme . ':' . $deleteme;
1234
1235             // $replaceme is most similar to remaining languages
1236             // replace $replaceme with '$newkey', deleting $deleteme
1237
1238             // keep a record of which fork is really which language
1239             $really_lang = $replaceme;
1240             while (isset($really_map[$really_lang])) {
1241                 $really_lang = $really_map[$really_lang];
1242             }
1243             $really_map[$newkey] = $really_lang;
1244
1245
1246             // replace the best fitting key, delete the other
1247             foreach ($arr as $key1 => $arr2) {
1248                 foreach ($arr2 as $key2 => $value2) {
1249                     if ($key2 == $replaceme) {
1250                         $arr[$key1][$newkey] = $arr[$key1][$key2];
1251                         unset($arr[$key1][$key2]);
1252                         // replacing $arr[$key1][$key2] with $arr[$key1][$newkey]
1253                     }
1254
1255                     if ($key1 == $replaceme) {
1256                         $arr[$newkey][$key2] = $arr[$key1][$key2];
1257                         unset($arr[$key1][$key2]);
1258                         // replacing $arr[$key1][$key2] with $arr[$newkey][$key2]
1259                     }
1260
1261                     if ($key1 == $deleteme || $key2 == $deleteme) {
1262                         // deleting $arr[$key1][$key2]
1263                         unset($arr[$key1][$key2]);
1264                     }
1265                 }
1266             }
1267
1268
1269             unset($langs[$highest_key1]);
1270             unset($langs[$highest_key2]);
1271             $langs[$newkey] = $newkey;
1272
1273
1274             // some of these may be overkill
1275             $result_data[$newkey] = array(
1276                                 'newkey' => $newkey,
1277                                 'count' => $i,
1278                                 'diff' => abs($sum1 - $sum2),
1279                                 'score' => $highest_score,
1280                                 'bestfit' => $replaceme,
1281                                 'otherfit' => $deleteme,
1282                                 'really' => $really_lang,
1283                             );
1284         }
1285
1286         $return_val = array(
1287                 'open_forks' => $langs,
1288                         // the top level of clusters
1289                         // clusters that are mutually exclusive
1290                         // or specified by a specific maximum
1291
1292                 'fork_data' => $result_data,
1293                         // data for each split
1294
1295                 'name_map' => $really_map,
1296                         // which cluster is really which language
1297                         // using the nearest neighbor technique, the cluster
1298                         // inherits all of the properties of its most-similar member
1299                         // this keeps track
1300             );
1301
1302
1303         // saves the result in the object
1304         $this->_clusters = $return_val;
1305
1306         return $return_val;
1307     }
1308
1309
1310     /**
1311      * Perform an intelligent detection based on clusterLanguages()
1312      *
1313      * WARNING: this method is EXPERIMENTAL. It is not recommended for common
1314      * use, and it may disappear or its functionality may change in future
1315      * releases without notice.
1316      *
1317      * This compares the sample text to top the top level of clusters. If the
1318      * sample is similar to the cluster it will drop down and compare it to the
1319      * languages in the cluster, and so on until it hits a leaf node.
1320      *
1321      * this should find the language in considerably fewer compares
1322      * (the equivalent of a binary search), however clusterLanguages() is costly
1323      * and the loss of accuracy from this technique is significant.
1324      *
1325      * This method may need to be 'fuzzier' in order to become more accurate.
1326      *
1327      * This function could be more useful if the universe of possible languages
1328      * was very large, however in such cases some method of Bayesian inference
1329      * might be more helpful.
1330      *
1331      * @param string $str input string
1332      *
1333      * @return array language scores (only those compared)
1334      * @throws Text_LanguageDetect_Exception
1335      * @see    clusterLanguages()
1336      */
1337     public function clusteredSearch($str)
1338     {
1339         // input check
1340         if (!Text_LanguageDetect_Parser::validateString($str)) {
1341             return array();
1342         }
1343
1344         // clusterLanguages() will return a cached result if possible
1345         // so it's safe to call it every time
1346         $result = $this->clusterLanguages();
1347
1348         $dendogram_start = $result['open_forks'];
1349         $dendogram_data  = $result['fork_data'];
1350         $dendogram_alias = $result['name_map'];
1351
1352         $sample_obj = new Text_LanguageDetect_Parser($str);
1353         $sample_obj->prepareTrigram();
1354         $sample_obj->setPadStart(!$this->_perl_compatible);
1355         $sample_obj->analyze();
1356         $sample_result = $sample_obj->getTrigramRanks();
1357         $sample_count  = count($sample_result);
1358
1359         // input check
1360         if ($sample_count == 0) {
1361             return array();
1362         }
1363
1364         $i = 0; // counts the number of steps
1365
1366         foreach ($dendogram_start as $lang) {
1367             if (isset($dendogram_alias[$lang])) {
1368                 $lang_key = $dendogram_alias[$lang];
1369             } else {
1370                 $lang_key = $lang;
1371             }
1372
1373             $scores[$lang] = $this->_normalize_score(
1374                 $this->_distance($this->_lang_db[$lang_key], $sample_result),
1375                 $sample_count
1376             );
1377
1378             $i++;
1379         }
1380
1381         if ($this->_perl_compatible) {
1382             asort($scores);
1383         } else {
1384             arsort($scores);
1385         }
1386
1387         $top_score = current($scores);
1388         $top_key = key($scores);
1389
1390         // of starting forks, $top_key is the most similar to the sample
1391
1392         $cur_key = $top_key;
1393         while (isset($dendogram_data[$cur_key])) {
1394             $lang1 = $dendogram_data[$cur_key]['bestfit'];
1395             $lang2 = $dendogram_data[$cur_key]['otherfit'];
1396             foreach (array($lang1, $lang2) as $lang) {
1397                 if (isset($dendogram_alias[$lang])) {
1398                     $lang_key = $dendogram_alias[$lang];
1399                 } else {
1400                     $lang_key = $lang;
1401                 }
1402
1403                 $scores[$lang] = $this->_normalize_score(
1404                     $this->_distance($this->_lang_db[$lang_key], $sample_result),
1405                     $sample_count
1406                 );
1407
1408                 //todo: does not need to do same comparison again
1409             }
1410
1411             $i++;
1412
1413             if ($scores[$lang1] > $scores[$lang2]) {
1414                 $cur_key = $lang1;
1415                 $loser_key = $lang2;
1416             } else {
1417                 $cur_key = $lang2;
1418                 $loser_key = $lang1;
1419             }
1420
1421             $diff = $scores[$cur_key] - $scores[$loser_key];
1422
1423             // $cur_key ({$dendogram_alias[$cur_key]}) wins
1424             // over $loser_key ({$dendogram_alias[$loser_key]})
1425             // with a difference of $diff
1426         }
1427
1428         // found result in $i compares
1429
1430         // rather than sorting the result, preserve it so that you can see
1431         // which paths the algorithm decided to take along the tree
1432
1433         // but sometimes the last item is only the second highest
1434         if (($this->_perl_compatible  && (end($scores) > prev($scores)))
1435             || (!$this->_perl_compatible && (end($scores) < prev($scores)))
1436         ) {
1437             $real_last_score = current($scores);
1438             $real_last_key = key($scores);
1439
1440             // swaps the 2nd-to-last item for the last item
1441             unset($scores[$real_last_key]);
1442             $scores[$real_last_key] = $real_last_score;
1443         }
1444
1445
1446         if (!$this->_perl_compatible) {
1447             $scores = array_reverse($scores, true);
1448             // second param requires php > 4.0.3
1449         }
1450
1451         return $scores;
1452     }
1453
1454     /**
1455      * ut8-safe strlen()
1456      *
1457      * Returns the numbers of characters (not bytes) in a utf8 string
1458      *
1459      * @param string $str string to get the length of
1460      *
1461      * @return int number of chars
1462      */
1463     public static function utf8strlen($str)
1464     {
1465         // utf8_decode() will convert unknown chars to '?', which is actually
1466         // ideal for counting.
1467
1468         return strlen(utf8_decode($str));
1469
1470         // idea stolen from dokuwiki
1471     }
1472
1473     /**
1474      * Returns the unicode value of a utf8 char
1475      *
1476      * @param string $char a utf8 (possibly multi-byte) char
1477      *
1478      * @return int unicode value
1479      * @access protected
1480      * @link   http://en.wikipedia.org/wiki/UTF-8
1481      */
1482     function _utf8char2unicode($char)
1483     {
1484         // strlen() here will actually get the binary length of a single char
1485         switch (strlen($char)) {
1486         case 1:
1487             // normal ASCII-7 byte
1488             // 0xxxxxxx -->  0xxxxxxx
1489             return ord($char{0});
1490
1491         case 2:
1492             // 2 byte unicode
1493             // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
1494             $z = (ord($char{0}) & 0x000001F) << 6;
1495             $x = (ord($char{1}) & 0x0000003F);
1496             return ($z | $x);
1497
1498         case 3:
1499             // 3 byte unicode
1500             // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
1501             $z =  (ord($char{0}) & 0x0000000F) << 12;
1502             $x1 = (ord($char{1}) & 0x0000003F) << 6;
1503             $x2 = (ord($char{2}) & 0x0000003F);
1504             return ($z | $x1 | $x2);
1505
1506         case 4:
1507             // 4 byte unicode
1508             // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
1509             // 000zzzzz xxxxxxxx xxxxxxxx
1510             $z1 = (ord($char{0}) & 0x00000007) << 18;
1511             $z2 = (ord($char{1}) & 0x0000003F) << 12;
1512             $x1 = (ord($char{2}) & 0x0000003F) << 6;
1513             $x2 = (ord($char{3}) & 0x0000003F);
1514             return ($z1 | $z2 | $x1 | $x2);
1515         }
1516     }
1517
1518     /**
1519      * utf8-safe fast character iterator
1520      *
1521      * Will get the next character starting from $counter, which will then be
1522      * incremented. If a multi-byte char the bytes will be concatenated and
1523      * $counter will be incremeted by the number of bytes in the char.
1524      *
1525      * @param string $str             the string being iterated over
1526      * @param int    &$counter        the iterator, will increment by reference
1527      * @param bool   $special_convert whether to do special conversions
1528      *
1529      * @return char the next (possibly multi-byte) char from $counter
1530      * @access private
1531      */
1532     static function _next_char($str, &$counter, $special_convert = false)
1533     {
1534         $char = $str{$counter++};
1535         $ord = ord($char);
1536
1537         // for a description of the utf8 system see
1538         // http://www.phpclasses.org/browse/file/5131.html
1539
1540         // normal ascii one byte char
1541         if ($ord <= 127) {
1542             // special conversions needed for this package
1543             // (that only apply to regular ascii characters)
1544             // lower case, and convert all non-alphanumeric characters
1545             // other than "'" to space
1546             if ($special_convert && $char != ' ' && $char != "'") {
1547                 if ($ord >= 65 && $ord <= 90) { // A-Z
1548                     $char = chr($ord + 32); // lower case
1549                 } elseif ($ord < 97 || $ord > 122) { // NOT a-z
1550                     $char = ' '; // convert to space
1551                 }
1552             }
1553
1554             return $char;
1555
1556         } elseif ($ord >> 5 == 6) { // two-byte char
1557             // multi-byte chars
1558             $nextchar = $str{$counter++}; // get next byte
1559
1560             // lower-casing of non-ascii characters is still incomplete
1561
1562             if ($special_convert) {
1563                 // lower case latin accented characters
1564                 if ($ord == 195) {
1565                     $nextord = ord($nextchar);
1566                     $nextord_adj = $nextord + 64;
1567                     // for a reference, see
1568                     // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html
1569
1570                     // &Agrave; - &THORN; but not &times;
1571                     if ($nextord_adj >= 192
1572                         && $nextord_adj <= 222
1573                         && $nextord_adj != 215
1574                     ) {
1575                         $nextchar = chr($nextord + 32);
1576                     }
1577
1578                 } elseif ($ord == 208) {
1579                     // lower case cyrillic alphabet
1580                     $nextord = ord($nextchar);
1581                     // if A - Pe
1582                     if ($nextord >= 144 && $nextord <= 159) {
1583                         // lower case
1584                         $nextchar = chr($nextord + 32);
1585
1586                     } elseif ($nextord >= 160 && $nextord <= 175) {
1587                         // if Er - Ya
1588                         // lower case
1589                         $char = chr(209); // == $ord++
1590                         $nextchar = chr($nextord - 32);
1591                     }
1592                 }
1593             }
1594
1595             // tag on next byte
1596             return $char . $nextchar;
1597         } elseif ($ord >> 4  == 14) { // three-byte char
1598
1599             // tag on next 2 bytes
1600             return $char . $str{$counter++} . $str{$counter++};
1601
1602         } elseif ($ord >> 3 == 30) { // four-byte char
1603
1604             // tag on next 3 bytes
1605             return $char . $str{$counter++} . $str{$counter++} . $str{$counter++};
1606
1607         } else {
1608             // error?
1609         }
1610     }
1611
1612     /**
1613      * Converts an $language input parameter from the configured mode
1614      * to the language name that is used internally.
1615      *
1616      * Works for strings and arrays.
1617      *
1618      * @param string|array $lang       A language description ("english"/"en"/"eng")
1619      * @param boolean      $convertKey If $lang is an array, setting $key
1620      *                                 converts the keys to the language name.
1621      *
1622      * @return string|array Language name
1623      */
1624     function _convertFromNameMode($lang, $convertKey = false)
1625     {
1626         if ($this->_name_mode == 0) {
1627             return $lang;
1628         }
1629
1630         if ($this->_name_mode == 2) {
1631             $method = 'code2ToName';
1632         } else {
1633             $method = 'code3ToName';
1634         }
1635
1636         if (is_string($lang)) {
1637             return (string)Text_LanguageDetect_ISO639::$method($lang);
1638         }
1639
1640         $newlang = array();
1641         foreach ($lang as $key => $val) {
1642             if ($convertKey) {
1643                 $newkey = (string)Text_LanguageDetect_ISO639::$method($key);
1644                 $newlang[$newkey] = $val;
1645             } else {
1646                 $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val);
1647             }
1648         }
1649         return $newlang;
1650     }
1651
1652     /**
1653      * Converts an $language output parameter from the language name that is
1654      * used internally to the configured mode.
1655      *
1656      * Works for strings and arrays.
1657      *
1658      * @param string|array $lang       A language description ("english"/"en"/"eng")
1659      * @param boolean      $convertKey If $lang is an array, setting $key
1660      *                                 converts the keys to the language name.
1661      *
1662      * @return string|array Language name
1663      */
1664     function _convertToNameMode($lang, $convertKey = false)
1665     {
1666         if ($this->_name_mode == 0) {
1667             return $lang;
1668         }
1669
1670         if ($this->_name_mode == 2) {
1671             $method = 'nameToCode2';
1672         } else {
1673             $method = 'nameToCode3';
1674         }
1675
1676         if (is_string($lang)) {
1677             return Text_LanguageDetect_ISO639::$method($lang);
1678         }
1679
1680         $newlang = array();
1681         foreach ($lang as $key => $val) {
1682             if ($convertKey) {
1683                 $newkey = Text_LanguageDetect_ISO639::$method($key);
1684                 $newlang[$newkey] = $val;
1685             } else {
1686                 $newlang[$key] = Text_LanguageDetect_ISO639::$method($val);
1687             }
1688         }
1689         return $newlang;
1690     }
1691 }
1692
1693 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */