]>
Commit | Line | Data |
---|---|---|
42c80841 NL |
1 | <?php |
2 | ||
3 | /** | |
4 | * Detects the language of a given piece of text. | |
5 | * | |
6 | * Attempts to detect the language of a sample of text by correlating ranked | |
7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. | |
8 | * | |
3ec62cf9 MR |
9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle |
10 | * (1994): "N-Gram-Based Text Categorization" | |
42c80841 | 11 | * |
3ec62cf9 | 12 | * PHP version 5 |
42c80841 | 13 | * |
3ec62cf9 MR |
14 | * @category Text |
15 | * @package Text_LanguageDetect | |
16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | |
17 | * @copyright 2005-2006 Nicholas Pisarro | |
18 | * @license http://www.debian.org/misc/bsd.license BSD | |
19 | * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $ | |
20 | * @link http://pear.php.net/package/Text_LanguageDetect/ | |
21 | * @link http://langdetect.blogspot.com/ | |
42c80841 NL |
22 | */ |
23 | ||
3ec62cf9 MR |
24 | require_once 'LanguageDetect/Exception.php'; |
25 | require_once 'LanguageDetect/Parser.php'; | |
26 | require_once 'LanguageDetect/ISO639.php'; | |
42c80841 NL |
27 | |
28 | /** | |
29 | * Language detection class | |
30 | * | |
31 | * Requires the language model database (lang.dat) that should have | |
32 | * accompanied this class definition in order to be instantiated. | |
33 | * | |
34 | * Example usage: | |
35 | * | |
36 | * <code> | |
37 | * require_once 'Text/LanguageDetect.php'; | |
38 | * | |
39 | * $l = new Text_LanguageDetect; | |
40 | * | |
41 | * $stdin = fopen('php://stdin', 'r'); | |
42 | * | |
43 | * echo "Supported languages:\n"; | |
44 | * | |
3ec62cf9 MR |
45 | * try { |
46 | * $langs = $l->getLanguages(); | |
47 | * } catch (Text_LanguageDetect_Exception $e) { | |
48 | * die($e->getMessage()); | |
42c80841 NL |
49 | * } |
50 | * | |
51 | * sort($langs); | |
52 | * echo join(', ', $langs); | |
53 | * | |
54 | * while ($line = fgets($stdin)) { | |
55 | * print_r($l->detect($line, 4)); | |
56 | * } | |
57 | * </code> | |
58 | * | |
3ec62cf9 MR |
59 | * @category Text |
60 | * @package Text_LanguageDetect | |
61 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | |
62 | * @copyright 2005 Nicholas Pisarro | |
63 | * @license http://www.debian.org/misc/bsd.license BSD | |
64 | * @version Release: @package_version@ | |
65 | * @link http://pear.php.net/package/Text_LanguageDetect/ | |
66 | * @todo allow users to generate their own language models | |
42c80841 | 67 | */ |
42c80841 NL |
68 | class Text_LanguageDetect |
69 | { | |
3ec62cf9 | 70 | /** |
42c80841 NL |
71 | * The filename that stores the trigram data for the detector |
72 | * | |
3ec62cf9 | 73 | * If this value starts with a slash (/) or a dot (.) the value of |
42c80841 | 74 | * $this->_data_dir will be ignored |
3ec62cf9 | 75 | * |
42c80841 NL |
76 | * @var string |
77 | * @access private | |
78 | */ | |
3ec62cf9 | 79 | var $_db_filename = 'lang.dat'; |
42c80841 NL |
80 | |
81 | /** | |
82 | * The filename that stores the unicode block definitions | |
83 | * | |
3ec62cf9 | 84 | * If this value starts with a slash (/) or a dot (.) the value of |
42c80841 | 85 | * $this->_data_dir will be ignored |
3ec62cf9 | 86 | * |
42c80841 NL |
87 | * @var string |
88 | * @access private | |
89 | */ | |
3ec62cf9 | 90 | var $_unicode_db_filename = 'unicode_blocks.dat'; |
42c80841 NL |
91 | |
92 | /** | |
93 | * The data directory | |
94 | * | |
95 | * Should be set by PEAR installer | |
96 | * | |
97 | * @var string | |
98 | * @access private | |
99 | */ | |
100 | var $_data_dir = '@data_dir@'; | |
101 | ||
102 | /** | |
103 | * The trigram data for comparison | |
42c80841 | 104 | * |
3ec62cf9 | 105 | * Will be loaded on start from $this->_db_filename |
42c80841 NL |
106 | * |
107 | * @var array | |
108 | * @access private | |
109 | */ | |
110 | var $_lang_db = array(); | |
111 | ||
112 | /** | |
113 | * stores the map of the trigram data to unicode characters | |
114 | * | |
115 | * @access private | |
116 | * @var array | |
117 | */ | |
118 | var $_unicode_map; | |
119 | ||
120 | /** | |
121 | * The size of the trigram data arrays | |
3ec62cf9 | 122 | * |
42c80841 NL |
123 | * @var int |
124 | * @access private | |
125 | */ | |
126 | var $_threshold = 300; | |
127 | ||
128 | /** | |
129 | * the maximum possible score. | |
130 | * | |
131 | * needed for score normalization. Different depending on the | |
132 | * perl compatibility setting | |
133 | * | |
134 | * @access private | |
135 | * @var int | |
136 | * @see setPerlCompatible() | |
137 | */ | |
138 | var $_max_score = 0; | |
139 | ||
140 | /** | |
141 | * Whether or not to simulate perl's Language::Guess exactly | |
3ec62cf9 | 142 | * |
42c80841 NL |
143 | * @access private |
144 | * @var bool | |
145 | * @see setPerlCompatible() | |
146 | */ | |
147 | var $_perl_compatible = false; | |
148 | ||
149 | /** | |
150 | * Whether to use the unicode block detection to speed up processing | |
151 | * | |
152 | * @access private | |
153 | * @var bool | |
154 | */ | |
155 | var $_use_unicode_narrowing = true; | |
156 | ||
157 | /** | |
158 | * stores the result of the clustering operation | |
159 | * | |
160 | * @access private | |
161 | * @var array | |
162 | * @see clusterLanguages() | |
163 | */ | |
164 | var $_clusters; | |
165 | ||
3ec62cf9 MR |
166 | /** |
167 | * Which type of "language names" are accepted and returned: | |
168 | * | |
169 | * 0 - language name ("english") | |
170 | * 2 - 2-letter ISO 639-1 code ("en") | |
171 | * 3 - 3-letter ISO 639-2 code ("eng") | |
172 | */ | |
173 | var $_name_mode = 0; | |
174 | ||
42c80841 NL |
175 | /** |
176 | * Constructor | |
177 | * | |
178 | * Will attempt to load the language database. If it fails, you will get | |
3ec62cf9 | 179 | * an exception. |
42c80841 | 180 | */ |
3ec62cf9 | 181 | function __construct() |
42c80841 | 182 | { |
42c80841 | 183 | $data = $this->_readdb($this->_db_filename); |
3ec62cf9 | 184 | $this->_checkTrigram($data['trigram']); |
42c80841 NL |
185 | $this->_lang_db = $data['trigram']; |
186 | ||
187 | if (isset($data['trigram-unicodemap'])) { | |
188 | $this->_unicode_map = $data['trigram-unicodemap']; | |
189 | } | |
190 | ||
191 | // Not yet implemented: | |
192 | if (isset($data['trigram-clusters'])) { | |
193 | $this->_clusters = $data['trigram-clusters']; | |
3ec62cf9 | 194 | } |
42c80841 NL |
195 | } |
196 | ||
197 | /** | |
198 | * Returns the path to the location of the database | |
199 | * | |
3ec62cf9 MR |
200 | * @param string $fname File name to load |
201 | * | |
202 | * @return string expected path to the language model database | |
203 | * @access private | |
42c80841 NL |
204 | */ |
205 | function _get_data_loc($fname) | |
206 | { | |
3ec62cf9 | 207 | return dirname(__FILE__).'/'.$fname; |
42c80841 NL |
208 | } |
209 | ||
210 | /** | |
211 | * Loads the language trigram database from filename | |
212 | * | |
213 | * Trigram database should be a serialize()'d array | |
3ec62cf9 MR |
214 | * |
215 | * @param string $fname the filename where the data is stored | |
216 | * | |
217 | * @return array the language model data | |
218 | * @throws Text_LanguageDetect_Exception | |
219 | * @access private | |
42c80841 NL |
220 | */ |
221 | function _readdb($fname) | |
222 | { | |
223 | // finds the correct data dir | |
224 | $fname = $this->_get_data_loc($fname); | |
225 | ||
226 | // input check | |
227 | if (!file_exists($fname)) { | |
3ec62cf9 MR |
228 | throw new Text_LanguageDetect_Exception( |
229 | 'Language database does not exist: ' . $fname, | |
230 | Text_LanguageDetect_Exception::DB_NOT_FOUND | |
231 | ); | |
42c80841 | 232 | } elseif (!is_readable($fname)) { |
3ec62cf9 MR |
233 | throw new Text_LanguageDetect_Exception( |
234 | 'Language database is not readable: ' . $fname, | |
235 | Text_LanguageDetect_Exception::DB_NOT_READABLE | |
236 | ); | |
42c80841 NL |
237 | } |
238 | ||
3ec62cf9 | 239 | return unserialize(file_get_contents($fname)); |
42c80841 NL |
240 | } |
241 | ||
242 | ||
243 | /** | |
244 | * Checks if this object is ready to detect languages | |
3ec62cf9 MR |
245 | * |
246 | * @param array $trigram Trigram data from database | |
247 | * | |
248 | * @return void | |
249 | * @access private | |
42c80841 | 250 | */ |
3ec62cf9 | 251 | function _checkTrigram($trigram) |
42c80841 | 252 | { |
3ec62cf9 | 253 | if (!is_array($trigram)) { |
42c80841 | 254 | if (ini_get('magic_quotes_runtime')) { |
3ec62cf9 MR |
255 | throw new Text_LanguageDetect_Exception( |
256 | 'Error loading database. Try turning magic_quotes_runtime off.', | |
257 | Text_LanguageDetect_Exception::MAGIC_QUOTES | |
258 | ); | |
42c80841 | 259 | } |
3ec62cf9 MR |
260 | throw new Text_LanguageDetect_Exception( |
261 | 'Language database is not an array.', | |
262 | Text_LanguageDetect_Exception::DB_NOT_ARRAY | |
263 | ); | |
264 | } elseif (empty($trigram)) { | |
265 | throw new Text_LanguageDetect_Exception( | |
266 | 'Language database has no elements.', | |
267 | Text_LanguageDetect_Exception::DB_EMPTY | |
268 | ); | |
42c80841 NL |
269 | } |
270 | } | |
271 | ||
272 | /** | |
273 | * Omits languages | |
274 | * | |
3ec62cf9 | 275 | * Pass this function the name of or an array of names of |
42c80841 NL |
276 | * languages that you don't want considered |
277 | * | |
3ec62cf9 | 278 | * If you're only expecting a limited set of languages, this can greatly |
42c80841 NL |
279 | * speed up processing |
280 | * | |
3ec62cf9 MR |
281 | * @param mixed $omit_list language name or array of names to omit |
282 | * @param bool $include_only if true will include (rather than | |
283 | * exclude) only those in the list | |
284 | * | |
285 | * @return int number of languages successfully deleted | |
286 | * @throws Text_LanguageDetect_Exception | |
42c80841 | 287 | */ |
3ec62cf9 | 288 | public function omitLanguages($omit_list, $include_only = false) |
42c80841 | 289 | { |
42c80841 NL |
290 | $deleted = 0; |
291 | ||
3ec62cf9 MR |
292 | $omit_list = $this->_convertFromNameMode($omit_list); |
293 | ||
42c80841 | 294 | if (!$include_only) { |
3ec62cf9 | 295 | // deleting the given languages |
42c80841 NL |
296 | if (!is_array($omit_list)) { |
297 | $omit_list = strtolower($omit_list); // case desensitize | |
298 | if (isset($this->_lang_db[$omit_list])) { | |
299 | unset($this->_lang_db[$omit_list]); | |
300 | $deleted++; | |
301 | } | |
302 | } else { | |
303 | foreach ($omit_list as $omit_lang) { | |
304 | if (isset($this->_lang_db[$omit_lang])) { | |
305 | unset($this->_lang_db[$omit_lang]); | |
306 | $deleted++; | |
3ec62cf9 | 307 | } |
42c80841 NL |
308 | } |
309 | } | |
310 | ||
42c80841 | 311 | } else { |
3ec62cf9 | 312 | // deleting all except the given languages |
42c80841 NL |
313 | if (!is_array($omit_list)) { |
314 | $omit_list = array($omit_list); | |
315 | } | |
316 | ||
317 | // case desensitize | |
318 | foreach ($omit_list as $key => $omit_lang) { | |
319 | $omit_list[$key] = strtolower($omit_lang); | |
320 | } | |
321 | ||
322 | foreach (array_keys($this->_lang_db) as $lang) { | |
323 | if (!in_array($lang, $omit_list)) { | |
324 | unset($this->_lang_db[$lang]); | |
325 | $deleted++; | |
326 | } | |
327 | } | |
328 | } | |
329 | ||
330 | // reset the cluster cache if the number of languages changes | |
331 | // this will then have to be recalculated | |
332 | if (isset($this->_clusters) && $deleted > 0) { | |
3ec62cf9 | 333 | $this->_clusters = null; |
42c80841 NL |
334 | } |
335 | ||
336 | return $deleted; | |
337 | } | |
338 | ||
339 | ||
340 | /** | |
341 | * Returns the number of languages that this object can detect | |
342 | * | |
343 | * @access public | |
344 | * @return int the number of languages | |
3ec62cf9 | 345 | * @throws Text_LanguageDetect_Exception |
42c80841 NL |
346 | */ |
347 | function getLanguageCount() | |
348 | { | |
3ec62cf9 | 349 | return count($this->_lang_db); |
42c80841 NL |
350 | } |
351 | ||
352 | /** | |
3ec62cf9 | 353 | * Checks if the language with the given name exists in the database |
42c80841 | 354 | * |
3ec62cf9 | 355 | * @param mixed $lang Language name or array of language names |
42c80841 | 356 | * |
3ec62cf9 | 357 | * @return bool true if language model exists |
42c80841 | 358 | */ |
3ec62cf9 | 359 | public function languageExists($lang) |
42c80841 | 360 | { |
3ec62cf9 | 361 | $lang = $this->_convertFromNameMode($lang); |
42c80841 | 362 | |
3ec62cf9 MR |
363 | if (is_string($lang)) { |
364 | return isset($this->_lang_db[strtolower($lang)]); | |
365 | ||
366 | } elseif (is_array($lang)) { | |
367 | foreach ($lang as $test_lang) { | |
368 | if (!isset($this->_lang_db[strtolower($test_lang)])) { | |
369 | return false; | |
370 | } | |
42c80841 | 371 | } |
3ec62cf9 MR |
372 | return true; |
373 | ||
374 | } else { | |
375 | throw new Text_LanguageDetect_Exception( | |
376 | 'Unsupported parameter type passed to languageExists()', | |
377 | Text_LanguageDetect_Exception::PARAM_TYPE | |
378 | ); | |
42c80841 NL |
379 | } |
380 | } | |
381 | ||
382 | /** | |
383 | * Returns the list of detectable languages | |
384 | * | |
385 | * @access public | |
3ec62cf9 MR |
386 | * @return array the names of the languages known to this object |
387 | * @throws Text_LanguageDetect_Exception | |
42c80841 NL |
388 | */ |
389 | function getLanguages() | |
390 | { | |
3ec62cf9 MR |
391 | return $this->_convertToNameMode( |
392 | array_keys($this->_lang_db) | |
393 | ); | |
42c80841 NL |
394 | } |
395 | ||
396 | /** | |
397 | * Make this object behave like Language::Guess | |
3ec62cf9 MR |
398 | * |
399 | * @param bool $setting false to turn off perl compatibility | |
400 | * | |
401 | * @return void | |
42c80841 | 402 | */ |
3ec62cf9 | 403 | public function setPerlCompatible($setting = true) |
42c80841 NL |
404 | { |
405 | if (is_bool($setting)) { // input check | |
406 | $this->_perl_compatible = $setting; | |
407 | ||
408 | if ($setting == true) { | |
409 | $this->_max_score = $this->_threshold; | |
410 | } else { | |
411 | $this->_max_score = 0; | |
412 | } | |
413 | } | |
414 | ||
415 | } | |
416 | ||
3ec62cf9 MR |
417 | /** |
418 | * Sets how language names are accepted and returned. | |
419 | * | |
420 | * @param integer $name_mode One of the following modes: | |
421 | * 0 - language name ("english") | |
422 | * 2 - 2-letter ISO 639-1 code ("en") | |
423 | * 3 - 3-letter ISO 639-2 code ("eng") | |
424 | * | |
425 | * @return void | |
426 | */ | |
427 | function setNameMode($name_mode) | |
428 | { | |
429 | $this->_name_mode = $name_mode; | |
430 | } | |
431 | ||
42c80841 NL |
432 | /** |
433 | * Whether to use unicode block ranges in detection | |
434 | * | |
435 | * Should speed up most detections if turned on (default is on). In some | |
436 | * circumstances it may be slower, such as for large text samples (> 10K) | |
437 | * in languages that use latin scripts. In other cases it should speed up | |
438 | * detection noticeably. | |
439 | * | |
3ec62cf9 MR |
440 | * @param bool $setting false to turn off |
441 | * | |
442 | * @return void | |
42c80841 | 443 | */ |
3ec62cf9 | 444 | public function useUnicodeBlocks($setting = true) |
42c80841 NL |
445 | { |
446 | if (is_bool($setting)) { | |
447 | $this->_use_unicode_narrowing = $setting; | |
448 | } | |
449 | } | |
450 | ||
451 | /** | |
452 | * Converts a piece of text into trigrams | |
453 | * | |
3ec62cf9 | 454 | * @param string $text text to convert |
42c80841 | 455 | * |
3ec62cf9 MR |
456 | * @return array array of trigram frequencies |
457 | * @access private | |
458 | * @deprecated Superseded by the Text_LanguageDetect_Parser class | |
42c80841 NL |
459 | */ |
460 | function _trigram($text) | |
461 | { | |
3ec62cf9 | 462 | $s = new Text_LanguageDetect_Parser($text); |
42c80841 NL |
463 | $s->prepareTrigram(); |
464 | $s->prepareUnicode(false); | |
465 | $s->setPadStart(!$this->_perl_compatible); | |
466 | $s->analyze(); | |
467 | return $s->getTrigramFreqs(); | |
468 | } | |
469 | ||
470 | /** | |
471 | * Converts a set of trigrams from frequencies to ranks | |
472 | * | |
473 | * Thresholds (cuts off) the list at $this->_threshold | |
474 | * | |
3ec62cf9 MR |
475 | * @param array $arr array of trigram |
476 | * | |
477 | * @return array ranks of trigrams | |
478 | * @access protected | |
42c80841 | 479 | */ |
3ec62cf9 | 480 | function _arr_rank($arr) |
42c80841 NL |
481 | { |
482 | ||
483 | // sorts alphabetically first as a standard way of breaking rank ties | |
484 | $this->_bub_sort($arr); | |
485 | ||
486 | // below might also work, but seemed to introduce errors in testing | |
487 | //ksort($arr); | |
488 | //asort($arr); | |
489 | ||
490 | $rank = array(); | |
491 | ||
492 | $i = 0; | |
493 | foreach ($arr as $key => $value) { | |
494 | $rank[$key] = $i++; | |
495 | ||
496 | // cut off at a standard threshold | |
497 | if ($i >= $this->_threshold) { | |
498 | break; | |
499 | } | |
500 | } | |
501 | ||
502 | return $rank; | |
503 | } | |
504 | ||
505 | /** | |
506 | * Sorts an array by value breaking ties alphabetically | |
3ec62cf9 MR |
507 | * |
508 | * @param array &$arr the array to sort | |
509 | * | |
510 | * @return void | |
511 | * @access private | |
42c80841 NL |
512 | */ |
513 | function _bub_sort(&$arr) | |
514 | { | |
515 | // should do the same as this perl statement: | |
3ec62cf9 MR |
516 | // sort { $trigrams{$b} == $trigrams{$a} |
517 | // ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } | |
42c80841 NL |
518 | |
519 | // needs to sort by both key and value at once | |
520 | // using the key to break ties for the value | |
521 | ||
522 | // converts array into an array of arrays of each key and value | |
523 | // may be a better way of doing this | |
524 | $combined = array(); | |
525 | ||
526 | foreach ($arr as $key => $value) { | |
527 | $combined[] = array($key, $value); | |
528 | } | |
529 | ||
530 | usort($combined, array($this, '_sort_func')); | |
531 | ||
532 | $replacement = array(); | |
533 | foreach ($combined as $key => $value) { | |
534 | list($new_key, $new_value) = $value; | |
535 | $replacement[$new_key] = $new_value; | |
536 | } | |
537 | ||
538 | $arr = $replacement; | |
539 | } | |
540 | ||
541 | /** | |
542 | * Sort function used by bubble sort | |
543 | * | |
3ec62cf9 | 544 | * Callback function for usort(). |
42c80841 | 545 | * |
3ec62cf9 MR |
546 | * @param array $a first param passed by usort() |
547 | * @param array $b second param passed by usort() | |
548 | * | |
549 | * @return int 1 if $a is greater, -1 if not | |
550 | * @see _bub_sort() | |
551 | * @access private | |
42c80841 NL |
552 | */ |
553 | function _sort_func($a, $b) | |
554 | { | |
555 | // each is actually a key/value pair, so that it can compare using both | |
556 | list($a_key, $a_value) = $a; | |
557 | list($b_key, $b_value) = $b; | |
558 | ||
42c80841 | 559 | if ($a_value == $b_value) { |
3ec62cf9 | 560 | // if the values are the same, break ties using the key |
42c80841 NL |
561 | return strcmp($a_key, $b_key); |
562 | ||
42c80841 | 563 | } else { |
3ec62cf9 | 564 | // if not, just sort normally |
42c80841 NL |
565 | if ($a_value > $b_value) { |
566 | return -1; | |
567 | } else { | |
568 | return 1; | |
569 | } | |
570 | } | |
571 | ||
572 | // 0 should not be possible because keys must be unique | |
573 | } | |
574 | ||
575 | /** | |
3ec62cf9 | 576 | * Calculates a linear rank-order distance statistic between two sets of |
42c80841 NL |
577 | * ranked trigrams |
578 | * | |
3ec62cf9 | 579 | * Sums the differences in rank for each trigram. If the trigram does not |
42c80841 NL |
580 | * appear in both, consider it a difference of $this->_threshold. |
581 | * | |
582 | * This distance measure was proposed by Cavnar & Trenkle (1994). Despite | |
583 | * its simplicity it has been shown to be highly accurate for language | |
584 | * identification tasks. | |
585 | * | |
3ec62cf9 MR |
586 | * @param array $arr1 the reference set of trigram ranks |
587 | * @param array $arr2 the target set of trigram ranks | |
588 | * | |
589 | * @return int the sum of the differences between the ranks of | |
590 | * the two trigram sets | |
591 | * @access private | |
42c80841 | 592 | */ |
3ec62cf9 | 593 | function _distance($arr1, $arr2) |
42c80841 NL |
594 | { |
595 | $sumdist = 0; | |
596 | ||
597 | foreach ($arr2 as $key => $value) { | |
598 | if (isset($arr1[$key])) { | |
599 | $distance = abs($value - $arr1[$key]); | |
600 | } else { | |
601 | // $this->_threshold sets the maximum possible distance value | |
602 | // for any one pair of trigrams | |
603 | $distance = $this->_threshold; | |
604 | } | |
605 | $sumdist += $distance; | |
606 | } | |
607 | ||
608 | return $sumdist; | |
609 | ||
610 | // todo: there are other distance statistics to try, e.g. relative | |
611 | // entropy, but they're probably more costly to compute | |
612 | } | |
613 | ||
614 | /** | |
615 | * Normalizes the score returned by _distance() | |
3ec62cf9 | 616 | * |
42c80841 NL |
617 | * Different if perl compatible or not |
618 | * | |
3ec62cf9 MR |
619 | * @param int $score the score from _distance() |
620 | * @param int $base_count the number of trigrams being considered | |
621 | * | |
622 | * @return float the normalized score | |
623 | * @see _distance() | |
624 | * @access private | |
42c80841 NL |
625 | */ |
626 | function _normalize_score($score, $base_count = null) | |
627 | { | |
628 | if ($base_count === null) { | |
629 | $base_count = $this->_threshold; | |
630 | } | |
631 | ||
632 | if (!$this->_perl_compatible) { | |
633 | return 1 - ($score / $base_count / $this->_threshold); | |
634 | } else { | |
635 | return floor($score / $base_count); | |
636 | } | |
637 | } | |
638 | ||
639 | ||
640 | /** | |
641 | * Detects the closeness of a sample of text to the known languages | |
642 | * | |
643 | * Calculates the statistical difference between the text and | |
644 | * the trigrams for each language, normalizes the score then | |
645 | * returns results for all languages in sorted order | |
646 | * | |
647 | * If perl compatible, the score is 300-0, 0 being most similar. | |
648 | * Otherwise, it's 0-1 with 1 being most similar. | |
3ec62cf9 | 649 | * |
42c80841 NL |
650 | * The $sample text should be at least a few sentences in length; |
651 | * should be ascii-7 or utf8 encoded; if it is in another encoding and the mbstring extension | |
652 | * is present it will try to detect and convert. However, experience has | |
3ec62cf9 | 653 | * shown that mb_detect_encoding() *does not work very well* with at least |
42c80841 NL |
654 | * some types of encoding. |
655 | * | |
3ec62cf9 MR |
656 | * @param string $sample a sample of text to compare. |
657 | * @param int $limit if specified, return an array of the most likely | |
658 | * $limit languages and their scores. | |
659 | * | |
660 | * @return mixed sorted array of language scores, blank array if no | |
661 | * useable text was found | |
662 | * @see _distance() | |
663 | * @throws Text_LanguageDetect_Exception | |
42c80841 | 664 | */ |
3ec62cf9 | 665 | public function detect($sample, $limit = 0) |
42c80841 | 666 | { |
42c80841 NL |
667 | // input check |
668 | if (!Text_LanguageDetect_Parser::validateString($sample)) { | |
669 | return array(); | |
670 | } | |
671 | ||
672 | // check char encoding | |
673 | // (only if mbstring extension is compiled and PHP > 4.0.6) | |
3ec62cf9 MR |
674 | if (function_exists('mb_detect_encoding') |
675 | && function_exists('mb_convert_encoding') | |
676 | ) { | |
42c80841 | 677 | // mb_detect_encoding isn't very reliable, to say the least |
3ec62cf9 MR |
678 | // detection should still work with a sufficient sample |
679 | // of ascii characters | |
42c80841 NL |
680 | $encoding = mb_detect_encoding($sample); |
681 | ||
682 | // mb_detect_encoding() will return FALSE if detection fails | |
683 | // don't attempt conversion if that's the case | |
3ec62cf9 MR |
684 | if ($encoding != 'ASCII' && $encoding != 'UTF-8' |
685 | && $encoding !== false | |
686 | ) { | |
687 | // verify the encoding exists in mb_list_encodings | |
688 | if (in_array($encoding, mb_list_encodings())) { | |
689 | $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); | |
42c80841 NL |
690 | } |
691 | } | |
692 | } | |
693 | ||
3ec62cf9 | 694 | $sample_obj = new Text_LanguageDetect_Parser($sample); |
42c80841 NL |
695 | $sample_obj->prepareTrigram(); |
696 | if ($this->_use_unicode_narrowing) { | |
697 | $sample_obj->prepareUnicode(); | |
698 | } | |
699 | $sample_obj->setPadStart(!$this->_perl_compatible); | |
700 | $sample_obj->analyze(); | |
701 | ||
702 | $trigram_freqs =& $sample_obj->getTrigramRanks(); | |
703 | $trigram_count = count($trigram_freqs); | |
704 | ||
705 | if ($trigram_count == 0) { | |
706 | return array(); | |
707 | } | |
708 | ||
709 | $scores = array(); | |
710 | ||
711 | // use unicode block detection to narrow down the possibilities | |
712 | if ($this->_use_unicode_narrowing) { | |
713 | $blocks =& $sample_obj->getUnicodeBlocks(); | |
714 | ||
715 | if (is_array($blocks)) { | |
716 | $present_blocks = array_keys($blocks); | |
717 | } else { | |
3ec62cf9 MR |
718 | throw new Text_LanguageDetect_Exception( |
719 | 'Error during block detection', | |
720 | Text_LanguageDetect_Exception::BLOCK_DETECTION | |
721 | ); | |
42c80841 NL |
722 | } |
723 | ||
724 | $possible_langs = array(); | |
725 | ||
726 | foreach ($present_blocks as $blockname) { | |
727 | if (isset($this->_unicode_map[$blockname])) { | |
728 | ||
729 | $possible_langs = array_merge( | |
730 | $possible_langs, | |
731 | array_keys($this->_unicode_map[$blockname]) | |
732 | ); | |
733 | ||
734 | // todo: faster way to do this? | |
735 | } | |
736 | } | |
737 | ||
738 | // could also try an intersect operation rather than a union | |
3ec62cf9 | 739 | // in other words, choose languages whose trigrams contain |
42c80841 NL |
740 | // ALL of the unicode blocks found in this sample |
741 | // would improve speed but would be completely thrown off by an | |
742 | // unexpected character, like an umlaut appearing in english text | |
743 | ||
744 | $possible_langs = array_intersect( | |
3ec62cf9 MR |
745 | array_keys($this->_lang_db), |
746 | array_unique($possible_langs) | |
42c80841 NL |
747 | ); |
748 | ||
3ec62cf9 | 749 | // needs to intersect it with the keys of _lang_db in case |
42c80841 NL |
750 | // languages have been omitted |
751 | ||
42c80841 | 752 | } else { |
3ec62cf9 | 753 | // or just try 'em all |
42c80841 NL |
754 | $possible_langs = array_keys($this->_lang_db); |
755 | } | |
756 | ||
757 | ||
758 | foreach ($possible_langs as $lang) { | |
3ec62cf9 MR |
759 | $scores[$lang] = $this->_normalize_score( |
760 | $this->_distance($this->_lang_db[$lang], $trigram_freqs), | |
761 | $trigram_count | |
762 | ); | |
42c80841 NL |
763 | } |
764 | ||
765 | unset($sample_obj); | |
766 | ||
767 | if ($this->_perl_compatible) { | |
768 | asort($scores); | |
769 | } else { | |
770 | arsort($scores); | |
771 | } | |
772 | ||
773 | // todo: drop languages with a score of $this->_max_score? | |
774 | ||
775 | // limit the number of returned scores | |
776 | if ($limit && is_numeric($limit)) { | |
777 | $limited_scores = array(); | |
778 | ||
779 | $i = 0; | |
42c80841 NL |
780 | foreach ($scores as $key => $value) { |
781 | if ($i++ >= $limit) { | |
782 | break; | |
783 | } | |
784 | ||
785 | $limited_scores[$key] = $value; | |
786 | } | |
787 | ||
3ec62cf9 | 788 | return $this->_convertToNameMode($limited_scores, true); |
42c80841 | 789 | } else { |
3ec62cf9 | 790 | return $this->_convertToNameMode($scores, true); |
42c80841 NL |
791 | } |
792 | } | |
793 | ||
794 | /** | |
795 | * Returns only the most similar language to the text sample | |
796 | * | |
797 | * Calls $this->detect() and returns only the top result | |
3ec62cf9 MR |
798 | * |
799 | * @param string $sample text to detect the language of | |
800 | * | |
801 | * @return string the name of the most likely language | |
802 | * or null if no language is similar | |
803 | * @see detect() | |
804 | * @throws Text_LanguageDetect_Exception | |
42c80841 | 805 | */ |
3ec62cf9 | 806 | public function detectSimple($sample) |
42c80841 NL |
807 | { |
808 | $scores = $this->detect($sample, 1); | |
809 | ||
810 | // if top language has the maximum possible score, | |
811 | // then the top score will have been picked at random | |
3ec62cf9 MR |
812 | if (!is_array($scores) || empty($scores) |
813 | || current($scores) == $this->_max_score | |
814 | ) { | |
42c80841 | 815 | return null; |
42c80841 | 816 | } else { |
3ec62cf9 | 817 | return key($scores); |
42c80841 NL |
818 | } |
819 | } | |
820 | ||
821 | /** | |
822 | * Returns an array containing the most similar language and a confidence | |
823 | * rating | |
3ec62cf9 | 824 | * |
42c80841 NL |
825 | * Confidence is a simple measure calculated from the similarity score |
826 | * minus the similarity score from the next most similar language | |
827 | * divided by the highest possible score. Languages that have closely | |
828 | * related cousins (e.g. Norwegian and Danish) should generally have lower | |
829 | * confidence scores. | |
830 | * | |
831 | * The similarity score answers the question "How likely is the text the | |
3ec62cf9 | 832 | * returned language regardless of the other languages considered?" The |
42c80841 NL |
833 | * confidence score is one way of answering the question "how likely is the |
834 | * text the detected language relative to the rest of the language model | |
835 | * set?" | |
836 | * | |
837 | * To see how similar languages are a priori, see languageSimilarity() | |
3ec62cf9 MR |
838 | * |
839 | * @param string $sample text for which language will be detected | |
840 | * | |
841 | * @return array most similar language, score and confidence rating | |
842 | * or null if no language is similar | |
843 | * @see detect() | |
844 | * @throws Text_LanguageDetect_Exception | |
42c80841 | 845 | */ |
3ec62cf9 | 846 | public function detectConfidence($sample) |
42c80841 NL |
847 | { |
848 | $scores = $this->detect($sample, 2); | |
849 | ||
3ec62cf9 | 850 | // if most similar language has the max score, it |
42c80841 | 851 | // will have been picked at random |
3ec62cf9 MR |
852 | if (!is_array($scores) || empty($scores) |
853 | || current($scores) == $this->_max_score | |
854 | ) { | |
42c80841 NL |
855 | return null; |
856 | } | |
857 | ||
3ec62cf9 | 858 | $arr['language'] = key($scores); |
42c80841 NL |
859 | $arr['similarity'] = current($scores); |
860 | if (next($scores) !== false) { // if false then no next element | |
861 | // the goal is to return a higher value if the distance between | |
862 | // the similarity of the first score and the second score is high | |
863 | ||
864 | if ($this->_perl_compatible) { | |
3ec62cf9 MR |
865 | $arr['confidence'] = (current($scores) - $arr['similarity']) |
866 | / $this->_max_score; | |
42c80841 NL |
867 | |
868 | } else { | |
42c80841 NL |
869 | $arr['confidence'] = $arr['similarity'] - current($scores); |
870 | ||
871 | } | |
872 | ||
873 | } else { | |
874 | $arr['confidence'] = null; | |
875 | } | |
876 | ||
877 | return $arr; | |
878 | } | |
879 | ||
880 | /** | |
881 | * Returns the distribution of unicode blocks in a given utf8 string | |
882 | * | |
883 | * For the block name of a single char, use unicodeBlockName() | |
3ec62cf9 MR |
884 | * |
885 | * @param string $str input string. Must be ascii or utf8 | |
886 | * @param bool $skip_symbols if true, skip ascii digits, symbols and | |
887 | * non-printing characters. Includes spaces, | |
888 | * newlines and common punctuation characters. | |
889 | * | |
42c80841 | 890 | * @return array |
3ec62cf9 | 891 | * @throws Text_LanguageDetect_Exception |
42c80841 | 892 | */ |
3ec62cf9 | 893 | public function detectUnicodeBlocks($str, $skip_symbols) |
42c80841 | 894 | { |
3ec62cf9 MR |
895 | $skip_symbols = (bool)$skip_symbols; |
896 | $str = (string)$str; | |
42c80841 | 897 | |
3ec62cf9 | 898 | $sample_obj = new Text_LanguageDetect_Parser($str); |
42c80841 NL |
899 | $sample_obj->prepareUnicode(); |
900 | $sample_obj->prepareTrigram(false); | |
901 | $sample_obj->setUnicodeSkipSymbols($skip_symbols); | |
902 | $sample_obj->analyze(); | |
3ec62cf9 | 903 | $blocks = $sample_obj->getUnicodeBlocks(); |
42c80841 NL |
904 | unset($sample_obj); |
905 | return $blocks; | |
906 | } | |
907 | ||
/**
 * Returns the block name for a given unicode value
 *
 * If passed a string, will assume it is being passed a single
 * UTF8-formatted character and will convert it to its unicode value
 * first. Otherwise the input has to be an integer unicode value.
 *
 * Make sure input is of the correct type!
 *
 * @param mixed $unicode unicode value (int) or utf8 char (string)
 *
 * @return mixed the block name string or false if not found
 * @throws Text_LanguageDetect_Exception if a string does not consist of
 *                                       exactly one character, or if the
 *                                       input is neither string nor int
 */
public function unicodeBlockName($unicode)
{
    if (is_string($unicode)) {
        // exactly one character is required. An empty string has no
        // code point at all; letting it through would make the lookup
        // below compare null against the first block and report that
        // block's name for it.
        if (self::utf8strlen($unicode) != 1) {
            throw new Text_LanguageDetect_Exception(
                'Pass a single char only to this method',
                Text_LanguageDetect_Exception::PARAM_TYPE
            );
        }
        $unicode = $this->_utf8char2unicode($unicode);

    } elseif (!is_int($unicode)) {
        throw new Text_LanguageDetect_Exception(
            'Input must be of type string or int.',
            Text_LanguageDetect_Exception::PARAM_TYPE
        );
    }

    $blocks = $this->_read_unicode_block_db();
    $result = $this->_unicode_block_name($unicode, $blocks);

    // -1 is the "not found" marker of _unicode_block_name(); anything
    // else is a block entry whose index 2 is read as the name
    if ($result === -1) {
        return false;
    }

    return $result[2];
}
951 | ||
/**
 * Searches the unicode block database
 *
 * Looks up the block entry a unicode value falls into. unicodeBlockName()
 * is the public interface for this function, which does input checks which
 * this function omits for speed.
 *
 * Each entry of $blocks is read as: index 0 = lowest value of the block,
 * index 1 = highest value of the block. The search below relies on the
 * entries being ordered by range.
 *
 * @param int   $unicode     the unicode value
 * @param array $blocks      the block database
 * @param int   $block_count the number of defined blocks in the database,
 *                           or -1 to have it counted here
 *
 * @return mixed the matching entry of $blocks, -1 if none matched
 * @see    unicodeBlockName()
 * @access protected
 */
function _unicode_block_name($unicode, $blocks, $block_count = -1)
{
    // for a reference, see
    // http://www.unicode.org/Public/UNIDATA/Blocks.txt

    // ascii characters are assumed to be the most common, so the first
    // block is tested before any searching is done
    $first = $blocks[0];
    if ($unicode <= $first[1]) {
        return $first;
    }

    // callers may hand in the block count so that count() does not have
    // to run on every lookup
    $upper = ($block_count != -1 ? $block_count : count($blocks)) - 1;
    $lower = 1; // entry 0 was already handled above

    // plain binary search over the remaining entries
    while ($lower <= $upper) {
        $probe = (int)floor(($lower + $upper) / 2);
        $entry = $blocks[$probe];

        if ($unicode < $entry[0]) {
            // below this block: continue in the lower half
            $upper = $probe - 1;
        } elseif ($unicode > $entry[1]) {
            // above this block: continue in the upper half
            $lower = $probe + 1;
        } else {
            // inside this block
            return $entry;
        }
    }

    // no block contains the value
    // todo: differentiate when it's out of range or when it falls
    //       into an unassigned range?
    return -1;
}
1012 | ||
/**
 * Brings up the unicode block database
 *
 * The database is read through _readdb() on first use and kept in a
 * static variable afterwards.
 *
 * @return array the database of unicode block definitions
 * @throws Text_LanguageDetect_Exception
 * @access protected
 */
function _read_unicode_block_db()
{
    // the unicode definitions never change, so the loaded table is kept
    // in a static and shared instead of being re-read for every call
    static $cache = null;

    if ($cache === null) {
        $cache = $this->_readdb($this->_unicode_db_filename);
    }

    return $cache;
}
1033 | ||
/**
 * Calculate the similarities between the language models
 *
 * Use this function to see how similar languages are to each other.
 *
 * If passed 2 language names, will return just those languages compared.
 * If passed 1 language name, will return that language compared to
 * all others.
 * If passed none, will return an array of every language model compared
 * to every other one.
 *
 * Language names are matched case-insensitively (both are lower-cased
 * before they are looked up).
 *
 * @param string $lang1 the name of the first language to be compared
 * @param string $lang2 the name of the second language to be compared
 *
 * @return array scores of every language compared
 *               or the score of just the provided languages
 *               or null if one of the supplied languages does not exist
 * @throws Text_LanguageDetect_Exception
 */
public function languageSimilarity($lang1 = null, $lang2 = null)
{
    // only translate what was actually supplied; an omitted argument
    // stays null instead of being pushed through the converter
    if ($lang1 !== null) {
        $lang1 = $this->_convertFromNameMode($lang1);
    }
    if ($lang2 !== null) {
        $lang2 = $this->_convertFromNameMode($lang2);
    }

    // loose comparison kept on purpose: an empty name is treated the
    // same way as "no language given"
    if ($lang1 == null) {
        // compare all languages to each other
        $matrix = array();
        $names  = array_keys($this->_lang_db);
        foreach ($names as $row) {
            foreach ($names as $col) {
                // skip comparing languages to themselves
                if ($row == $col) {
                    continue;
                }

                if (isset($matrix[$col][$row])) {
                    // the mirrored pair is already known, reuse it
                    $matrix[$row][$col] = $matrix[$col][$row];
                } else {
                    $matrix[$row][$col] = $this->_normalize_score(
                        $this->_distance(
                            $this->_lang_db[$row],
                            $this->_lang_db[$col]
                        )
                    );
                }
            }
        }

        return $matrix;
    }

    $lang1 = strtolower($lang1);
    if (!isset($this->_lang_db[$lang1])) {
        // no model for the first language
        return null;
    }

    if ($lang2 == null) {
        // compare just $lang1 to all languages
        $return_arr = array();
        foreach ($this->_lang_db as $key => $value) {
            // don't compare a language to itself
            if ($key != $lang1) {
                $return_arr[$key] = $this->_normalize_score(
                    $this->_distance($this->_lang_db[$lang1], $value)
                );
            }
        }
        asort($return_arr);

        return $return_arr;
    }

    // lower-case BEFORE the existence check, exactly as for $lang1;
    // checking first made a mixed-case second language look unknown
    $lang2 = strtolower($lang2);
    if (!isset($this->_lang_db[$lang2])) {
        // no model for the second language
        return null;
    }

    // compare just these two languages
    return $this->_normalize_score(
        $this->_distance(
            $this->_lang_db[$lang1],
            $this->_lang_db[$lang2]
        )
    );
}
1128 | ||
/**
 * Cluster known languages according to languageSimilarity()
 *
 * WARNING: this method is EXPERIMENTAL. It is not recommended for common
 * use, and it may disappear or its functionality may change in future
 * releases without notice.
 *
 * Uses a nearest neighbor technique to generate the maximum possible
 * number of dendograms from the similarity data: in every round the pair
 * of open entries with the highest value in the similarity table is
 * merged into one new entry named "<kept>:<dropped>".
 *
 * The result is stored on the object and returned as-is on later calls.
 *
 * @access public
 * @return array language cluster data with the keys
 *               'open_forks' (entries left unmerged),
 *               'fork_data'  (one record per merge) and
 *               'name_map'   (merged key => language it stands for)
 * @throws Text_LanguageDetect_Exception
 * @see    languageSimilarity()
 * @deprecated this function will eventually be removed and placed into
 *              the model generation class
 */
function clusterLanguages()
{
    // todo: set the maximum number of clusters

    // return cached result, if any
    if (isset($this->_clusters)) {
        return $this->_clusters;
    }

    $names  = array_keys($this->_lang_db);
    $matrix = $this->languageSimilarity();
    sort($names);

    foreach ($names as $lang) {
        if (!isset($this->_lang_db[$lang])) {
            throw new Text_LanguageDetect_Exception(
                "missing $lang!",
                Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
            );
        }
    }

    // http://www.psychstat.missouristate.edu/multibook/mlt04m.html

    // the open entries, keyed by their own name and in sorted order
    $forks = array();
    foreach ($names as $lang) {
        $forks[$lang] = $lang;
    }

    $fork_data = array();
    $name_map  = array();

    $i = 0; // round counter, capped at 200 rounds
    while (count($forks) > 2 && $i++ < 200) {
        // find the pair of open entries with the highest table value
        $best_score = -1;
        $best_a     = '';
        $best_b     = '';
        foreach ($forks as $fork_a) {
            foreach ($forks as $fork_b) {
                if ($fork_a != $fork_b
                    && $matrix[$fork_a][$fork_b] > $best_score
                ) {
                    $best_score = $matrix[$fork_a][$fork_b];
                    $best_a     = $fork_a;
                    $best_b     = $fork_b;
                }
            }
        }

        if (!$best_a) {
            // should not ever happen
            throw new Text_LanguageDetect_Exception(
                "no highest key? (step: $i)",
                Text_LanguageDetect_Exception::NO_HIGHEST_KEY
            );
        }

        if ($best_score == 0) {
            // languages are perfectly dissimilar
            break;
        }

        // $best_a and $best_b are most similar
        $sum_a = array_sum($matrix[$best_a]);
        $sum_b = array_sum($matrix[$best_b]);

        // use the score for the one that is most similar to the rest of
        // the field as the score for the group
        // todo: could try averaging or "centroid" method instead
        //       seems like that might make more sense
        //       actually nearest neighbor may be better for binary
        //       searching

        // for "Complete Linkage"/"furthest neighbor" sign should be <
        // for "Single Linkage"/"nearest neighbor" method should be >
        // results seem to be pretty much the same with either method

        // the entry with the larger row sum survives, the other goes
        if ($sum_a > $sum_b) {
            $kept    = $best_a;
            $dropped = $best_b;
        } else {
            $kept    = $best_b;
            $dropped = $best_a;
        }

        $merged = $kept . ':' . $dropped;

        // follow earlier merges back to the language the kept entry
        // really stands for, and record that for the new key
        $origin = $kept;
        while (isset($name_map[$origin])) {
            $origin = $name_map[$origin];
        }
        $name_map[$merged] = $origin;

        // rewrite the table: the kept entry moves to the merged key,
        // everything belonging to the dropped entry is removed.
        // The loops walk copies while $matrix itself is modified; the
        // three checks are deliberately run in this order for every cell.
        foreach ($matrix as $row_key => $row) {
            foreach ($row as $col_key => $ignored) {
                if ($col_key == $kept) {
                    // column $kept becomes column $merged
                    $matrix[$row_key][$merged] = $matrix[$row_key][$col_key];
                    unset($matrix[$row_key][$col_key]);
                }

                if ($row_key == $kept) {
                    // row $kept becomes row $merged
                    $matrix[$merged][$col_key] = $matrix[$row_key][$col_key];
                    unset($matrix[$row_key][$col_key]);
                }

                if ($row_key == $dropped || $col_key == $dropped) {
                    unset($matrix[$row_key][$col_key]);
                }
            }
        }

        unset($forks[$best_a], $forks[$best_b]);
        $forks[$merged] = $merged;

        // some of these may be overkill
        $fork_data[$merged] = array(
            'newkey'   => $merged,
            'count'    => $i,
            'diff'     => abs($sum_a - $sum_b),
            'score'    => $best_score,
            'bestfit'  => $kept,
            'otherfit' => $dropped,
            'really'   => $origin,
        );
    }

    $clusters = array(
        // the top level of clusters
        // clusters that are mutually exclusive
        // or specified by a specific maximum
        'open_forks' => $forks,

        // data for each split
        'fork_data' => $fork_data,

        // which cluster is really which language
        // using the nearest neighbor technique, the cluster
        // inherits all of the properties of its most-similar member
        // this keeps track
        'name_map' => $name_map,
    );

    // saves the result in the object
    $this->_clusters = $clusters;

    return $clusters;
}
1308 | ||
1309 | ||
/**
 * Perform an intelligent detection based on clusterLanguages()
 *
 * WARNING: this method is EXPERIMENTAL. It is not recommended for common
 * use, and it may disappear or its functionality may change in future
 * releases without notice.
 *
 * This compares the sample text to the top level of clusters. If the
 * sample is similar to the cluster it will drop down and compare it to the
 * languages in the cluster, and so on until it hits a leaf node.
 *
 * This should find the language in considerably fewer compares
 * (the equivalent of a binary search), however clusterLanguages() is costly
 * and the loss of accuracy from this technique is significant.
 *
 * This method may need to be 'fuzzier' in order to become more accurate.
 *
 * This function could be more useful if the universe of possible languages
 * was very large, however in such cases some method of Bayesian inference
 * might be more helpful.
 *
 * @param string $str input string
 *
 * @return array language scores (only those compared); an empty array
 *               when the input is rejected, yields no trigrams, or there
 *               is nothing to compare against
 * @throws Text_LanguageDetect_Exception
 * @see    clusterLanguages()
 */
public function clusteredSearch($str)
{
    // input check
    if (!Text_LanguageDetect_Parser::validateString($str)) {
        return array();
    }

    // clusterLanguages() will return a cached result if possible
    // so it's safe to call it every time
    $clusters   = $this->clusterLanguages();
    $open_forks = $clusters['open_forks'];
    $fork_data  = $clusters['fork_data'];
    $name_map   = $clusters['name_map'];

    $parser = new Text_LanguageDetect_Parser($str);
    $parser->prepareTrigram();
    $parser->setPadStart(!$this->_perl_compatible);
    $parser->analyze();
    $sample_ranks = $parser->getTrigramRanks();
    $sample_count = count($sample_ranks);

    // input check
    if ($sample_count == 0) {
        return array();
    }

    // score the sample against every top-level entry; a merged entry is
    // scored with the model of the language it stands for
    $scores = array();
    foreach ($open_forks as $lang) {
        $lang_key = isset($name_map[$lang]) ? $name_map[$lang] : $lang;

        $scores[$lang] = $this->_normalize_score(
            $this->_distance($this->_lang_db[$lang_key], $sample_ranks),
            $sample_count
        );
    }

    // nothing to compare against: honour the documented array return
    // instead of sorting an undefined variable
    if (empty($scores)) {
        return array();
    }

    if ($this->_perl_compatible) {
        asort($scores);
    } else {
        arsort($scores);
    }

    // of the starting forks, the first one after sorting is taken as the
    // most similar to the sample
    reset($scores);
    $cur_key = key($scores);

    // walk down the tree for as long as the current key is a merge
    while (isset($fork_data[$cur_key])) {
        $lang1 = $fork_data[$cur_key]['bestfit'];
        $lang2 = $fork_data[$cur_key]['otherfit'];

        foreach (array($lang1, $lang2) as $lang) {
            $lang_key = isset($name_map[$lang]) ? $name_map[$lang] : $lang;

            $scores[$lang] = $this->_normalize_score(
                $this->_distance($this->_lang_db[$lang_key], $sample_ranks),
                $sample_count
            );

            //todo: does not need to do same comparison again
        }

        // NOTE(review): the numerically higher score wins here in both
        // modes, although the top-level ranking above treats the lowest
        // score as best in perl-compatible mode -- confirm this is
        // intended
        $cur_key = ($scores[$lang1] > $scores[$lang2]) ? $lang1 : $lang2;
    }

    // rather than sorting the result, preserve it so that you can see
    // which paths the algorithm decided to take along the tree

    // but sometimes the last item is only the second highest; in that
    // case the second-to-last item is moved to the very end. This needs
    // at least two entries: with a single one there is no predecessor,
    // and the former end()/prev() test then wrote a bogus '' => false
    // entry into the result.
    if (count($scores) > 1) {
        $keys       = array_keys($scores);
        $last_key   = $keys[count($keys) - 1];
        $prev_key   = $keys[count($keys) - 2];
        $last_score = $scores[$last_key];
        $prev_score = $scores[$prev_key];

        if ($this->_perl_compatible) {
            $swap = $last_score > $prev_score;
        } else {
            $swap = $last_score < $prev_score;
        }

        if ($swap) {
            // re-inserting the key appends it after the former last item
            unset($scores[$prev_key]);
            $scores[$prev_key] = $prev_score;
        }
    }

    if (!$this->_perl_compatible) {
        // keep the keys while reversing
        $scores = array_reverse($scores, true);
    }

    return $scores;
}
1453 | ||
/**
 * utf8-safe strlen()
 *
 * Returns the number of characters (not bytes) in a utf8 string
 *
 * @param string $str string to get the length of
 *
 * @return int number of chars
 */
public static function utf8strlen($str)
{
    // idea stolen from dokuwiki:
    // utf8_decode() turns every character it cannot represent into a
    // single '?', so after decoding each character occupies exactly one
    // byte and a plain strlen() yields the character count
    $one_byte_per_char = utf8_decode($str);

    return strlen($one_byte_per_char);
}
1472 | ||
/**
 * Returns the unicode value of a utf8 char
 *
 * The byte length of the input decides how it is decoded; the bytes are
 * not validated beyond that.
 *
 * @param string $char a utf8 (possibly multi-byte) char
 *
 * @return int unicode value; null when $char is not 1 to 4 bytes long
 * @access protected
 * @link   http://en.wikipedia.org/wiki/UTF-8
 */
function _utf8char2unicode($char)
{
    // string offsets are read with [] throughout: the {} offset syntax
    // used here before is no longer accepted by current PHP versions

    // strlen() here will actually get the binary length of a single char
    switch (strlen($char)) {
    case 1:
        // normal ASCII-7 byte
        // 0xxxxxxx --> 0xxxxxxx
        return ord($char[0]);

    case 2:
        // 2 byte unicode
        // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
        $z = (ord($char[0]) & 0x000001F) << 6;
        $x = (ord($char[1]) & 0x0000003F);
        return ($z | $x);

    case 3:
        // 3 byte unicode
        // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
        $z  = (ord($char[0]) & 0x0000000F) << 12;
        $x1 = (ord($char[1]) & 0x0000003F) << 6;
        $x2 = (ord($char[2]) & 0x0000003F);
        return ($z | $x1 | $x2);

    case 4:
        // 4 byte unicode
        // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
        // 000zzzzz xxxxxxxx xxxxxxxx
        $z1 = (ord($char[0]) & 0x00000007) << 18;
        $z2 = (ord($char[1]) & 0x0000003F) << 12;
        $x1 = (ord($char[2]) & 0x0000003F) << 6;
        $x2 = (ord($char[3]) & 0x0000003F);
        return ($z1 | $z2 | $x1 | $x2);

    default:
        // empty or longer than 4 bytes: same (formerly implicit) result
        return null;
    }
}
1517 | ||
/**
 * utf8-safe fast character iterator
 *
 * Will get the next character starting from $counter, which will then be
 * incremented. If a multi-byte char the bytes will be concatenated and
 * $counter will be incremented by the number of bytes in the char.
 *
 * With $special_convert set, ascii A-Z is lower-cased and every other
 * ascii byte that is not a-z, a space or an apostrophe (digits included)
 * is replaced by a space. Of the two-byte characters, the Latin-1
 * capitals with lead byte 195 and the Cyrillic capitals with lead byte
 * 208 are lower-cased; lower-casing of other non-ascii characters is
 * still incomplete.
 *
 * @param string $str             the string being iterated over
 * @param int    &$counter        the iterator, will increment by reference
 * @param bool   $special_convert whether to do special conversions
 *
 * @return char the next (possibly multi-byte) char from $counter; nothing
 *              (null) when the byte at $counter is not a valid lead byte
 * @access private
 */
static function _next_char($str, &$counter, $special_convert = false)
{
    // string offsets are read with [] throughout: the {} offset syntax
    // used here before is no longer accepted by current PHP versions
    $char = $str[$counter++];
    $ord  = ord($char);

    // for a description of the utf8 system see
    // http://www.phpclasses.org/browse/file/5131.html

    // normal ascii one byte char
    if ($ord <= 127) {
        // special conversions needed for this package
        // (that only apply to regular ascii characters)
        if ($special_convert && $char !== ' ' && $char !== "'") {
            if ($ord >= 65 && $ord <= 90) { // A-Z
                $char = chr($ord + 32); // lower case
            } elseif ($ord < 97 || $ord > 122) { // NOT a-z
                $char = ' '; // convert to space
            }
        }

        return $char;
    }

    // two-byte char
    if ($ord >> 5 == 6) {
        $nextchar = $str[$counter++]; // get next byte

        if ($special_convert) {
            $nextord = ord($nextchar);

            if ($ord == 195) {
                // lower case latin accented characters
                // for a reference, see
                // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html
                $latin1 = $nextord + 64;

                // À - Þ but not ×
                if ($latin1 >= 192 && $latin1 <= 222 && $latin1 != 215) {
                    $nextchar = chr($nextord + 32);
                }

            } elseif ($ord == 208) {
                // lower case cyrillic alphabet
                if ($nextord >= 144 && $nextord <= 159) {
                    // A - Pe: the lower case form keeps the lead byte
                    $nextchar = chr($nextord + 32);

                } elseif ($nextord >= 160 && $nextord <= 175) {
                    // Er - Ya: the lower case form uses lead byte 209
                    $char     = chr(209);
                    $nextchar = chr($nextord - 32);
                }
            }
        }

        // tag on next byte
        return $char . $nextchar;
    }

    // three-byte char: tag on next 2 bytes
    if ($ord >> 4 == 14) {
        $byte2 = $str[$counter++];
        $byte3 = $str[$counter++];

        return $char . $byte2 . $byte3;
    }

    // four-byte char: tag on next 3 bytes
    if ($ord >> 3 == 30) {
        $byte2 = $str[$counter++];
        $byte3 = $str[$counter++];
        $byte4 = $str[$counter++];

        return $char . $byte2 . $byte3 . $byte4;
    }

    // not a valid lead byte; $counter has still moved on by one
    return null;
}
1611 | ||
3ec62cf9 MR |
/**
 * Converts a $language input parameter from the configured mode
 * to the language name that is used internally.
 *
 * Works for strings and arrays. In name mode 0 the input is returned
 * untouched; mode 2 converts through code2ToName(), any other mode
 * through code3ToName().
 *
 * @param string|array $lang       A language description
 *                                 ("english"/"en"/"eng")
 * @param boolean      $convertKey If $lang is an array, setting
 *                                 $convertKey converts the keys instead
 *                                 of the values to the language name.
 *
 * @return string|array Language name(s); an empty array for a null
 *                      input when a conversion mode is active
 */
function _convertFromNameMode($lang, $convertKey = false)
{
    if ($this->_name_mode == 0) {
        return $lang;
    }

    $method = ($this->_name_mode == 2) ? 'code2ToName' : 'code3ToName';

    if (is_string($lang)) {
        return (string)Text_LanguageDetect_ISO639::$method($lang);
    }

    $newlang = array();

    // callers such as languageSimilarity() pass null for "not supplied".
    // That used to fall into the loop below and produce an empty array
    // plus a warning; keep the result, drop the warning.
    if ($lang === null) {
        return $newlang;
    }

    foreach ($lang as $key => $val) {
        if ($convertKey) {
            $newkey = (string)Text_LanguageDetect_ISO639::$method($key);
            $newlang[$newkey] = $val;
        } else {
            $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val);
        }
    }

    return $newlang;
}
42c80841 | 1651 | |
3ec62cf9 MR |
/**
 * Converts a $language output parameter from the language name that is
 * used internally to the configured mode.
 *
 * Works for strings and arrays. In name mode 0 the input is returned
 * untouched; mode 2 converts through nameToCode2(), any other mode
 * through nameToCode3().
 *
 * @param string|array $lang       Internal language name(s)
 * @param boolean      $convertKey If $lang is an array, setting
 *                                 $convertKey converts the keys instead
 *                                 of the values.
 *
 * @return string|array Language in the configured mode; an empty array
 *                      for a null input when a conversion mode is active
 */
function _convertToNameMode($lang, $convertKey = false)
{
    if ($this->_name_mode == 0) {
        return $lang;
    }

    $method = ($this->_name_mode == 2) ? 'nameToCode2' : 'nameToCode3';

    if (is_string($lang)) {
        return Text_LanguageDetect_ISO639::$method($lang);
    }

    $newlang = array();

    // a null input used to fall into the loop below and produce an
    // empty array plus a warning; keep the result, drop the warning
    // (same handling as in _convertFromNameMode())
    if ($lang === null) {
        return $newlang;
    }

    foreach ($lang as $key => $val) {
        if ($convertKey) {
            $newkey = Text_LanguageDetect_ISO639::$method($key);
            $newlang[$newkey] = $val;
        } else {
            $newlang[$key] = Text_LanguageDetect_ISO639::$method($val);
        }
    }

    return $newlang;
}
1691 | } | |
42c80841 | 1692 | |
3ec62cf9 | 1693 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ |