]>
Commit | Line | Data |
---|---|---|
42c80841 NL |
1 | <?php |
2 | ||
3 | /** | |
4 | * Detects the language of a given piece of text. | |
5 | * | |
6 | * Attempts to detect the language of a sample of text by correlating ranked | |
7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. | |
8 | * | |
9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle | |
10 | * (1994): "N-Gram-Based Text Categorization" | |
11 | * | |
12 | * PHP versions 4 and 5 | |
13 | * | |
14 | * @category Text | |
15 | * @package Text_LanguageDetect | |
16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | |
17 | * @copyright 2005-2006 Nicholas Pisarro | |
18 | * @license http://www.debian.org/misc/bsd.license BSD | |
19 | * @version CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $ | |
20 | * @link http://pear.php.net/package/Text_LanguageDetect/ | |
21 | * @link http://langdetect.blogspot.com/ | |
22 | */ | |
23 | ||
24 | //require_once 'PEAR.php'; | |
25 | require_once 'Parser.php'; | |
26 | ||
27 | /** | |
28 | * Language detection class | |
29 | * | |
30 | * Requires the language model database (lang.dat) that should have | |
31 | * accompanied this class definition in order to be instantiated. | |
32 | * | |
33 | * Example usage: | |
34 | * | |
35 | * <code> | |
36 | * require_once 'Text/LanguageDetect.php'; | |
37 | * | |
38 | * $l = new Text_LanguageDetect; | |
39 | * | |
40 | * $stdin = fopen('php://stdin', 'r'); | |
41 | * | |
42 | * echo "Supported languages:\n"; | |
43 | * | |
44 | * $langs = $l->getLanguages(); | |
45 | * if (PEAR::isError($langs)) { | |
46 | * die($langs->getMessage()); | |
47 | * } | |
48 | * | |
49 | * sort($langs); | |
50 | * echo join(', ', $langs); | |
51 | * | |
52 | * while ($line = fgets($stdin)) { | |
53 | * print_r($l->detect($line, 4)); | |
54 | * } | |
55 | * </code> | |
56 | * | |
57 | * @category Text | |
58 | * @package Text_LanguageDetect | |
59 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | |
60 | * @copyright 2005 Nicholas Pisarro | |
61 | * @license http://www.debian.org/misc/bsd.license BSD | |
62 | * @version Release: @package_version@ | |
63 | * @todo allow users to generate their own language models | |
64 | */ | |
65 | ||
66 | class Text_LanguageDetect | |
67 | { | |
/**
 * Path of the serialized trigram (language model) database.
 *
 * Handed to _readdb() by the constructor and forwarded to every
 * Text_LanguageDetect_Parser this class creates.
 *
 * @var string
 * @access private
 */
public $_db_filename = './lang.dat';

/**
 * Path of the unicode block definition file.
 *
 * Forwarded to every Text_LanguageDetect_Parser this class creates.
 *
 * @var string
 * @access private
 */
public $_unicode_db_filename = './unicode_blocks.dat';

/**
 * Data directory placeholder.
 *
 * Meant to be substituted by the PEAR installer.
 * NOTE(review): no method visible in this part of the file reads it.
 *
 * @var string
 * @access private
 */
public $_data_dir = '@data_dir@';

/**
 * Trigram rank tables, keyed by language name.
 *
 * Filled by the constructor from the 'trigram' section of the database.
 * _setup_ok() rejects the object when this is not a non-empty array.
 *
 * @var array
 * @access private
 */
public $_lang_db = array();

/**
 * Map of unicode block name => array keyed by language name.
 *
 * Filled from the optional 'trigram-unicodemap' section of the database
 * and used by detect() to narrow down the candidate languages.
 *
 * @access private
 * @var array
 */
public $_unicode_map;

/**
 * Maximum number of ranked trigrams per profile.
 *
 * Also the distance charged for a trigram that is absent from a language
 * profile, see _distance().
 *
 * @var int
 * @access private
 */
public $_threshold = 300;

/**
 * Score value that means "nothing matched".
 *
 * 0 by default; setPerlCompatible(true) sets it to $_threshold.
 *
 * @access private
 * @var int
 * @see setPerlCompatible()
 */
public $_max_score = 0;

/**
 * Whether scores mimic perl's Language::Guess.
 *
 * @access private
 * @var bool
 * @see setPerlCompatible()
 */
public $_perl_compatible = false;

/**
 * Whether detect() narrows the candidate languages by unicode block first.
 *
 * @access private
 * @var bool
 * @see useUnicodeBlocks()
 */
public $_use_unicode_narrowing = true;

/**
 * Cached clustering data.
 *
 * Loaded from the optional 'trigram-clusters' section of the database and
 * discarded by omitLanguages() whenever languages are removed.
 *
 * @access private
 * @var array
 * @see clusterLanguages()
 */
public $_clusters;
166 | ||
/**
 * Constructor
 *
 * Reads the language database and copies its sections onto the object.
 * A database that unserializes to something other than the expected array
 * leaves $_lang_db as null, which _setup_ok() reports on first use.
 *
 * Declared as __construct() so that it still runs on PHP 8, where a method
 * named after the class is no longer treated as a constructor.
 *
 * @access public
 * @param  string $db          path of the trigram database, null keeps the default
 * @param  string $unicode_db  path of the unicode block database, null keeps the default
 * @throws Exception if the language database is missing or unreadable
 */
function __construct($db = null, $unicode_db = null)
{
    if (isset($db)) {
        $this->_db_filename = $db;
    }
    if (isset($unicode_db)) {
        $this->_unicode_db_filename = $unicode_db;
    }

    $data = $this->_readdb($this->_db_filename);

    // unserialize() returns false for a corrupt file; do not index into it
    if (is_array($data) && isset($data['trigram'])) {
        $this->_lang_db = $data['trigram'];
    } else {
        $this->_lang_db = null;
    }

    if (isset($data['trigram-unicodemap'])) {
        $this->_unicode_map = $data['trigram-unicodemap'];
    }

    // Not yet implemented:
    if (isset($data['trigram-clusters'])) {
        $this->_clusters = $data['trigram-clusters'];
    }
}

/**
 * PHP 4 style constructor, kept so existing explicit calls
 * (e.g. parent::Text_LanguageDetect()) keep working.
 *
 * @access public
 * @param  string $db          see __construct()
 * @param  string $unicode_db  see __construct()
 * @throws Exception see __construct()
 */
function Text_LanguageDetect($db = null, $unicode_db = null)
{
    $this->__construct($db, $unicode_db);
}
191 | ||
/**
 * Resolves the location of a data file
 *
 * Currently the identity function: the given name is used verbatim.
 *
 * @access private
 * @param  string $fname file name as configured
 * @return string path that will be opened
 */
function _get_data_loc($fname)
{
    $location = $fname;

    return $location;
}
202 | ||
/**
 * Loads the language trigram database from filename
 *
 * The trigram database is a serialize()'d array.
 *
 * @access private
 * @param  string $fname the filename where the data is stored
 * @return mixed  the unserialized data; false when the file content is not
 *                valid serialized data
 * @throws Exception if the file does not exist, is not readable or cannot
 *                   be read
 */
function _readdb($fname)
{
    // finds the correct data dir
    $fname = $this->_get_data_loc($fname);

    // input check
    if (!file_exists($fname)) {
        throw new Exception('Language database does not exist.');
    }
    if (!is_readable($fname)) {
        throw new Exception('Language database is not readable.');
    }

    // file_get_contents() exists in every PHP version that supports the
    // exceptions used above, so no output-buffer fallback is needed
    $contents = file_get_contents($fname);
    if ($contents === false) {
        // e.g. the path is a directory or vanished after the checks above
        throw new Exception('Language database could not be read.');
    }

    // NOTE(review): unserialize() must only be fed trusted files; never
    // point the database path at user-supplied data
    return unserialize($contents);
}
237 | ||
238 | ||
/**
 * Checks if this object is ready to detect languages
 *
 * Failure is reported by exception only; this method never returns false
 * and never assigns $err. The parameter is kept for the callers that
 * still pass it.
 *
 * @access private
 * @param  mixed &$err unused, left untouched
 * @return bool  always true when it returns
 * @throws Exception if the language database is not an array or is empty
 */
function _setup_ok(&$err)
{
    if (!is_array($this->_lang_db)) {
        if (ini_get('magic_quotes_runtime')) {
            throw new Exception('Error loading database. Try turning magic_quotes_runtime off.');
        }
        throw new Exception('Language database is not an array.');
    }

    if (empty($this->_lang_db)) {
        throw new Exception('Language database has no elements.');
    }

    return true;
}
264 | ||
/**
 * Omits languages
 *
 * Pass this function the name of or an array of names of
 * languages that you don't want considered.
 *
 * If you're only expecting a limited set of languages, this can greatly
 * speed up processing.
 *
 * Names are matched case-insensitively, for a single name as well as for
 * every member of an array.
 *
 * @access public
 * @param  mixed $omit_list    language name or array of names to omit
 * @param  bool  $include_only if true will include (rather than
 *                             exclude) only those in the list
 * @return int   number of languages successfully deleted
 * @throws Exception if the language database is not usable
 */
function omitLanguages($omit_list, $include_only = false)
{
    // setup check
    if (!$this->_setup_ok($err)) {
        return $err;
    }

    // normalise the input to a list of lower-cased names, so that both
    // modes and both input shapes are case-insensitive
    if (!is_array($omit_list)) {
        $omit_list = array($omit_list);
    }
    foreach ($omit_list as $key => $omit_lang) {
        $omit_list[$key] = strtolower($omit_lang);
    }

    $deleted = 0;

    if (!$include_only) {
        // deleting the given languages
        foreach ($omit_list as $omit_lang) {
            if (isset($this->_lang_db[$omit_lang])) {
                unset($this->_lang_db[$omit_lang]);
                $deleted++;
            }
        }
    } else {
        // deleting all except the given languages
        foreach (array_keys($this->_lang_db) as $lang) {
            if (!in_array($lang, $omit_list)) {
                unset($this->_lang_db[$lang]);
                $deleted++;
            }
        }
    }

    // reset the cluster cache if the number of languages changes
    // this will then have to be recalculated
    if (isset($this->_clusters) && $deleted > 0) {
        unset($this->_clusters);
    }

    return $deleted;
}
335 | ||
336 | ||
/**
 * Counts the languages this object can currently detect
 *
 * @access public
 * @return int number of entries in the language database
 * @throws Exception if the language database is not usable
 */
function getLanguageCount()
{
    if ($this->_setup_ok($err)) {
        return count($this->_lang_db);
    }

    return $err;
}
352 | ||
/**
 * Tells whether a language model is available
 *
 * Given an array of names, answers true only when every one of them is
 * available. Names are compared in lower case.
 *
 * @access public
 * @param  mixed $lang language name or array of language names
 * @return bool  true if the language model(s) exist
 * @throws Exception if $lang is neither a string nor an array, or if the
 *                   language database is not usable
 */
function languageExists($lang)
{
    if (!$this->_setup_ok($err)) {
        return $err;
    }

    // single name
    if (is_string($lang)) {
        return isset($this->_lang_db[strtolower($lang)]);
    }

    // anything but a list of names is an error
    if (!is_array($lang)) {
        throw new Exception('Unknown type passed to languageExists()');
    }

    // list of names: the first missing one settles it
    foreach ($lang as $name) {
        if (!isset($this->_lang_db[strtolower($name)])) {
            return false;
        }
    }

    return true;
}
387 | ||
/**
 * Lists the detectable languages
 *
 * @access public
 * @return array names (keys of the language database) known to this object
 * @throws Exception if the language database is not usable
 */
function getLanguages()
{
    if ($this->_setup_ok($err)) {
        return array_keys($this->_lang_db);
    }

    return $err;
}
403 | ||
/**
 * Switches Language::Guess compatible scoring on or off
 *
 * Also adjusts the "no match" score: $_threshold when compatible,
 * 0 otherwise. Non-boolean input is ignored.
 *
 * @access public
 * @param  bool $setting false to turn off perl compatibility
 */
function setPerlCompatible($setting = true)
{
    // input check
    if (!is_bool($setting)) {
        return;
    }

    $this->_perl_compatible = $setting;
    $this->_max_score       = $setting ? $this->_threshold : 0;
}
423 | ||
/**
 * Whether to use unicode block ranges in detection
 *
 * Should speed up most detections if turned on (default is on). In some
 * circumstances it may be slower, such as for large text samples (> 10K)
 * in languages that use latin scripts. In other cases it should speed up
 * detection noticeably. Non-boolean input is ignored.
 *
 * @access public
 * @param  bool $setting false to turn off
 */
function useUnicodeBlocks($setting = true)
{
    if (!is_bool($setting)) {
        return;
    }

    $this->_use_unicode_narrowing = $setting;
}
441 | ||
/**
 * Converts a piece of text into trigrams
 *
 * Thin wrapper around Text_LanguageDetect_Parser, which supersedes it:
 * trigram analysis on, unicode analysis off, start padding on unless in
 * perl compatible mode.
 *
 * @access private
 * @param  string $text text to convert
 * @return array  trigram frequencies as reported by the parser
 */
function _trigram($text)
{
    $parser = new Text_LanguageDetect_Parser(
        $text,
        $this->_db_filename,
        $this->_unicode_db_filename
    );

    $parser->prepareTrigram();
    $parser->prepareUnicode(false);
    $parser->setPadStart(!$this->_perl_compatible);
    $parser->analyze();

    return $parser->getTrigramFreqs();
}
460 | ||
/**
 * Converts a set of trigrams from frequencies to ranks
 *
 * Sorts $arr in place (highest frequency first, ties broken
 * alphabetically), then numbers the trigrams from 0 and stops once
 * $this->_threshold of them have been ranked.
 *
 * @access protected
 * @param  array &$arr trigram => frequency; re-ordered as a side effect
 * @return array trigram => rank
 */
function _arr_rank(&$arr)
{
    // a plain ksort()/asort() pair was found to introduce errors in
    // testing, hence the dedicated two-criteria sort
    $this->_bub_sort($arr);

    $rank     = array();
    $position = 0;

    foreach (array_keys($arr) as $trigram) {
        $rank[$trigram] = $position;
        $position++;

        // cut off at a standard threshold
        if ($position >= $this->_threshold) {
            break;
        }
    }

    return $rank;
}
494 | ||
/**
 * Sorts an array by value, breaking ties alphabetically by key
 *
 * Meant to match this perl statement:
 * sort { $trigrams{$b} == $trigrams{$a} ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }
 *
 * @access private
 * @param  array &$arr the array to sort, replaced by its sorted copy
 */
function _bub_sort(&$arr)
{
    // usort() only sees values, so pack each key with its value in order
    // to compare on both at once
    $pairs = array();
    foreach ($arr as $trigram => $count) {
        $pairs[] = array($trigram, $count);
    }

    usort($pairs, array($this, '_sort_func'));

    // unpack the pairs into an associative array again
    $sorted = array();
    foreach ($pairs as $pair) {
        $sorted[$pair[0]] = $pair[1];
    }

    $arr = $sorted;
}
527 | ||
/**
 * Comparison callback for usort(), used by _bub_sort()
 *
 * Each argument is an array(key, value) pair. The larger value sorts
 * first; equal values are ordered by strcmp() on the keys.
 *
 * @access private
 * @param  array $a first pair passed by usort()
 * @param  array $b second pair passed by usort()
 * @return int   negative if $a sorts first, positive if $b sorts first
 * @see    _bub_sort()
 */
function _sort_func($a, $b)
{
    // different values: plain descending order
    if ($a[1] != $b[1]) {
        return ($a[1] > $b[1]) ? -1 : 1;
    }

    // same value: the key breaks the tie; keys are unique within one
    // array, so 0 is not expected here
    return strcmp($a[0], $b[0]);
}
560 | ||
/**
 * Calculates a linear rank-order distance statistic between two sets of
 * ranked trigrams
 *
 * For every trigram of $arr2 adds the absolute rank difference to its
 * counterpart in $arr1, or $this->_threshold when $arr1 does not contain
 * it. Distance measure proposed by Cavnar & Trenkle (1994).
 *
 * @access private
 * @param  array &$arr1 the reference set of trigram ranks
 * @param  array &$arr2 the target set of trigram ranks
 * @return int   the sum of the differences between the ranks of
 *               the two trigram sets
 */
function _distance(&$arr1, &$arr2)
{
    $total = 0;

    foreach ($arr2 as $trigram => $rank) {
        // $this->_threshold is the maximum possible distance for any one
        // pair of trigrams, charged when the reference lacks the trigram
        $total += isset($arr1[$trigram])
            ? abs($rank - $arr1[$trigram])
            : $this->_threshold;
    }

    // todo: there are other distance statistics to try, e.g. relative
    // entropy, but they're probably more costly to compute
    return $total;
}
598 | ||
/**
 * Normalizes the score returned by _distance()
 *
 * Perl compatible mode: floor of the average distance per trigram.
 * Otherwise: 1 minus the average distance divided by $this->_threshold.
 *
 * @access private
 * @param  int   $score      the score from _distance()
 * @param  int   $base_count the number of trigrams being considered,
 *                           null means $this->_threshold
 * @return float the normalized score
 * @see    _distance()
 */
function _normalize_score($score, $base_count = null)
{
    if ($base_count === null) {
        $base_count = $this->_threshold;
    }

    if ($this->_perl_compatible) {
        return floor($score / $base_count);
    }

    return 1 - ($score / $base_count / $this->_threshold);
}
622 | ||
623 | ||
/**
 * Detects the closeness of a sample of text to the known languages
 *
 * Ranks the trigrams of the sample, measures the distance to the trigram
 * table of each candidate language, normalizes the result and returns
 * the scores in sorted order, best match first.
 *
 * If perl compatible, the score is 300-0, 0 being most similar.
 * Otherwise, it's 0-1 with 1 being most similar.
 *
 * The $sample text should be at least a few sentences in length and
 * should be ascii-7 or utf8 encoded. If the mbstring extension is present
 * and reports another encoding, a conversion to UTF-8 is attempted.
 * However, experience has shown that mb_detect_encoding() *does not work
 * very well* with at least some types of encoding.
 *
 * @access public
 * @param  string $sample a sample of text to compare.
 * @param  int    $limit  if specified, return an array of the most likely
 *                        $limit languages and their scores.
 * @return array  language name => score in sorted order, or an empty
 *                array if no usable text was found
 * @see    _distance()
 * @throws Exception if the language database is not usable or the
 *                   unicode block detection does not yield an array
 */
function detect($sample, $limit = 0)
{
    if (!$this->_setup_ok($err)) {
        return $err;
    }

    // input check
    if (!Text_LanguageDetect_Parser::validateString($sample)) {
        return array();
    }

    // check char encoding (only if the mbstring extension is available)
    $mb_available = function_exists('mb_detect_encoding')
        && function_exists('mb_convert_encoding');

    if ($mb_available) {
        // mb_detect_encoding isn't very reliable, to say the least;
        // detection should still work with a sufficient sample of ascii
        $encoding = mb_detect_encoding($sample);

        // false means detection failed: don't attempt a conversion then
        $convert = $encoding !== false
            && $encoding != 'ASCII'
            && $encoding != 'UTF-8';

        if ($convert) {
            if (!function_exists('mb_list_encodings')) {
                // php 4 doesnt have mb_list_encodings()
                // so attempt with error suppression
                $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding);

            } elseif (in_array($encoding, mb_list_encodings())) {
                $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
            }
            // otherwise we somehow detected an encoding that is not
            // supported; the sample is used as it is
        }
    }

    $parser = new Text_LanguageDetect_Parser(
        $sample,
        $this->_db_filename,
        $this->_unicode_db_filename
    );
    $parser->prepareTrigram();
    if ($this->_use_unicode_narrowing) {
        $parser->prepareUnicode();
    }
    $parser->setPadStart(!$this->_perl_compatible);
    $parser->analyze();

    $trigram_freqs =& $parser->getTrigramRanks();
    $trigram_count = count($trigram_freqs);

    if ($trigram_count == 0) {
        return array();
    }

    if ($this->_use_unicode_narrowing) {
        // use unicode block detection to narrow down the possibilities
        $blocks =& $parser->getUnicodeBlocks();

        if (!is_array($blocks)) {
            throw new Exception('Error during block detection');
        }

        // union of the languages mapped to any block found in the sample.
        // an intersection would be faster still, but would be thrown off
        // by one unexpected character, like an umlaut in english text
        $candidates = array();
        foreach (array_keys($blocks) as $blockname) {
            if (isset($this->_unicode_map[$blockname])) {
                $candidates = array_merge(
                    $candidates,
                    array_keys($this->_unicode_map[$blockname])
                );
            }
        }

        // intersect with the keys of _lang_db in case languages have
        // been omitted
        $possible_langs = array_intersect(
            array_keys($this->_lang_db),
            array_unique($candidates)
        );

    } else {
        // or just try 'em all
        $possible_langs = array_keys($this->_lang_db);
    }

    $scores = array();
    foreach ($possible_langs as $lang) {
        $raw_distance  = $this->_distance($this->_lang_db[$lang], $trigram_freqs);
        $scores[$lang] = $this->_normalize_score($raw_distance, $trigram_count);
    }

    unset($parser);

    // best match first: lowest score in perl mode, highest otherwise
    if ($this->_perl_compatible) {
        asort($scores);
    } else {
        arsort($scores);
    }

    // todo: drop languages with a score of $this->_max_score?

    if (!($limit && is_numeric($limit))) {
        return $scores;
    }

    // limit the number of returned scores
    $limited_scores = array();
    $taken          = 0;

    foreach ($scores as $lang => $score) {
        if ($taken++ >= $limit) {
            break;
        }
        $limited_scores[$lang] = $score;
    }

    return $limited_scores;
}
789 | ||
/**
 * Returns only the most similar language to the text sample
 *
 * Calls $this->detect() with a limit of 1 and returns the name of the
 * top result with its first letter upper-cased.
 *
 * @access public
 * @param  string $sample text to detect the language of
 * @return string the name of the most likely language
 *                or null if no language is similar
 * @see    detect()
 * @throws Exception if the language database is not usable
 */
function detectSimple($sample)
{
    $scores = $this->detect($sample, 1);

    // a top score equal to $_max_score means nothing really matched, so
    // the "winner" would have been picked at random
    if (is_array($scores)
        && !empty($scores)
        && current($scores) != $this->_max_score) {

        return ucfirst(key($scores));
    }

    return null;
}
818 | ||
/**
 * Returns an array containing the most similar language and a confidence
 * rating
 *
 * Confidence is a simple measure: the gap between the best score and the
 * runner-up (divided by the highest possible score in perl compatible
 * mode). Languages that have closely related cousins (e.g. Norwegian and
 * Danish) should generally have lower confidence scores.
 *
 * The similarity score answers the question "How likely is the text the
 * returned language regardless of the other languages considered?" The
 * confidence score is one way of answering the question "how likely is the
 * text the detected language relative to the rest of the language model
 * set?"
 *
 * To see how similar languages are a priori, see languageSimilarity()
 *
 * @access public
 * @param  string $sample text for which language will be detected
 * @return array  keys 'language', 'similarity' and 'confidence'
 *                (confidence is null when there is no runner-up),
 *                or null if no language is similar
 * @see    detect()
 * @throws Exception if the language database is not usable
 */
function detectConfidence($sample)
{
    $scores = $this->detect($sample, 2);

    // if the most similar language has the max score, it
    // will have been picked at random
    if (!is_array($scores)
        || empty($scores)
        || current($scores) == $this->_max_score) {

        return null;
    }

    $best = current($scores);

    $arr = array(
        'language'   => ucfirst(key($scores)),
        'similarity' => $best,
    );

    // false means there is no second element
    $runner_up = next($scores);

    if ($runner_up === false) {
        $arr['confidence'] = null;

    } elseif ($this->_perl_compatible) {
        // lower is better here, so the runner-up carries the larger score
        $arr['confidence'] = ($runner_up - $best) / $this->_max_score;

    } else {
        // the further apart the two best scores, the higher the value
        $arr['confidence'] = $best - $runner_up;
    }

    return $arr;
}
880 | ||
/**
 * Returns the distribution of unicode blocks in a given utf8 string
 *
 * For the block name of a single char, use unicodeBlockName()
 *
 * @access public
 * @param  string $str          input string. Must be ascii or utf8
 * @param  bool   $skip_symbols if true, skip ascii digits, symbols and
 *                              non-printing characters. Includes spaces,
 *                              newlines and common punctuation characters.
 * @return array  the unicode blocks as reported by the parser
 * @throws Exception if $skip_symbols is not a bool or $str is not a string
 */
function detectUnicodeBlocks($str, $skip_symbols)
{
    // input check, flag first
    if (!is_bool($skip_symbols)) {
        throw new Exception('Second parameter must be boolean');
    }

    if (!is_string($str)) {
        throw new Exception('First parameter was not a string');
    }

    // unicode analysis only, trigram analysis switched off
    $parser = new Text_LanguageDetect_Parser(
        $str,
        $this->_db_filename,
        $this->_unicode_db_filename
    );
    $parser->prepareUnicode();
    $parser->prepareTrigram(false);
    $parser->setUnicodeSkipSymbols($skip_symbols);
    $parser->analyze();

    $blocks =& $parser->getUnicodeBlocks();
    unset($parser);

    return $blocks;
}
914 | ||
/**
 * Returns the block name for a given unicode value
 *
 * A string argument is taken to be one UTF8-formatted character and is
 * converted to its unicode value first; an int is taken to be the unicode
 * value itself.
 *
 * Make sure input is of the correct type!
 *
 * @access public
 * @param  mixed $unicode unicode value or utf8 char
 * @return mixed the block name string or false if not found
 * @throws Exception if the string holds more than one char or is
 *                   malformatted, or if the input is neither string nor int
 */
function unicodeBlockName($unicode)
{
    if (is_string($unicode)) {
        // assume it is being passed a utf8 char, so convert it

        // input check
        if ($this->utf8strlen($unicode) > 1) {
            throw new Exception('Pass this function only a single char');
        }

        $unicode = $this->_utf8char2unicode($unicode);

        // -1 is the converter's failure marker
        if ($unicode == -1) {
            throw new Exception('Malformatted char');
        }

    } elseif (!is_int($unicode)) {
        // input check
        throw new Exception('Input must be of type string or int.');
    }

    $blocks =& $this->_read_unicode_block_db();

    $result = $this->_unicode_block_name($unicode, $blocks);

    // a block entry carries its name at index 2; -1 means "no block"
    return ($result == -1) ? false : $result[2];
}
959 | ||
960 | /** | |
961 | * Searches the unicode block database | |
962 | * | |
963 | * Returns the block name for a given unicode value. unicodeBlockName() is | |
964 | * the public interface for this function, which does input checks which | |
965 | * this function omits for speed. | |
966 | * | |
967 | * @access protected | |
968 | * @param int $unicode the unicode value | |
969 | * @param array &$blocks the block database | |
970 | * @param int $block_count the number of defined blocks in the database | |
971 | * @see unicodeBlockName() | |
972 | */ | |
function _unicode_block_name($unicode, &$blocks, $block_count = -1) {
    // Looks up the block entry whose [0]..[1] range contains $unicode.
    // Returns the matching entry from $blocks, or -1 when none matches.
    //
    // for a reference, see
    // http://www.unicode.org/Public/UNIDATA/Blocks.txt

    // nothing can be found in an empty database; bail out before touching
    // $blocks[0], which would not exist
    if (empty($blocks)) {
        return -1;
    }

    // assume that ascii characters are the most common
    // so try the first block before searching; the lower bound is checked
    // too so that negative values are not reported as belonging to it
    if ($unicode >= $blocks[0][0] && $unicode <= $blocks[0][1]) {
        return $blocks[0];
    }

    // the optional $block_count param is for efficiency
    // so this function doesn't have to run count() every time
    if ($block_count != -1) {
        $high = $block_count - 1;
    } else {
        $high = count($blocks) - 1;
    }

    $low = 1; // start with 1 because the first block was handled above

    // binary search over the remaining blocks
    while ($low <= $high) {
        // integer midpoint, so array offsets are never floats
        $mid = ($low + $high) >> 1;

        if ($unicode < $blocks[$mid][0]) {
            // below this block's lower bound
            $high = $mid - 1;
        } elseif ($unicode > $blocks[$mid][1]) {
            // above this block's upper bound
            $low = $mid + 1;
        } else {
            // inside the block
            return $blocks[$mid];
        }
    }

    // failed to find the block
    // todo: differentiate when it's out of range or when it falls
    // into an unassigned range?
    return -1;
}
1017 | ||
1018 | /** | |
1019 | * Brings up the unicode block database | |
1020 | * | |
1021 | * @access protected | |
1022 | * @return array the database of unicode block definitions | |
1023 | * @throws PEAR_Error | |
1024 | */ | |
function &_read_unicode_block_db() {
    // The block definitions are kept in a function-level static, so they
    // are read through _readdb() only on the first call and the same data
    // is handed back (by reference) on every later call.
    static $block_db = null;

    if ($block_db === null) {
        $block_db = $this->_readdb($this->_unicode_db_filename);
    }

    return $block_db;
}
1037 | ||
1038 | /** | |
1039 | * Calculate the similarities between the language models | |
1040 | * | |
1041 | * Use this function to see how similar languages are to each other. | |
1042 | * | |
1043 | * If passed 2 language names, will return just those languages compared. | |
1044 | * If passed 1 language name, will return that language compared to | |
1045 | * all others. | |
1046 | * If passed none, will return an array of every language model compared | |
1047 | * to every other one. | |
1048 | * | |
1049 | * @access public | |
1050 | * @param string $lang1 the name of the first language to be compared | |
1051 | * @param string $lang2 the name of the second language to be compared | |
1052 | * @return array scores of every language compared | |
1053 | * or the score of just the provided languages | |
1054 | * or null if one of the supplied languages does not exist | |
1055 | * @throws PEAR_Error | |
1056 | */ | |
function languageSimilarity($lang1 = null, $lang2 = null)
{
    // Three modes, selected by which arguments are given:
    //   - none:   every language model compared to every other one
    //   - $lang1: that language compared to all others (sorted by score)
    //   - both:   the single score for that pair
    // Returns null when a named language has no model, or when only
    // $lang2 is supplied.
    if (!$this->_setup_ok($err)) {
        return $err;
    }

    if ($lang1 == null) {
        // can't only set the second param
        if ($lang2 != null) {
            return null;
        }

        // compare all languages to each other
        $names      = array_keys($this->_lang_db);
        $return_arr = array();

        foreach ($names as $name1) {
            foreach ($names as $name2) {
                // skip comparing languages to themselves
                if ($name1 == $name2) {
                    continue;
                }

                if (isset($return_arr[$name2][$name1])) {
                    // don't re-calculate what's already been done:
                    // reuse the score stored for the mirrored pair
                    $return_arr[$name1][$name2] = $return_arr[$name2][$name1];
                } else {
                    $return_arr[$name1][$name2] = $this->_normalize_score(
                        $this->_distance(
                            $this->_lang_db[$name1],
                            $this->_lang_db[$name2]
                        )
                    );
                }
            }
        }

        return $return_arr;
    }

    $lang1 = strtolower($lang1);

    // check if language model exists
    if (!isset($this->_lang_db[$lang1])) {
        return null;
    }

    if ($lang2 != null) {
        // lower-case BEFORE the lookup, exactly as is done for $lang1;
        // otherwise a mixed-case name is wrongly reported as unknown
        $lang2 = strtolower($lang2);

        // check if language model exists
        if (!isset($this->_lang_db[$lang2])) {
            return null;
        }

        // compare just these two languages
        return $this->_normalize_score(
            $this->_distance(
                $this->_lang_db[$lang1],
                $this->_lang_db[$lang2]
            )
        );
    }

    // compare just $lang1 to all languages
    $return_arr = array();
    foreach ($this->_lang_db as $key => $value) {
        if ($key != $lang1) { // don't compare a language to itself
            $return_arr[$key] = $this->_normalize_score(
                $this->_distance($this->_lang_db[$lang1], $value)
            );
        }
    }
    asort($return_arr);

    return $return_arr;
}
1140 | ||
1141 | /** | |
1142 | * Cluster known languages according to languageSimilarity() | |
1143 | * | |
1144 | * WARNING: this method is EXPERIMENTAL. It is not recommended for common | |
1145 | * use, and it may disappear or its functionality may change in future | |
1146 | * releases without notice. | |
1147 | * | |
1148 | * Uses a nearest neighbor technique to generate the maximum possible | |
1149 | * number of dendograms from the similarity data. | |
1150 | * | |
1151 | * @access public | |
1152 | * @return array language cluster data | |
1153 | * @throws Exception | |
1154 | * @see languageSimilarity() | |
1155 | * @deprecated this function will eventually be removed and placed into | |
1156 | * the model generation class | |
1157 | */ | |
function clusterLanguages()
{
    // Repeatedly merges the two most similar open forks (languages or
    // earlier merges) into a new fork named "<bestfit>:<otherfit>" and
    // records each merge. The result is cached in $this->_clusters.

    // todo: set the maximum number of clusters

    // setup check
    if (!$this->_setup_ok($err)) {
        return $err;
    }

    // return cached result, if any
    if (isset($this->_clusters)) {
        return $this->_clusters;
    }

    $langs = array_keys($this->_lang_db);

    // pairwise scores, indexed as $arr[$a][$b]
    $arr = $this->languageSimilarity();

    sort($langs);

    foreach ($langs as $lang) {
        if (!isset($this->_lang_db[$lang])) {
            throw new Exception("missing $lang!\n");
        }
    }

    // http://www.psychstat.missouristate.edu/multibook/mlt04m.html

    // re-key the sorted list so that each fork is indexed by its own name
    $open_forks = array();
    foreach ($langs as $lang) {
        $open_forks[$lang] = $lang;
    }
    $langs = $open_forks;

    // initialised up front so the return value is well-defined even when
    // the merge loop never runs (two or fewer languages)
    $really_map  = array();
    $result_data = array();

    $i = 0;
    while (count($langs) > 2 && $i++ < 200) {
        // find the pair of open forks with the highest score
        $highest_score = -1;
        $highest_key1  = '';
        $highest_key2  = '';
        foreach ($langs as $lang1) {
            foreach ($langs as $lang2) {
                if ($lang1 != $lang2
                    && $arr[$lang1][$lang2] > $highest_score) {
                    $highest_score = $arr[$lang1][$lang2];
                    $highest_key1  = $lang1;
                    $highest_key2  = $lang2;
                }
            }
        }

        if (!$highest_key1) {
            // should not ever happen
            throw new Exception("no highest key? (step: $i)");
        }

        if ($highest_score == 0) {
            // languages are perfectly dissimilar
            break;
        }

        // $highest_key1 and $highest_key2 are most similar
        $sum1 = array_sum($arr[$highest_key1]);
        $sum2 = array_sum($arr[$highest_key2]);

        // use the score for the one that is most similar to the rest of
        // the field as the score for the group
        // todo: could try averaging or "centroid" method instead
        // seems like that might make more sense
        // actually nearest neighbor may be better for binary searching

        // for "Complete Linkage"/"furthest neighbor"
        // sign should be <
        // for "Single Linkage"/"nearest neighbor" method
        // sign should be >
        // results seem to be pretty much the same with either method

        // figure out which to delete and which to replace
        if ($sum1 > $sum2) {
            $replaceme = $highest_key1;
            $deleteme  = $highest_key2;
        } else {
            $replaceme = $highest_key2;
            $deleteme  = $highest_key1;
        }

        $newkey = $replaceme . ':' . $deleteme;

        // $replaceme is most similar to remaining languages
        // replace $replaceme with '$newkey', deleting $deleteme

        // keep a record of which fork is really which language:
        // follow earlier merges back to the original language name
        $really_lang = $replaceme;
        while (isset($really_map[$really_lang])) {
            $really_lang = $really_map[$really_lang];
        }
        $really_map[$newkey] = $really_lang;

        // replace the best fitting key, delete the other
        // NOTE(review): the three checks below run in this exact order for
        // every cell; an entry created for the ($deleteme, $newkey) pair by
        // an earlier check is not removed by the later ones. Left as-is so
        // that clustering results stay unchanged.
        foreach ($arr as $key1 => $arr2) {
            foreach ($arr2 as $key2 => $value2) {
                if ($key2 == $replaceme) {
                    // replacing $arr[$key1][$key2] with $arr[$key1][$newkey]
                    $arr[$key1][$newkey] = $arr[$key1][$key2];
                    unset($arr[$key1][$key2]);
                }

                if ($key1 == $replaceme) {
                    // replacing $arr[$key1][$key2] with $arr[$newkey][$key2]
                    $arr[$newkey][$key2] = $arr[$key1][$key2];
                    unset($arr[$key1][$key2]);
                }

                if ($key1 == $deleteme || $key2 == $deleteme) {
                    // deleting $arr[$key1][$key2]
                    unset($arr[$key1][$key2]);
                }
            }
        }

        unset($langs[$highest_key1]);
        unset($langs[$highest_key2]);
        $langs[$newkey] = $newkey;

        // some of these may be overkill
        $result_data[$newkey] = array(
            'newkey'   => $newkey,
            'count'    => $i,
            'diff'     => abs($sum1 - $sum2),
            'score'    => $highest_score,
            'bestfit'  => $replaceme,
            'otherfit' => $deleteme,
            'really'   => $really_lang,
        );
    }

    $return_val = array(
        // the top level of clusters
        // clusters that are mutually exclusive
        // or specified by a specific maximum
        'open_forks' => $langs,

        // data for each split
        'fork_data' => $result_data,

        // which cluster is really which language
        // using the nearest neighbor technique, the cluster
        // inherits all of the properties of its most-similar member
        // this keeps track
        'name_map' => $really_map,
    );

    // saves the result in the object
    $this->_clusters = $return_val;

    return $return_val;
}
1317 | ||
1318 | ||
1319 | /** | |
1320 | * Perform an intelligent detection based on clusterLanguages() | |
1321 | * | |
1322 | * WARNING: this method is EXPERIMENTAL. It is not recommended for common | |
1323 | * use, and it may disappear or its functionality may change in future | |
1324 | * releases without notice. | |
1325 | * | |
1326 | * This compares the sample text to the top level of clusters. If the | |
1327 | * sample is similar to the cluster it will drop down and compare it to the | |
1328 | * languages in the cluster, and so on until it hits a leaf node. | |
1329 | * | |
1330 | * this should find the language in considerably fewer compares | |
1331 | * (the equivalent of a binary search), however clusterLanguages() is costly | |
1332 | * and the loss of accuracy from this technique is significant. | |
1333 | * | |
1334 | * This method may need to be 'fuzzier' in order to become more accurate. | |
1335 | * | |
1336 | * This function could be more useful if the universe of possible languages | |
1337 | * was very large, however in such cases some method of Bayesian inference | |
1338 | * might be more helpful. | |
1339 | * | |
1340 | * @see clusterLanguages() | |
1341 | * @access public | |
1342 | * @param string $str input string | |
1343 | * @return array language scores (only those compared) | |
1344 | * @throws PEAR_Error | |
1345 | */ | |
function clusteredSearch($str)
{
    // Scores the sample against the open forks reported by
    // clusterLanguages(), then walks down from the best-scoring fork,
    // scoring both children at every split, until a fork with no recorded
    // split is reached. Returns the scores of everything it compared.

    // input check
    if (!Text_LanguageDetect_Parser::validateString($str)) {
        return array();
    }

    // clusterLanguages() will return a cached result if possible
    // so it's safe to call it every time
    $result = $this->clusterLanguages();

    $dendogram_start = $result['open_forks'];
    $dendogram_data  = $result['fork_data'];
    $dendogram_alias = $result['name_map'];

    $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename);
    $sample_obj->prepareTrigram();
    $sample_obj->setPadStart(!$this->_perl_compatible);
    $sample_obj->analyze();
    $sample_result = $sample_obj->getTrigramRanks();
    $sample_count  = count($sample_result);

    // input check
    if ($sample_count == 0) {
        return array();
    }

    // initialised so the sort below always receives an array, even when
    // there are no open forks to score
    $scores = array();

    foreach ($dendogram_start as $lang) {
        // a merged fork is scored with the model of the language it
        // stands for
        if (isset($dendogram_alias[$lang])) {
            $lang_key = $dendogram_alias[$lang];
        } else {
            $lang_key = $lang;
        }

        $scores[$lang] = $this->_normalize_score(
            $this->_distance($this->_lang_db[$lang_key], $sample_result),
            $sample_count
        );
    }

    if ($this->_perl_compatible) {
        asort($scores);
    } else {
        arsort($scores);
    }

    // of starting forks, the first key after sorting is the most similar
    // to the sample
    $cur_key = key($scores);

    while (isset($dendogram_data[$cur_key])) {
        $lang1 = $dendogram_data[$cur_key]['bestfit'];
        $lang2 = $dendogram_data[$cur_key]['otherfit'];

        foreach (array($lang1, $lang2) as $lang) {
            if (isset($dendogram_alias[$lang])) {
                $lang_key = $dendogram_alias[$lang];
            } else {
                $lang_key = $lang;
            }

            $scores[$lang] = $this->_normalize_score(
                $this->_distance($this->_lang_db[$lang_key], $sample_result),
                $sample_count
            );

            //todo: does not need to do same comparison again
        }

        // NOTE(review): the descent always follows the numerically higher
        // score, while the initial ordering above treats the lowest score
        // as best in perl-compatible mode - confirm this is intended.
        if ($scores[$lang1] > $scores[$lang2]) {
            $cur_key = $lang1;
        } else {
            $cur_key = $lang2;
        }
    }

    // rather than sorting the result, preserve it so that you can see
    // which paths the algorithm decided to take along the tree

    // but sometimes the last item is only the second highest
    // (end() moves the array pointer to the last element and prev() steps
    // back one, so current()/key() below refer to the second-to-last item)
    if (   ($this->_perl_compatible && (end($scores) > prev($scores)))
        || (!$this->_perl_compatible && (end($scores) < prev($scores)))) {

        $real_last_score = current($scores);
        $real_last_key   = key($scores);

        // swaps the 2nd-to-last item for the last item
        unset($scores[$real_last_key]);
        $scores[$real_last_key] = $real_last_score;
    }

    if (!$this->_perl_compatible) {
        $scores = array_reverse($scores, true);
        // second param requires php > 4.0.3
    }

    return $scores;
}
1461 | ||
1462 | /** | |
1463 | * utf8-safe strlen() | |
1464 | * | |
1465 | * Returns the numbers of characters (not bytes) in a utf8 string | |
1466 | * | |
1467 | * @static | |
1468 | * @access public | |
1469 | * @param string $str string to get the length of | |
1470 | * @return int number of chars | |
1471 | */ | |
function utf8strlen($str)
{
    // Returns the number of characters (not bytes) in a UTF-8 string.
    //
    // utf8_decode(), which this method used to rely on, is deprecated as
    // of PHP 8.2, so it is no longer called here. Counts for malformed
    // UTF-8 may differ from the old utf8_decode()-based result.
    if (function_exists('mb_strlen')) {
        return mb_strlen($str, 'UTF-8');
    }

    // Fallback without mbstring: each character contributes exactly one
    // byte that is not a continuation byte (10xxxxxx), so subtract the
    // continuation bytes from the byte length.
    return strlen($str) - preg_match_all('/[\x80-\xBF]/', $str, $continuation_bytes);
}
1481 | ||
1482 | /** | |
1483 | * Returns the unicode value of a utf8 char | |
1484 | * | |
1485 | * @access protected | |
1486 | * @param string $char a utf8 (possibly multi-byte) char | |
1487 | * @return int unicode value or -1 if malformatted | |
1488 | */ | |
function _utf8char2unicode($char) {
    // Decodes one UTF-8 encoded character into its numeric code point,
    // choosing the layout purely from the byte length of $char.
    // Returns -1 when the length is not 1-4 bytes.
    //
    // Bytes are read with square-bracket offsets: the curly-brace form
    // ($char{0}) is deprecated in PHP 7.4 and a parse error in PHP 8.

    // strlen() here will actually get the binary length of a single char
    switch (strlen($char)) {

        // for a reference, see http://en.wikipedia.org/wiki/UTF-8

        case 1:
            // normal ASCII-7 byte
            // 0xxxxxxx --> 0xxxxxxx
            return ord($char[0]);

        case 2:
            // 2 byte unicode
            // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
            $z = (ord($char[0]) & 0x1F) << 6;
            $x = (ord($char[1]) & 0x3F);

            return ($z | $x);

        case 3:
            // 3 byte unicode
            // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
            $z  = (ord($char[0]) & 0x0F) << 12;
            $x1 = (ord($char[1]) & 0x3F) << 6;
            $x2 = (ord($char[2]) & 0x3F);

            return ($z | $x1 | $x2);

        case 4:
            // 4 byte unicode
            // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
            // 000zzzzz xxxxxxxx xxxxxxxx
            $z1 = (ord($char[0]) & 0x07) << 18;
            $z2 = (ord($char[1]) & 0x3F) << 12;
            $x1 = (ord($char[2]) & 0x3F) << 6;
            $x2 = (ord($char[3]) & 0x3F);

            return ($z1 | $z2 | $x1 | $x2);

        default:
            // error: malformatted char?
            return -1;
    }
}
1534 | ||
1535 | /** | |
1536 | * utf8-safe fast character iterator | |
1537 | * | |
1538 | * Will get the next character starting from $counter, which will then be | |
1539 | * incremented. If a multi-byte char the bytes will be concatenated and | |
1540 | * $counter will be incremented by the number of bytes in the char. | |
1541 | * | |
1542 | * @access private | |
1543 | * @param string &$str the string being iterated over | |
1544 | * @param int &$counter the iterator, will increment by reference | |
1545 | * @param bool $special_convert whether to do special conversions | |
1546 | * @return char the next (possibly multi-byte) char from $counter | |
1547 | */ | |
function _next_char(&$str, &$counter, $special_convert = false)
{
    // Reads the character that starts at byte offset $counter and advances
    // $counter past it. The lead byte decides how many further bytes are
    // part of the character. Nothing (null) is returned for a byte that is
    // not a valid lead byte.
    //
    // Bytes are read with square-bracket offsets / substr(): the
    // curly-brace form ($str{$i}) is deprecated in PHP 7.4 and a parse
    // error in PHP 8.

    $char = $str[$counter++];
    $ord  = ord($char);

    // for a description of the utf8 system see
    // http://www.phpclasses.org/browse/file/5131.html

    // normal ascii one byte char
    if ($ord <= 127) {

        // special conversions needed for this package
        // (that only apply to regular ascii characters):
        // lower-case A-Z, and turn everything that is not a-z, a space
        // or "'" (digits included) into a space
        if ($special_convert && $char != ' ' && $char != "'") {
            if ($ord >= 65 && $ord <= 90) { // A-Z
                $char = chr($ord + 32); // lower case
            } elseif ($ord < 97 || $ord > 122) { // NOT a-z
                $char = ' '; // convert to space
            }
        }

        return $char;
    }

    // two-byte char
    if (($ord >> 5) == 6) {
        // get next byte; substr() yields an empty string instead of a
        // warning when the input ends in the middle of the character
        $nextchar = (string) substr($str, $counter++, 1);

        // lower-casing of non-ascii characters is still incomplete

        if ($special_convert) {
            if ($ord == 195) {
                // lower case latin accented characters
                $nextord     = ord($nextchar);
                $nextord_adj = $nextord + 64;
                // for a reference, see
                // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html

                // À - Þ but not ×
                if (   $nextord_adj >= 192
                    && $nextord_adj <= 222
                    && $nextord_adj != 215) {

                    $nextchar = chr($nextord + 32);
                }

            } elseif ($ord == 208) {
                // lower case cyrillic alphabet
                $nextord = ord($nextchar);

                if ($nextord >= 144 && $nextord <= 159) {
                    // A - Pe: lower case stays under the same lead byte
                    $nextchar = chr($nextord + 32);

                } elseif ($nextord >= 160 && $nextord <= 175) {
                    // Er - Ya: lower case moves to the next lead byte
                    $char     = chr(209); // == $ord + 1
                    $nextchar = chr($nextord - 32);
                }
            }
        }

        // tag on next byte
        return $char . $nextchar;
    }

    // three-byte char
    if (($ord >> 4) == 14) {
        // tag on next 2 bytes (or whatever is left of them)
        $tail = (string) substr($str, $counter, 2);
        $counter += 2;

        return $char . $tail;
    }

    // four-byte char
    if (($ord >> 3) == 30) {
        // tag on next 3 bytes (or whatever is left of them)
        $tail = (string) substr($str, $counter, 3);
        $counter += 3;

        return $char . $tail;
    }

    // error? not a valid lead byte - falls through and returns nothing
}
1630 | ||
1631 | } | |
1632 | ||
1633 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ | |
1634 | ||
1635 | ?> |