diff options
Diffstat (limited to 'inc/3rdparty/libraries/language-detect/LanguageDetect.php')
-rw-r--r-- | inc/3rdparty/libraries/language-detect/LanguageDetect.php | 1693 |
1 files changed, 0 insertions, 1693 deletions
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect.php b/inc/3rdparty/libraries/language-detect/LanguageDetect.php deleted file mode 100644 index 382d869c..00000000 --- a/inc/3rdparty/libraries/language-detect/LanguageDetect.php +++ /dev/null | |||
@@ -1,1693 +0,0 @@ | |||
1 | <?php | ||
2 | |||
3 | /** | ||
4 | * Detects the language of a given piece of text. | ||
5 | * | ||
6 | * Attempts to detect the language of a sample of text by correlating ranked | ||
7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. | ||
8 | * | ||
9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle | ||
10 | * (1994): "N-Gram-Based Text Categorization" | ||
11 | * | ||
12 | * PHP version 5 | ||
13 | * | ||
14 | * @category Text | ||
15 | * @package Text_LanguageDetect | ||
16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | ||
17 | * @copyright 2005-2006 Nicholas Pisarro | ||
18 | * @license http://www.debian.org/misc/bsd.license BSD | ||
19 | * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $ | ||
20 | * @link http://pear.php.net/package/Text_LanguageDetect/ | ||
21 | * @link http://langdetect.blogspot.com/ | ||
22 | */ | ||
23 | |||
24 | require_once 'LanguageDetect/Exception.php'; | ||
25 | require_once 'LanguageDetect/Parser.php'; | ||
26 | require_once 'LanguageDetect/ISO639.php'; | ||
27 | |||
28 | /** | ||
29 | * Language detection class | ||
30 | * | ||
31 | * Requires the language model database (lang.dat) that should have | ||
32 | * accompanied this class definition in order to be instantiated. | ||
33 | * | ||
34 | * Example usage: | ||
35 | * | ||
36 | * <code> | ||
37 | * require_once 'Text/LanguageDetect.php'; | ||
38 | * | ||
39 | * $l = new Text_LanguageDetect; | ||
40 | * | ||
41 | * $stdin = fopen('php://stdin', 'r'); | ||
42 | * | ||
43 | * echo "Supported languages:\n"; | ||
44 | * | ||
45 | * try { | ||
46 | * $langs = $l->getLanguages(); | ||
47 | * } catch (Text_LanguageDetect_Exception $e) { | ||
48 | * die($e->getMessage()); | ||
49 | * } | ||
50 | * | ||
51 | * sort($langs); | ||
52 | * echo join(', ', $langs); | ||
53 | * | ||
54 | * while ($line = fgets($stdin)) { | ||
55 | * print_r($l->detect($line, 4)); | ||
56 | * } | ||
57 | * </code> | ||
58 | * | ||
59 | * @category Text | ||
60 | * @package Text_LanguageDetect | ||
61 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | ||
62 | * @copyright 2005 Nicholas Pisarro | ||
63 | * @license http://www.debian.org/misc/bsd.license BSD | ||
64 | * @version Release: @package_version@ | ||
65 | * @link http://pear.php.net/package/Text_LanguageDetect/ | ||
66 | * @todo allow users to generate their own language models | ||
67 | */ | ||
68 | class Text_LanguageDetect | ||
69 | { | ||
70 | /** | ||
71 | * The filename that stores the trigram data for the detector | ||
72 | * | ||
73 | * If this value starts with a slash (/) or a dot (.) the value of | ||
74 | * $this->_data_dir will be ignored | ||
75 | * | ||
76 | * @var string | ||
77 | * @access private | ||
78 | */ | ||
79 | var $_db_filename = 'lang.dat'; | ||
80 | |||
81 | /** | ||
82 | * The filename that stores the unicode block definitions | ||
83 | * | ||
84 | * If this value starts with a slash (/) or a dot (.) the value of | ||
85 | * $this->_data_dir will be ignored | ||
86 | * | ||
87 | * @var string | ||
88 | * @access private | ||
89 | */ | ||
90 | var $_unicode_db_filename = 'unicode_blocks.dat'; | ||
91 | |||
92 | /** | ||
93 | * The data directory | ||
94 | * | ||
95 | * Should be set by PEAR installer | ||
96 | * | ||
97 | * @var string | ||
98 | * @access private | ||
99 | */ | ||
100 | var $_data_dir = '@data_dir@'; | ||
101 | |||
102 | /** | ||
103 | * The trigram data for comparison | ||
104 | * | ||
105 | * Will be loaded on start from $this->_db_filename | ||
106 | * | ||
107 | * @var array | ||
108 | * @access private | ||
109 | */ | ||
110 | var $_lang_db = array(); | ||
111 | |||
112 | /** | ||
113 | * stores the map of the trigram data to unicode characters | ||
114 | * | ||
115 | * @access private | ||
116 | * @var array | ||
117 | */ | ||
118 | var $_unicode_map; | ||
119 | |||
120 | /** | ||
121 | * The size of the trigram data arrays | ||
122 | * | ||
123 | * @var int | ||
124 | * @access private | ||
125 | */ | ||
126 | var $_threshold = 300; | ||
127 | |||
128 | /** | ||
129 | * the maximum possible score. | ||
130 | * | ||
131 | * needed for score normalization. Different depending on the | ||
132 | * perl compatibility setting | ||
133 | * | ||
134 | * @access private | ||
135 | * @var int | ||
136 | * @see setPerlCompatible() | ||
137 | */ | ||
138 | var $_max_score = 0; | ||
139 | |||
140 | /** | ||
141 | * Whether or not to simulate perl's Language::Guess exactly | ||
142 | * | ||
143 | * @access private | ||
144 | * @var bool | ||
145 | * @see setPerlCompatible() | ||
146 | */ | ||
147 | var $_perl_compatible = false; | ||
148 | |||
149 | /** | ||
150 | * Whether to use the unicode block detection to speed up processing | ||
151 | * | ||
152 | * @access private | ||
153 | * @var bool | ||
154 | */ | ||
155 | var $_use_unicode_narrowing = true; | ||
156 | |||
157 | /** | ||
158 | * stores the result of the clustering operation | ||
159 | * | ||
160 | * @access private | ||
161 | * @var array | ||
162 | * @see clusterLanguages() | ||
163 | */ | ||
164 | var $_clusters; | ||
165 | |||
166 | /** | ||
167 | * Which type of "language names" are accepted and returned: | ||
168 | * | ||
169 | * 0 - language name ("english") | ||
170 | * 2 - 2-letter ISO 639-1 code ("en") | ||
171 | * 3 - 3-letter ISO 639-2 code ("eng") | ||
172 | */ | ||
173 | var $_name_mode = 0; | ||
174 | |||
175 | /** | ||
176 | * Constructor | ||
177 | * | ||
178 | * Will attempt to load the language database. If it fails, you will get | ||
179 | * an exception. | ||
180 | */ | ||
181 | function __construct() | ||
182 | { | ||
183 | $data = $this->_readdb($this->_db_filename); | ||
184 | $this->_checkTrigram($data['trigram']); | ||
185 | $this->_lang_db = $data['trigram']; | ||
186 | |||
187 | if (isset($data['trigram-unicodemap'])) { | ||
188 | $this->_unicode_map = $data['trigram-unicodemap']; | ||
189 | } | ||
190 | |||
191 | // Not yet implemented: | ||
192 | if (isset($data['trigram-clusters'])) { | ||
193 | $this->_clusters = $data['trigram-clusters']; | ||
194 | } | ||
195 | } | ||
196 | |||
197 | /** | ||
198 | * Returns the path to the location of the database | ||
199 | * | ||
200 | * @param string $fname File name to load | ||
201 | * | ||
202 | * @return string expected path to the language model database | ||
203 | * @access private | ||
204 | */ | ||
205 | function _get_data_loc($fname) | ||
206 | { | ||
207 | return dirname(__FILE__).'/'.$fname; | ||
208 | } | ||
209 | |||
210 | /** | ||
211 | * Loads the language trigram database from filename | ||
212 | * | ||
213 | * Trigram database should be a serialize()'d array | ||
214 | * | ||
215 | * @param string $fname the filename where the data is stored | ||
216 | * | ||
217 | * @return array the language model data | ||
218 | * @throws Text_LanguageDetect_Exception | ||
219 | * @access private | ||
220 | */ | ||
221 | function _readdb($fname) | ||
222 | { | ||
223 | // finds the correct data dir | ||
224 | $fname = $this->_get_data_loc($fname); | ||
225 | |||
226 | // input check | ||
227 | if (!file_exists($fname)) { | ||
228 | throw new Text_LanguageDetect_Exception( | ||
229 | 'Language database does not exist: ' . $fname, | ||
230 | Text_LanguageDetect_Exception::DB_NOT_FOUND | ||
231 | ); | ||
232 | } elseif (!is_readable($fname)) { | ||
233 | throw new Text_LanguageDetect_Exception( | ||
234 | 'Language database is not readable: ' . $fname, | ||
235 | Text_LanguageDetect_Exception::DB_NOT_READABLE | ||
236 | ); | ||
237 | } | ||
238 | |||
239 | return unserialize(file_get_contents($fname)); | ||
240 | } | ||
241 | |||
242 | |||
243 | /** | ||
244 | * Checks if this object is ready to detect languages | ||
245 | * | ||
246 | * @param array $trigram Trigram data from database | ||
247 | * | ||
248 | * @return void | ||
249 | * @access private | ||
250 | */ | ||
251 | function _checkTrigram($trigram) | ||
252 | { | ||
253 | if (!is_array($trigram)) { | ||
254 | if (ini_get('magic_quotes_runtime')) { | ||
255 | throw new Text_LanguageDetect_Exception( | ||
256 | 'Error loading database. Try turning magic_quotes_runtime off.', | ||
257 | Text_LanguageDetect_Exception::MAGIC_QUOTES | ||
258 | ); | ||
259 | } | ||
260 | throw new Text_LanguageDetect_Exception( | ||
261 | 'Language database is not an array.', | ||
262 | Text_LanguageDetect_Exception::DB_NOT_ARRAY | ||
263 | ); | ||
264 | } elseif (empty($trigram)) { | ||
265 | throw new Text_LanguageDetect_Exception( | ||
266 | 'Language database has no elements.', | ||
267 | Text_LanguageDetect_Exception::DB_EMPTY | ||
268 | ); | ||
269 | } | ||
270 | } | ||
271 | |||
272 | /** | ||
273 | * Omits languages | ||
274 | * | ||
275 | * Pass this function the name of or an array of names of | ||
276 | * languages that you don't want considered | ||
277 | * | ||
278 | * If you're only expecting a limited set of languages, this can greatly | ||
279 | * speed up processing | ||
280 | * | ||
281 | * @param mixed $omit_list language name or array of names to omit | ||
282 | * @param bool $include_only if true will include (rather than | ||
283 | * exclude) only those in the list | ||
284 | * | ||
285 | * @return int number of languages successfully deleted | ||
286 | * @throws Text_LanguageDetect_Exception | ||
287 | */ | ||
288 | public function omitLanguages($omit_list, $include_only = false) | ||
289 | { | ||
290 | $deleted = 0; | ||
291 | |||
292 | $omit_list = $this->_convertFromNameMode($omit_list); | ||
293 | |||
294 | if (!$include_only) { | ||
295 | // deleting the given languages | ||
296 | if (!is_array($omit_list)) { | ||
297 | $omit_list = strtolower($omit_list); // case desensitize | ||
298 | if (isset($this->_lang_db[$omit_list])) { | ||
299 | unset($this->_lang_db[$omit_list]); | ||
300 | $deleted++; | ||
301 | } | ||
302 | } else { | ||
303 | foreach ($omit_list as $omit_lang) { | ||
304 | if (isset($this->_lang_db[$omit_lang])) { | ||
305 | unset($this->_lang_db[$omit_lang]); | ||
306 | $deleted++; | ||
307 | } | ||
308 | } | ||
309 | } | ||
310 | |||
311 | } else { | ||
312 | // deleting all except the given languages | ||
313 | if (!is_array($omit_list)) { | ||
314 | $omit_list = array($omit_list); | ||
315 | } | ||
316 | |||
317 | // case desensitize | ||
318 | foreach ($omit_list as $key => $omit_lang) { | ||
319 | $omit_list[$key] = strtolower($omit_lang); | ||
320 | } | ||
321 | |||
322 | foreach (array_keys($this->_lang_db) as $lang) { | ||
323 | if (!in_array($lang, $omit_list)) { | ||
324 | unset($this->_lang_db[$lang]); | ||
325 | $deleted++; | ||
326 | } | ||
327 | } | ||
328 | } | ||
329 | |||
330 | // reset the cluster cache if the number of languages changes | ||
331 | // this will then have to be recalculated | ||
332 | if (isset($this->_clusters) && $deleted > 0) { | ||
333 | $this->_clusters = null; | ||
334 | } | ||
335 | |||
336 | return $deleted; | ||
337 | } | ||
338 | |||
339 | |||
340 | /** | ||
341 | * Returns the number of languages that this object can detect | ||
342 | * | ||
343 | * @access public | ||
344 | * @return int the number of languages | ||
345 | * @throws Text_LanguageDetect_Exception | ||
346 | */ | ||
347 | function getLanguageCount() | ||
348 | { | ||
349 | return count($this->_lang_db); | ||
350 | } | ||
351 | |||
352 | /** | ||
353 | * Checks if the language with the given name exists in the database | ||
354 | * | ||
355 | * @param mixed $lang Language name or array of language names | ||
356 | * | ||
357 | * @return bool true if language model exists | ||
358 | */ | ||
359 | public function languageExists($lang) | ||
360 | { | ||
361 | $lang = $this->_convertFromNameMode($lang); | ||
362 | |||
363 | if (is_string($lang)) { | ||
364 | return isset($this->_lang_db[strtolower($lang)]); | ||
365 | |||
366 | } elseif (is_array($lang)) { | ||
367 | foreach ($lang as $test_lang) { | ||
368 | if (!isset($this->_lang_db[strtolower($test_lang)])) { | ||
369 | return false; | ||
370 | } | ||
371 | } | ||
372 | return true; | ||
373 | |||
374 | } else { | ||
375 | throw new Text_LanguageDetect_Exception( | ||
376 | 'Unsupported parameter type passed to languageExists()', | ||
377 | Text_LanguageDetect_Exception::PARAM_TYPE | ||
378 | ); | ||
379 | } | ||
380 | } | ||
381 | |||
382 | /** | ||
383 | * Returns the list of detectable languages | ||
384 | * | ||
385 | * @access public | ||
386 | * @return array the names of the languages known to this object | ||
387 | * @throws Text_LanguageDetect_Exception | ||
388 | */ | ||
389 | function getLanguages() | ||
390 | { | ||
391 | return $this->_convertToNameMode( | ||
392 | array_keys($this->_lang_db) | ||
393 | ); | ||
394 | } | ||
395 | |||
396 | /** | ||
397 | * Make this object behave like Language::Guess | ||
398 | * | ||
399 | * @param bool $setting false to turn off perl compatibility | ||
400 | * | ||
401 | * @return void | ||
402 | */ | ||
403 | public function setPerlCompatible($setting = true) | ||
404 | { | ||
405 | if (is_bool($setting)) { // input check | ||
406 | $this->_perl_compatible = $setting; | ||
407 | |||
408 | if ($setting == true) { | ||
409 | $this->_max_score = $this->_threshold; | ||
410 | } else { | ||
411 | $this->_max_score = 0; | ||
412 | } | ||
413 | } | ||
414 | |||
415 | } | ||
416 | |||
417 | /** | ||
418 | * Sets the way how language names are accepted and returned. | ||
419 | * | ||
420 | * @param integer $name_mode One of the following modes: | ||
421 | * 0 - language name ("english") | ||
422 | * 2 - 2-letter ISO 639-1 code ("en") | ||
423 | * 3 - 3-letter ISO 639-2 code ("eng") | ||
424 | * | ||
425 | * @return void | ||
426 | */ | ||
427 | function setNameMode($name_mode) | ||
428 | { | ||
429 | $this->_name_mode = $name_mode; | ||
430 | } | ||
431 | |||
432 | /** | ||
433 | * Whether to use unicode block ranges in detection | ||
434 | * | ||
435 | * Should speed up most detections if turned on (default is on). In some | ||
436 | * circumstances it may be slower, such as for large text samples (> 10K) | ||
437 | * in languages that use latin scripts. In other cases it should speed up | ||
438 | * detection noticeably. | ||
439 | * | ||
440 | * @param bool $setting false to turn off | ||
441 | * | ||
442 | * @return void | ||
443 | */ | ||
444 | public function useUnicodeBlocks($setting = true) | ||
445 | { | ||
446 | if (is_bool($setting)) { | ||
447 | $this->_use_unicode_narrowing = $setting; | ||
448 | } | ||
449 | } | ||
450 | |||
451 | /** | ||
452 | * Converts a piece of text into trigrams | ||
453 | * | ||
454 | * @param string $text text to convert | ||
455 | * | ||
456 | * @return array array of trigram frequencies | ||
457 | * @access private | ||
458 | * @deprecated Superseded by the Text_LanguageDetect_Parser class | ||
459 | */ | ||
460 | function _trigram($text) | ||
461 | { | ||
462 | $s = new Text_LanguageDetect_Parser($text); | ||
463 | $s->prepareTrigram(); | ||
464 | $s->prepareUnicode(false); | ||
465 | $s->setPadStart(!$this->_perl_compatible); | ||
466 | $s->analyze(); | ||
467 | return $s->getTrigramFreqs(); | ||
468 | } | ||
469 | |||
470 | /** | ||
471 | * Converts a set of trigrams from frequencies to ranks | ||
472 | * | ||
473 | * Thresholds (cuts off) the list at $this->_threshold | ||
474 | * | ||
475 | * @param array $arr array of trigram | ||
476 | * | ||
477 | * @return array ranks of trigrams | ||
478 | * @access protected | ||
479 | */ | ||
480 | function _arr_rank($arr) | ||
481 | { | ||
482 | |||
483 | // sorts alphabetically first as a standard way of breaking rank ties | ||
484 | $this->_bub_sort($arr); | ||
485 | |||
486 | // below might also work, but seemed to introduce errors in testing | ||
487 | //ksort($arr); | ||
488 | //asort($arr); | ||
489 | |||
490 | $rank = array(); | ||
491 | |||
492 | $i = 0; | ||
493 | foreach ($arr as $key => $value) { | ||
494 | $rank[$key] = $i++; | ||
495 | |||
496 | // cut off at a standard threshold | ||
497 | if ($i >= $this->_threshold) { | ||
498 | break; | ||
499 | } | ||
500 | } | ||
501 | |||
502 | return $rank; | ||
503 | } | ||
504 | |||
505 | /** | ||
506 | * Sorts an array by value breaking ties alphabetically | ||
507 | * | ||
508 | * @param array &$arr the array to sort | ||
509 | * | ||
510 | * @return void | ||
511 | * @access private | ||
512 | */ | ||
513 | function _bub_sort(&$arr) | ||
514 | { | ||
515 | // should do the same as this perl statement: | ||
516 | // sort { $trigrams{$b} == $trigrams{$a} | ||
517 | // ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } | ||
518 | |||
519 | // needs to sort by both key and value at once | ||
520 | // using the key to break ties for the value | ||
521 | |||
522 | // converts array into an array of arrays of each key and value | ||
523 | // may be a better way of doing this | ||
524 | $combined = array(); | ||
525 | |||
526 | foreach ($arr as $key => $value) { | ||
527 | $combined[] = array($key, $value); | ||
528 | } | ||
529 | |||
530 | usort($combined, array($this, '_sort_func')); | ||
531 | |||
532 | $replacement = array(); | ||
533 | foreach ($combined as $key => $value) { | ||
534 | list($new_key, $new_value) = $value; | ||
535 | $replacement[$new_key] = $new_value; | ||
536 | } | ||
537 | |||
538 | $arr = $replacement; | ||
539 | } | ||
540 | |||
541 | /** | ||
542 | * Sort function used by bubble sort | ||
543 | * | ||
544 | * Callback function for usort(). | ||
545 | * | ||
546 | * @param array $a first param passed by usort() | ||
547 | * @param array $b second param passed by usort() | ||
548 | * | ||
549 | * @return int -1 if $a has the higher value, 1 if lower; strcmp() of the keys on ties | ||
550 | * @see _bub_sort() | ||
551 | * @access private | ||
552 | */ | ||
553 | function _sort_func($a, $b) | ||
554 | { | ||
555 | // each is actually a key/value pair, so that it can compare using both | ||
556 | list($a_key, $a_value) = $a; | ||
557 | list($b_key, $b_value) = $b; | ||
558 | |||
559 | if ($a_value == $b_value) { | ||
560 | // if the values are the same, break ties using the key | ||
561 | return strcmp($a_key, $b_key); | ||
562 | |||
563 | } else { | ||
564 | // if not, just sort normally | ||
565 | if ($a_value > $b_value) { | ||
566 | return -1; | ||
567 | } else { | ||
568 | return 1; | ||
569 | } | ||
570 | } | ||
571 | |||
572 | // 0 should not be possible because keys must be unique | ||
573 | } | ||
574 | |||
575 | /** | ||
576 | * Calculates a linear rank-order distance statistic between two sets of | ||
577 | * ranked trigrams | ||
578 | * | ||
579 | * Sums the differences in rank for each trigram. If the trigram does not | ||
580 | * appear in both, consider it a difference of $this->_threshold. | ||
581 | * | ||
582 | * This distance measure was proposed by Cavnar & Trenkle (1994). Despite | ||
583 | * its simplicity it has been shown to be highly accurate for language | ||
584 | * identification tasks. | ||
585 | * | ||
586 | * @param array $arr1 the reference set of trigram ranks | ||
587 | * @param array $arr2 the target set of trigram ranks | ||
588 | * | ||
589 | * @return int the sum of the differences between the ranks of | ||
590 | * the two trigram sets | ||
591 | * @access private | ||
592 | */ | ||
593 | function _distance($arr1, $arr2) | ||
594 | { | ||
595 | $sumdist = 0; | ||
596 | |||
597 | foreach ($arr2 as $key => $value) { | ||
598 | if (isset($arr1[$key])) { | ||
599 | $distance = abs($value - $arr1[$key]); | ||
600 | } else { | ||
601 | // $this->_threshold sets the maximum possible distance value | ||
602 | // for any one pair of trigrams | ||
603 | $distance = $this->_threshold; | ||
604 | } | ||
605 | $sumdist += $distance; | ||
606 | } | ||
607 | |||
608 | return $sumdist; | ||
609 | |||
610 | // todo: there are other distance statistics to try, e.g. relative | ||
611 | // entropy, but they're probably more costly to compute | ||
612 | } | ||
613 | |||
614 | /** | ||
615 | * Normalizes the score returned by _distance() | ||
616 | * | ||
617 | * Different if perl compatible or not | ||
618 | * | ||
619 | * @param int $score the score from _distance() | ||
620 | * @param int $base_count the number of trigrams being considered | ||
621 | * | ||
622 | * @return float the normalized score | ||
623 | * @see _distance() | ||
624 | * @access private | ||
625 | */ | ||
626 | function _normalize_score($score, $base_count = null) | ||
627 | { | ||
628 | if ($base_count === null) { | ||
629 | $base_count = $this->_threshold; | ||
630 | } | ||
631 | |||
632 | if (!$this->_perl_compatible) { | ||
633 | return 1 - ($score / $base_count / $this->_threshold); | ||
634 | } else { | ||
635 | return floor($score / $base_count); | ||
636 | } | ||
637 | } | ||
638 | |||
639 | |||
640 | /** | ||
641 | * Detects the closeness of a sample of text to the known languages | ||
642 | * | ||
643 | * Calculates the statistical difference between the text and | ||
644 | * the trigrams for each language, normalizes the score then | ||
645 | * returns results for all languages in sorted order | ||
646 | * | ||
647 | * If perl compatible, the score is 300-0, 0 being most similar. | ||
648 | * Otherwise, it's 0-1 with 1 being most similar. | ||
649 | * | ||
650 | * The $sample text should be at least a few sentences in length; | ||
651 | * should be ascii-7 or utf8 encoded, if another and the mbstring extension | ||
652 | * is present it will try to detect and convert. However, experience has | ||
653 | * shown that mb_detect_encoding() *does not work very well* with at least | ||
654 | * some types of encoding. | ||
655 | * | ||
656 | * @param string $sample a sample of text to compare. | ||
657 | * @param int $limit if specified, return an array of the most likely | ||
658 | * $limit languages and their scores. | ||
659 | * | ||
660 | * @return mixed sorted array of language scores, blank array if no | ||
661 | * useable text was found | ||
662 | * @see _distance() | ||
663 | * @throws Text_LanguageDetect_Exception | ||
664 | */ | ||
665 | public function detect($sample, $limit = 0) | ||
666 | { | ||
667 | // input check | ||
668 | if (!Text_LanguageDetect_Parser::validateString($sample)) { | ||
669 | return array(); | ||
670 | } | ||
671 | |||
672 | // check char encoding | ||
673 | // (only if mbstring extension is compiled and PHP > 4.0.6) | ||
674 | if (function_exists('mb_detect_encoding') | ||
675 | && function_exists('mb_convert_encoding') | ||
676 | ) { | ||
677 | // mb_detect_encoding isn't very reliable, to say the least | ||
678 | // detection should still work with a sufficient sample | ||
679 | // of ascii characters | ||
680 | $encoding = mb_detect_encoding($sample); | ||
681 | |||
682 | // mb_detect_encoding() will return FALSE if detection fails | ||
683 | // don't attempt conversion if that's the case | ||
684 | if ($encoding != 'ASCII' && $encoding != 'UTF-8' | ||
685 | && $encoding !== false | ||
686 | ) { | ||
687 | // verify the encoding exists in mb_list_encodings | ||
688 | if (in_array($encoding, mb_list_encodings())) { | ||
689 | $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); | ||
690 | } | ||
691 | } | ||
692 | } | ||
693 | |||
694 | $sample_obj = new Text_LanguageDetect_Parser($sample); | ||
695 | $sample_obj->prepareTrigram(); | ||
696 | if ($this->_use_unicode_narrowing) { | ||
697 | $sample_obj->prepareUnicode(); | ||
698 | } | ||
699 | $sample_obj->setPadStart(!$this->_perl_compatible); | ||
700 | $sample_obj->analyze(); | ||
701 | |||
702 | $trigram_freqs =& $sample_obj->getTrigramRanks(); | ||
703 | $trigram_count = count($trigram_freqs); | ||
704 | |||
705 | if ($trigram_count == 0) { | ||
706 | return array(); | ||
707 | } | ||
708 | |||
709 | $scores = array(); | ||
710 | |||
711 | // use unicode block detection to narrow down the possibilities | ||
712 | if ($this->_use_unicode_narrowing) { | ||
713 | $blocks =& $sample_obj->getUnicodeBlocks(); | ||
714 | |||
715 | if (is_array($blocks)) { | ||
716 | $present_blocks = array_keys($blocks); | ||
717 | } else { | ||
718 | throw new Text_LanguageDetect_Exception( | ||
719 | 'Error during block detection', | ||
720 | Text_LanguageDetect_Exception::BLOCK_DETECTION | ||
721 | ); | ||
722 | } | ||
723 | |||
724 | $possible_langs = array(); | ||
725 | |||
726 | foreach ($present_blocks as $blockname) { | ||
727 | if (isset($this->_unicode_map[$blockname])) { | ||
728 | |||
729 | $possible_langs = array_merge( | ||
730 | $possible_langs, | ||
731 | array_keys($this->_unicode_map[$blockname]) | ||
732 | ); | ||
733 | |||
734 | // todo: faster way to do this? | ||
735 | } | ||
736 | } | ||
737 | |||
738 | // could also try an intersect operation rather than a union | ||
739 | // in other words, choose languages whose trigrams contain | ||
740 | // ALL of the unicode blocks found in this sample | ||
741 | // would improve speed but would be completely thrown off by an | ||
742 | // unexpected character, like an umlaut appearing in english text | ||
743 | |||
744 | $possible_langs = array_intersect( | ||
745 | array_keys($this->_lang_db), | ||
746 | array_unique($possible_langs) | ||
747 | ); | ||
748 | |||
749 | // needs to intersect it with the keys of _lang_db in case | ||
750 | // languages have been omitted | ||
751 | |||
752 | } else { | ||
753 | // or just try 'em all | ||
754 | $possible_langs = array_keys($this->_lang_db); | ||
755 | } | ||
756 | |||
757 | |||
758 | foreach ($possible_langs as $lang) { | ||
759 | $scores[$lang] = $this->_normalize_score( | ||
760 | $this->_distance($this->_lang_db[$lang], $trigram_freqs), | ||
761 | $trigram_count | ||
762 | ); | ||
763 | } | ||
764 | |||
765 | unset($sample_obj); | ||
766 | |||
767 | if ($this->_perl_compatible) { | ||
768 | asort($scores); | ||
769 | } else { | ||
770 | arsort($scores); | ||
771 | } | ||
772 | |||
773 | // todo: drop languages with a score of $this->_max_score? | ||
774 | |||
775 | // limit the number of returned scores | ||
776 | if ($limit && is_numeric($limit)) { | ||
777 | $limited_scores = array(); | ||
778 | |||
779 | $i = 0; | ||
780 | foreach ($scores as $key => $value) { | ||
781 | if ($i++ >= $limit) { | ||
782 | break; | ||
783 | } | ||
784 | |||
785 | $limited_scores[$key] = $value; | ||
786 | } | ||
787 | |||
788 | return $this->_convertToNameMode($limited_scores, true); | ||
789 | } else { | ||
790 | return $this->_convertToNameMode($scores, true); | ||
791 | } | ||
792 | } | ||
793 | |||
794 | /** | ||
795 | * Returns only the most similar language to the text sample | ||
796 | * | ||
797 | * Calls $this->detect() and returns only the top result | ||
798 | * | ||
799 | * @param string $sample text to detect the language of | ||
800 | * | ||
801 | * @return string the name of the most likely language | ||
802 | * or null if no language is similar | ||
803 | * @see detect() | ||
804 | * @throws Text_LanguageDetect_Exception | ||
805 | */ | ||
806 | public function detectSimple($sample) | ||
807 | { | ||
808 | $scores = $this->detect($sample, 1); | ||
809 | |||
810 | // if top language has the maximum possible score, | ||
811 | // then the top score will have been picked at random | ||
812 | if (!is_array($scores) || empty($scores) | ||
813 | || current($scores) == $this->_max_score | ||
814 | ) { | ||
815 | return null; | ||
816 | } else { | ||
817 | return key($scores); | ||
818 | } | ||
819 | } | ||
820 | |||
821 | /** | ||
822 | * Returns an array containing the most similar language and a confidence | ||
823 | * rating | ||
824 | * | ||
825 | * Confidence is a simple measure calculated from the similarity score | ||
826 | * minus the similarity score from the next most similar language | ||
827 | * divided by the highest possible score. Languages that have closely | ||
828 | * related cousins (e.g. Norwegian and Danish) should generally have lower | ||
829 | * confidence scores. | ||
830 | * | ||
831 | * The similarity score answers the question "How likely is the text the | ||
832 | * returned language regardless of the other languages considered?" The | ||
833 | * confidence score is one way of answering the question "how likely is the | ||
834 | * text the detected language relative to the rest of the language model | ||
835 | * set?" | ||
836 | * | ||
837 | * To see how similar languages are a priori, see languageSimilarity() | ||
838 | * | ||
839 | * @param string $sample text for which language will be detected | ||
840 | * | ||
841 | * @return array most similar language, score and confidence rating | ||
842 | * or null if no language is similar | ||
843 | * @see detect() | ||
844 | * @throws Text_LanguageDetect_Exception | ||
845 | */ | ||
846 | public function detectConfidence($sample) | ||
847 | { | ||
848 | $scores = $this->detect($sample, 2); | ||
849 | |||
850 | // if most similar language has the max score, it | ||
851 | // will have been picked at random | ||
852 | if (!is_array($scores) || empty($scores) | ||
853 | || current($scores) == $this->_max_score | ||
854 | ) { | ||
855 | return null; | ||
856 | } | ||
857 | |||
858 | $arr['language'] = key($scores); | ||
859 | $arr['similarity'] = current($scores); | ||
860 | if (next($scores) !== false) { // if false then no next element | ||
861 | // the goal is to return a higher value if the distance between | ||
862 | // the similarity of the first score and the second score is high | ||
863 | |||
864 | if ($this->_perl_compatible) { | ||
865 | $arr['confidence'] = (current($scores) - $arr['similarity']) | ||
866 | / $this->_max_score; | ||
867 | |||
868 | } else { | ||
869 | $arr['confidence'] = $arr['similarity'] - current($scores); | ||
870 | |||
871 | } | ||
872 | |||
873 | } else { | ||
874 | $arr['confidence'] = null; | ||
875 | } | ||
876 | |||
877 | return $arr; | ||
878 | } | ||
879 | |||
880 | /** | ||
881 | * Returns the distribution of unicode blocks in a given utf8 string | ||
882 | * | ||
883 | * For the block name of a single char, use unicodeBlockName() | ||
884 | * | ||
885 | * @param string $str input string. Must be ascii or utf8 | ||
886 | * @param bool $skip_symbols if true, skip ascii digits, symbols and | ||
887 | * non-printing characters. Includes spaces, | ||
888 | * newlines and common punctuation characters. | ||
889 | * | ||
890 | * @return array | ||
891 | * @throws Text_LanguageDetect_Exception | ||
892 | */ | ||
893 | public function detectUnicodeBlocks($str, $skip_symbols) | ||
894 | { | ||
895 | $skip_symbols = (bool)$skip_symbols; | ||
896 | $str = (string)$str; | ||
897 | |||
898 | $sample_obj = new Text_LanguageDetect_Parser($str); | ||
899 | $sample_obj->prepareUnicode(); | ||
900 | $sample_obj->prepareTrigram(false); | ||
901 | $sample_obj->setUnicodeSkipSymbols($skip_symbols); | ||
902 | $sample_obj->analyze(); | ||
903 | $blocks = $sample_obj->getUnicodeBlocks(); | ||
904 | unset($sample_obj); | ||
905 | return $blocks; | ||
906 | } | ||
907 | |||
/**
 * Returns the block name for a given unicode value
 *
 * If passed a string, will assume it is being passed a UTF8-formatted
 * character and will automatically convert. Otherwise it will assume it
 * is being passed a numeric unicode value.
 *
 * Make sure input is of the correct type! Note that a numeric string such
 * as '5' is treated as the character "5", not as the code point 5.
 *
 * @param mixed $unicode unicode value (int) or a single utf8 char (string)
 *
 * @return mixed the block name string or false if not found
 * @throws Text_LanguageDetect_Exception if a string of zero or several
 *                                       characters is passed, or if the
 *                                       input is neither string nor int
 */
public function unicodeBlockName($unicode)
{
    if (is_string($unicode)) {
        // assume it is being passed a utf8 char, so convert it.
        // Exactly one character is required: an empty string would
        // otherwise decode to null and be reported as the first block.
        if (self::utf8strlen($unicode) != 1) {
            throw new Text_LanguageDetect_Exception(
                'Pass a single char only to this method',
                Text_LanguageDetect_Exception::PARAM_TYPE
            );
        }
        $unicode = $this->_utf8char2unicode($unicode);

    } elseif (!is_int($unicode)) {
        throw new Text_LanguageDetect_Exception(
            'Input must be of type string or int.',
            Text_LanguageDetect_Exception::PARAM_TYPE
        );
    }

    $blocks = $this->_read_unicode_block_db();

    // the search yields either a block record or the -1 "not found" marker
    $result = $this->_unicode_block_name($unicode, $blocks);

    if ($result === -1) {
        return false;
    }

    // index 2 of a block record holds the block name
    return $result[2];
}
951 | |||
/**
 * Searches the unicode block database
 *
 * Returns the block record for a given unicode value. unicodeBlockName()
 * is the public interface for this function, which does input checks which
 * this function omits for speed.
 *
 * Each entry of $blocks is read as array(lower bound, upper bound, ...);
 * the binary search below requires the entries to be ordered by range.
 *
 * @param int   $unicode     the unicode value
 * @param array $blocks      the block database
 * @param int   $block_count the number of defined blocks in the database,
 *                           or -1 to have it counted here
 *
 * @return mixed the matching block record (array), -1 if it failed
 * @see    unicodeBlockName()
 * @access protected
 */
function _unicode_block_name($unicode, $blocks, $block_count = -1)
{
    // for a reference, see
    // http://www.unicode.org/Public/UNIDATA/Blocks.txt

    // nothing to search in an empty database
    if (!isset($blocks[0])) {
        return -1;
    }

    // assume that ascii characters are the most common
    // so try it first for efficiency
    if ($unicode <= $blocks[0][1]) {
        // values below the first block's lower bound (negative numbers)
        // belong to no block at all
        return ($unicode >= $blocks[0][0]) ? $blocks[0] : -1;
    }

    // the optional $block_count param is for efficiency
    // so this function doesn't have to run count() every time
    $high = ($block_count != -1 ? $block_count : count($blocks)) - 1;

    $low = 1; // start with 1 because the first block was handled above

    // your average binary search algorithm
    while ($low <= $high) {
        // integer midpoint; avoids using a float as an array key
        $mid = ($low + $high) >> 1;

        if ($unicode < $blocks[$mid][0]) {
            // if it's lower than the lower bound
            $high = $mid - 1;

        } elseif ($unicode > $blocks[$mid][1]) {
            // if it's higher than the upper bound
            $low = $mid + 1;

        } else {
            // found it
            return $blocks[$mid];
        }
    }

    // failed to find the block
    return -1;

    // todo: differentiate when it's out of range or when it falls
    //       into an unassigned range?
}
1012 | |||
/**
 * Brings up the unicode block database
 *
 * The database is read through _readdb() from the file named by
 * $this->_unicode_db_filename the first time it is needed and then kept
 * in a function-level static variable for later calls.
 *
 * @return array the database of unicode block definitions
 * @throws Text_LanguageDetect_Exception
 * @access protected
 */
function _read_unicode_block_db()
{
    // the unicode definitions never change, so one copy is kept in a
    // static variable instead of being re-read on every call
    static $block_db;

    if (isset($block_db)) {
        return $block_db;
    }

    $block_db = $this->_readdb($this->_unicode_db_filename);

    return $block_db;
}
1033 | |||
/**
 * Calculate the similarities between the language models
 *
 * Use this function to see how similar languages are to each other.
 *
 * If passed 2 language names, will return just those languages compared.
 * If passed 1 language name, will return that language compared to
 * all others.
 * If passed none, will return an array of every language model compared
 * to every other one.
 *
 * Language arguments are given in the configured name mode and are
 * matched case-insensitively (both are lower-cased before the lookup).
 *
 * @param string $lang1 the name of the first language to be compared
 * @param string $lang2 the name of the second language to be compared
 *
 * @return array scores of every language compared
 *               or the score of just the provided languages
 *               or null if one of the supplied languages does not exist
 * @throws Text_LanguageDetect_Exception
 */
public function languageSimilarity($lang1 = null, $lang2 = null)
{
    // Decide which comparison was requested from the raw arguments, before
    // any name conversion: a code that cannot be converted becomes '' and
    // must be reported as "does not exist" rather than silently switching
    // to a broader comparison. It also keeps the null defaults away from
    // _convertFromNameMode().
    $has_lang1 = ($lang1 != null);
    $has_lang2 = ($lang2 != null);

    if (!$has_lang1) {
        // compare all languages to each other
        $names      = array_keys($this->_lang_db);
        $return_arr = array();

        foreach ($names as $name1) {
            foreach ($names as $name2) {
                // skip comparing languages to themselves
                if ($name1 == $name2) {
                    continue;
                }

                if (isset($return_arr[$name2][$name1])) {
                    // don't re-calculate what's already been done
                    $return_arr[$name1][$name2]
                        = $return_arr[$name2][$name1];
                } else {
                    // calculate
                    $return_arr[$name1][$name2]
                        = $this->_normalize_score(
                            $this->_distance(
                                $this->_lang_db[$name1],
                                $this->_lang_db[$name2]
                            )
                        );
                }
            }
        }

        return $return_arr;
    }

    $lang1 = strtolower($this->_convertFromNameMode($lang1));

    // check if language model exists
    if (!isset($this->_lang_db[$lang1])) {
        return null;
    }

    if ($has_lang2) {
        // lower-case BEFORE the lookup, exactly as is done for $lang1
        $lang2 = strtolower($this->_convertFromNameMode($lang2));

        // check if language model exists
        if (!isset($this->_lang_db[$lang2])) {
            return null;
        }

        // compare just these two languages
        return $this->_normalize_score(
            $this->_distance(
                $this->_lang_db[$lang1],
                $this->_lang_db[$lang2]
            )
        );
    }

    // compare just $lang1 to all languages
    $return_arr = array();
    foreach ($this->_lang_db as $key => $value) {
        if ($key != $lang1) {
            // don't compare a language to itself
            $return_arr[$key] = $this->_normalize_score(
                $this->_distance($this->_lang_db[$lang1], $value)
            );
        }
    }
    asort($return_arr);

    return $return_arr;
}
1128 | |||
/**
 * Cluster known languages according to languageSimilarity()
 *
 * WARNING: this method is EXPERIMENTAL. It is not recommended for common
 * use, and it may disappear or its functionality may change in future
 * releases without notice.
 *
 * Uses a nearest neighbor technique to generate the maximum possible
 * number of dendrograms from the similarity data.
 *
 * Starting from the full pairwise similarity table, the pair with the
 * highest score is repeatedly merged into one node keyed
 * "<bestfit>:<otherfit>" until at most two nodes remain, the best score is
 * 0, or 200 merge steps have been made. The result is cached in
 * $this->_clusters and returned directly on later calls.
 *
 * The returned array has three keys:
 *  - 'open_forks': the nodes still unmerged when the loop stopped
 *  - 'fork_data':  one record per merge, keyed by the merged node's name
 *  - 'name_map':   merged node name => the language it was traced back to
 *
 * @access     public
 * @return     array language cluster data
 * @throws     Text_LanguageDetect_Exception
 * @see        languageSimilarity()
 * @deprecated this function will eventually be removed and placed into
 *              the model generation class
 */
function clusterLanguages()
{
    // todo: set the maximum number of clusters
    // return cached result, if any
    if (isset($this->_clusters)) {
        return $this->_clusters;
    }

    $langs = array_keys($this->_lang_db);

    // all-against-all similarity table: $arr[$a][$b] is the score of a vs b
    $arr = $this->languageSimilarity();

    sort($langs);

    foreach ($langs as $lang) {
        if (!isset($this->_lang_db[$lang])) {
            throw new Text_LanguageDetect_Exception(
                "missing $lang!",
                Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
            );
        }
    }

    // http://www.psychstat.missouristate.edu/multibook/mlt04m.html
    // re-key the list so that each language is stored under its own name
    // (name => name); nodes can then be removed and added by name below
    foreach ($langs as $old_key => $lang1) {
        $langs[$lang1] = $lang1;
        unset($langs[$old_key]);
    }

    $result_data = $really_map = array();

    // $i counts merge steps; the loop gives up after 200 of them
    $i = 0;
    while (count($langs) > 2 && $i++ < 200) {
        // find the pair of remaining nodes with the highest score
        $highest_score = -1;
        $highest_key1 = '';
        $highest_key2 = '';
        foreach ($langs as $lang1) {
            foreach ($langs as $lang2) {
                if ($lang1 != $lang2
                    && $arr[$lang1][$lang2] > $highest_score
                ) {
                    $highest_score = $arr[$lang1][$lang2];
                    $highest_key1 = $lang1;
                    $highest_key2 = $lang2;
                }
            }
        }

        if (!$highest_key1) {
            // should not ever happen
            throw new Text_LanguageDetect_Exception(
                "no highest key? (step: $i)",
                Text_LanguageDetect_Exception::NO_HIGHEST_KEY
            );
        }

        if ($highest_score == 0) {
            // languages are perfectly dissimilar
            break;
        }

        // $highest_key1 and $highest_key2 are most similar
        // total of each one's scores against everything in its row
        $sum1 = array_sum($arr[$highest_key1]);
        $sum2 = array_sum($arr[$highest_key2]);

        // use the score for the one that is most similar to the rest of
        // the field as the score for the group
        // todo: could try averaging or "centroid" method instead
        // seems like that might make more sense
        // actually nearest neighbor may be better for binary searching


        // for "Complete Linkage"/"furthest neighbor"
        // sign should be <
        // for "Single Linkage"/"nearest neighbor" method
        // sign should be >
        // results seem to be pretty much the same with either method

        // figure out which to delete and which to replace
        if ($sum1 > $sum2) {
            $replaceme = $highest_key1;
            $deleteme = $highest_key2;
        } else {
            $replaceme = $highest_key2;
            $deleteme = $highest_key1;
        }

        // name of the merged node: "<kept member>:<dropped member>"
        $newkey = $replaceme . ':' . $deleteme;

        // $replaceme is most similar to remaining languages
        // replace $replaceme with '$newkey', deleting $deleteme

        // keep a record of which fork is really which language:
        // follow the map until a name with no further entry is reached
        $really_lang = $replaceme;
        while (isset($really_map[$really_lang])) {
            $really_lang = $really_map[$really_lang];
        }
        $really_map[$newkey] = $really_lang;


        // replace the best fitting key, delete the other
        // NOTE(review): the three checks below run in this order for every
        // cell and each one writes to $arr; the outcome depends on that
        // exact order, so do not reorder or merge them without re-checking
        foreach ($arr as $key1 => $arr2) {
            foreach ($arr2 as $key2 => $value2) {
                if ($key2 == $replaceme) {
                    $arr[$key1][$newkey] = $arr[$key1][$key2];
                    unset($arr[$key1][$key2]);
                    // replacing $arr[$key1][$key2] with $arr[$key1][$newkey]
                }

                if ($key1 == $replaceme) {
                    $arr[$newkey][$key2] = $arr[$key1][$key2];
                    unset($arr[$key1][$key2]);
                    // replacing $arr[$key1][$key2] with $arr[$newkey][$key2]
                }

                if ($key1 == $deleteme || $key2 == $deleteme) {
                    // deleting $arr[$key1][$key2]
                    unset($arr[$key1][$key2]);
                }
            }
        }


        // the two merged nodes leave the working set, the new node joins it
        unset($langs[$highest_key1]);
        unset($langs[$highest_key2]);
        $langs[$newkey] = $newkey;


        // some of these may be overkill
        $result_data[$newkey] = array(
            'newkey' => $newkey,
            'count' => $i,
            'diff' => abs($sum1 - $sum2),
            'score' => $highest_score,
            'bestfit' => $replaceme,
            'otherfit' => $deleteme,
            'really' => $really_lang,
        );
    }

    $return_val = array(
        'open_forks' => $langs,
        // the top level of clusters
        // clusters that are mutually exclusive
        // or specified by a specific maximum

        'fork_data' => $result_data,
        // data for each split

        'name_map' => $really_map,
        // which cluster is really which language
        // using the nearest neighbor technique, the cluster
        // inherits all of the properties of its most-similar member
        // this keeps track
    );


    // saves the result in the object
    $this->_clusters = $return_val;

    return $return_val;
}
1308 | |||
1309 | |||
/**
 * Perform an intelligent detection based on clusterLanguages()
 *
 * WARNING: this method is EXPERIMENTAL. It is not recommended for common
 * use, and it may disappear or its functionality may change in future
 * releases without notice.
 *
 * This compares the sample text to the top level of clusters. If the
 * sample is similar to the cluster it will drop down and compare it to the
 * languages in the cluster, and so on until it hits a leaf node.
 *
 * this should find the language in considerably fewer compares
 * (the equivalent of a binary search), however clusterLanguages() is costly
 * and the loss of accuracy from this technique is significant.
 *
 * This method may need to be 'fuzzier' in order to become more accurate.
 *
 * This function could be more useful if the universe of possible languages
 * was very large, however in such cases some method of Bayesian inference
 * might be more helpful.
 *
 * @param string $str input string
 *
 * @return array language scores (only those compared); an empty array if
 *               the input is rejected, yields no trigrams, or there is no
 *               cluster to compare against
 * @throws Text_LanguageDetect_Exception
 * @see    clusterLanguages()
 */
public function clusteredSearch($str)
{
    // input check
    if (!Text_LanguageDetect_Parser::validateString($str)) {
        return array();
    }

    // clusterLanguages() will return a cached result if possible
    // so it's safe to call it every time
    $result = $this->clusterLanguages();

    $dendogram_start = $result['open_forks'];
    $dendogram_data  = $result['fork_data'];
    $dendogram_alias = $result['name_map'];

    $sample_obj = new Text_LanguageDetect_Parser($str);
    $sample_obj->prepareTrigram();
    $sample_obj->setPadStart(!$this->_perl_compatible);
    $sample_obj->analyze();
    $sample_result = $sample_obj->getTrigramRanks();
    $sample_count  = count($sample_result);

    // input check
    if ($sample_count == 0) {
        return array();
    }

    // score the sample against every top-level node; a merged node is
    // scored through the language it is mapped to in the alias table
    $scores = array();
    foreach ($dendogram_start as $lang) {
        $lang_key = isset($dendogram_alias[$lang])
            ? $dendogram_alias[$lang]
            : $lang;

        $scores[$lang] = $this->_normalize_score(
            $this->_distance($this->_lang_db[$lang_key], $sample_result),
            $sample_count
        );
    }

    // no top-level nodes means there is nothing to rank or descend into
    if (count($scores) == 0) {
        return array();
    }

    if ($this->_perl_compatible) {
        asort($scores);
    } else {
        arsort($scores);
    }

    // of starting forks, the first one after sorting is the most similar
    // to the sample
    reset($scores);
    $cur_key = key($scores);

    // walk down the tree: at every fork score both members and follow
    // the one with the larger score
    // NOTE(review): the larger score is followed in perl-compatible mode
    // too, although the initial sort there is ascending — confirm intended
    while (isset($dendogram_data[$cur_key])) {
        $lang1 = $dendogram_data[$cur_key]['bestfit'];
        $lang2 = $dendogram_data[$cur_key]['otherfit'];
        foreach (array($lang1, $lang2) as $lang) {
            $lang_key = isset($dendogram_alias[$lang])
                ? $dendogram_alias[$lang]
                : $lang;

            $scores[$lang] = $this->_normalize_score(
                $this->_distance($this->_lang_db[$lang_key], $sample_result),
                $sample_count
            );

            //todo: does not need to do same comparison again
        }

        $cur_key = ($scores[$lang1] > $scores[$lang2]) ? $lang1 : $lang2;
    }

    // rather than sorting the result, preserve it so that you can see
    // which paths the algorithm decided to take along the tree

    // but sometimes the last item is only the second highest.
    // Needs at least two entries: with a single one prev() runs off the
    // array and the swap below would add a bogus '' => false entry.
    if (count($scores) > 1) {
        $last        = end($scores);
        $second_last = prev($scores);

        $out_of_order = $this->_perl_compatible
            ? ($last > $second_last)
            : ($last < $second_last);

        if ($out_of_order) {
            // the internal pointer now rests on the 2nd-to-last item
            $real_last_score = current($scores);
            $real_last_key   = key($scores);

            // swaps the 2nd-to-last item for the last item
            unset($scores[$real_last_key]);
            $scores[$real_last_key] = $real_last_score;
        }
    }

    if (!$this->_perl_compatible) {
        // second param (preserve keys) requires php > 4.0.3
        $scores = array_reverse($scores, true);
    }

    return $scores;
}
1453 | |||
/**
 * utf8-safe strlen()
 *
 * Returns the numbers of characters (not bytes) in a utf8 string
 *
 * Does not rely on utf8_decode() (deprecated as of PHP 8.2) nor on the
 * optional mbstring extension. For well-formed UTF-8 the result is the
 * same; malformed input is not validated, and stray continuation bytes
 * are simply not counted.
 *
 * @param string $str string to get the length of
 *
 * @return int number of chars
 */
public static function utf8strlen($str)
{
    // Every UTF-8 encoded character contains exactly one byte that is not
    // a continuation byte (10xxxxxx, i.e. 0x80-0xBF), so counting those
    // bytes counts the characters. The pattern deliberately runs without
    // the /u modifier so that it works on raw bytes.
    return (int)preg_match_all('/[^\x80-\xBF]/', (string)$str, $unused);
}
1472 | |||
/**
 * Returns the unicode value of a utf8 char
 *
 * The byte length of $char selects the decoding rule; the bytes are not
 * validated beyond that.
 *
 * @param string $char a utf8 (possibly multi-byte) char
 *
 * @return int|null unicode value, or null if $char is not 1 to 4 bytes long
 * @access protected
 * @link   http://en.wikipedia.org/wiki/UTF-8
 */
function _utf8char2unicode($char)
{
    // Bytes are read with [] offsets: the {} offset syntax used formerly
    // is a parse error on PHP 8.

    // strlen() here will actually get the binary length of a single char
    switch (strlen($char)) {
    case 1:
        // normal ASCII-7 byte
        // 0xxxxxxx -->  0xxxxxxx
        return ord($char[0]);

    case 2:
        // 2 byte unicode
        // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
        $z = (ord($char[0]) & 0x1F) << 6;
        $x = (ord($char[1]) & 0x3F);
        return ($z | $x);

    case 3:
        // 3 byte unicode
        // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
        $z  = (ord($char[0]) & 0x0F) << 12;
        $x1 = (ord($char[1]) & 0x3F) << 6;
        $x2 = (ord($char[2]) & 0x3F);
        return ($z | $x1 | $x2);

    case 4:
        // 4 byte unicode
        // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
        // 000zzzzz xxxxxxxx xxxxxxxx
        $z1 = (ord($char[0]) & 0x07) << 18;
        $z2 = (ord($char[1]) & 0x3F) << 12;
        $x1 = (ord($char[2]) & 0x3F) << 6;
        $x2 = (ord($char[3]) & 0x3F);
        return ($z1 | $z2 | $x1 | $x2);

    default:
        // empty or over-long input: no value can be derived
        // (this was an implicit null before)
        return null;
    }
}
1517 | |||
/**
 * utf8-safe fast character iterator
 *
 * Will get the next character starting from $counter, which will then be
 * incremented. If a multi-byte char the bytes will be concatenated and
 * $counter will be incremented by the number of bytes in the char.
 *
 * With $special_convert set, ASCII letters are lower-cased, every other
 * ASCII character except space and "'" becomes a space, and a subset of
 * two-byte characters is lower-cased (the Latin-1 capitals encoded with
 * lead byte 0xC3 and the Cyrillic capitals 0xD0 0x90 - 0xD0 0xAF).
 *
 * The caller must make sure that $counter is inside the string and that
 * the string is not cut off in the middle of a character; no bounds
 * checks are made here.
 *
 * @param string $str             the string being iterated over
 * @param int    &$counter        the iterator, will increment by reference
 * @param bool   $special_convert whether to do special conversions
 *
 * @return string|null the next (possibly multi-byte) char from $counter,
 *                     or null if the byte at $counter cannot start a char
 * @access private
 */
static function _next_char($str, &$counter, $special_convert = false)
{
    // Bytes are read with [] offsets: the {} offset syntax used formerly
    // is a parse error on PHP 8.
    $char = $str[$counter++];
    $ord  = ord($char);

    // for a description of the utf8 system see
    // http://www.phpclasses.org/browse/file/5131.html

    // normal ascii one byte char
    if ($ord <= 127) {
        // special conversions needed for this package
        // (that only apply to regular ascii characters)
        // lower case, and convert all non-alphanumeric characters
        // other than "'" to space
        if ($special_convert && $char != ' ' && $char != "'") {
            if ($ord >= 65 && $ord <= 90) { // A-Z
                $char = chr($ord + 32); // lower case
            } elseif ($ord < 97 || $ord > 122) { // NOT a-z
                $char = ' '; // convert to space
            }
        }

        return $char;
    }

    // two-byte char (lead byte 110xxxxx)
    if ($ord >> 5 == 6) {
        $nextchar = $str[$counter++]; // get next byte

        // lower-casing of non-ascii characters is still incomplete

        if ($special_convert) {
            if ($ord == 195) {
                // lower case latin accented characters
                $nextord     = ord($nextchar);
                $nextord_adj = $nextord + 64; // the ISO-8859-1 code
                // for a reference, see
                // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html

                // À - Þ but not ×
                if ($nextord_adj >= 192
                    && $nextord_adj <= 222
                    && $nextord_adj != 215
                ) {
                    $nextchar = chr($nextord + 32);
                }

            } elseif ($ord == 208) {
                // lower case cyrillic alphabet
                $nextord = ord($nextchar);

                if ($nextord >= 144 && $nextord <= 159) {
                    // A - Pe: the lower case form keeps the lead byte
                    $nextchar = chr($nextord + 32);

                } elseif ($nextord >= 160 && $nextord <= 175) {
                    // Er - Ya: the lower case form moves to lead byte 209
                    $char     = chr(209); // == $ord++
                    $nextchar = chr($nextord - 32);
                }
            }
        }

        // tag on next byte
        return $char . $nextchar;
    }

    // three-byte char (lead byte 1110xxxx)
    if ($ord >> 4 == 14) {
        // tag on next 2 bytes
        return $char . $str[$counter++] . $str[$counter++];
    }

    // four-byte char (lead byte 11110xxx)
    if ($ord >> 3 == 30) {
        // tag on next 3 bytes
        return $char . $str[$counter++] . $str[$counter++] . $str[$counter++];
    }

    // a continuation byte or an invalid lead byte: no character is
    // produced (this was an implicit null before)
    return null;
}
1611 | |||
/**
 * Converts a $language input parameter from the configured mode
 * to the language name that is used internally.
 *
 * Works for strings and arrays. In name mode 0 the input is returned
 * untouched; in mode 2 the conversion is done by
 * Text_LanguageDetect_ISO639::code2ToName(), in any other mode by
 * Text_LanguageDetect_ISO639::code3ToName(). Null ("no language given")
 * is passed through unchanged in every mode.
 *
 * @param string|array $lang       A language description ("english"/"en"/"eng")
 * @param boolean      $convertKey If $lang is an array, setting $convertKey
 *                                 converts the keys to the language name.
 *
 * @return string|array|null Language name
 */
function _convertFromNameMode($lang, $convertKey = false)
{
    // nothing to convert; null in particular must not reach the array
    // branch below, where it would raise a foreach warning and turn
    // into an empty array
    if ($this->_name_mode == 0 || $lang === null) {
        return $lang;
    }

    $method = ($this->_name_mode == 2) ? 'code2ToName' : 'code3ToName';

    if (is_string($lang)) {
        return (string)Text_LanguageDetect_ISO639::$method($lang);
    }

    // array input: convert either the keys or the values
    $newlang = array();
    foreach ($lang as $key => $val) {
        if ($convertKey) {
            $newkey = (string)Text_LanguageDetect_ISO639::$method($key);
            $newlang[$newkey] = $val;
        } else {
            $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val);
        }
    }
    return $newlang;
}
1651 | |||
/**
 * Converts a $language output parameter from the language name that is
 * used internally to the configured mode.
 *
 * Works for strings and arrays. In name mode 0 the input is returned
 * untouched; in mode 2 the conversion is done by
 * Text_LanguageDetect_ISO639::nameToCode2(), in any other mode by
 * Text_LanguageDetect_ISO639::nameToCode3(). Null ("no language") is
 * passed through unchanged in every mode.
 *
 * @param string|array $lang       A language description ("english"/"en"/"eng")
 * @param boolean      $convertKey If $lang is an array, setting $convertKey
 *                                 converts the keys to the configured mode.
 *
 * @return string|array|null Language in the configured mode
 */
function _convertToNameMode($lang, $convertKey = false)
{
    // nothing to convert; null in particular must not reach the array
    // branch below, where it would raise a foreach warning and turn
    // into an empty array
    if ($this->_name_mode == 0 || $lang === null) {
        return $lang;
    }

    $method = ($this->_name_mode == 2) ? 'nameToCode2' : 'nameToCode3';

    if (is_string($lang)) {
        return Text_LanguageDetect_ISO639::$method($lang);
    }

    // array input: convert either the keys or the values
    $newlang = array();
    foreach ($lang as $key => $val) {
        if ($convertKey) {
            $newkey = Text_LanguageDetect_ISO639::$method($key);
            $newlang[$newkey] = $val;
        } else {
            $newlang[$key] = Text_LanguageDetect_ISO639::$method($val);
        }
    }
    return $newlang;
}
1691 | } | ||
1692 | |||
1693 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file | ||