diff options
author | Nicolas Lœuillet <nicolas.loeuillet@gmail.com> | 2013-12-06 00:49:43 -0800 |
---|---|---|
committer | Nicolas Lœuillet <nicolas.loeuillet@gmail.com> | 2013-12-06 00:49:43 -0800 |
commit | d5501950e2470d52f6bf5954d2179010cdee0475 (patch) | |
tree | c452a0cbcbe3a49fa72e5ba7e8c249302da8ae29 /inc/3rdparty/libraries/language-detect/LanguageDetect.php | |
parent | 0b5c6ff3195e145a1fb5edb67741b8f6ed231fa2 (diff) | |
parent | 42c80841c846610be280218d53fcde06b0f0063b (diff) | |
download | wallabag-d5501950e2470d52f6bf5954d2179010cdee0475.tar.gz wallabag-d5501950e2470d52f6bf5954d2179010cdee0475.tar.zst wallabag-d5501950e2470d52f6bf5954d2179010cdee0475.zip |
Merge pull request #353 from inthepoche/ftr
[change] we now use Full-Text RSS 3.1, thank you so much @fivefilters
Diffstat (limited to 'inc/3rdparty/libraries/language-detect/LanguageDetect.php')
-rw-r--r-- | inc/3rdparty/libraries/language-detect/LanguageDetect.php | 1635 |
1 files changed, 1635 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect.php b/inc/3rdparty/libraries/language-detect/LanguageDetect.php new file mode 100644 index 00000000..09b11546 --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect.php | |||
@@ -0,0 +1,1635 @@ | |||
1 | <?php | ||
2 | |||
3 | /** | ||
4 | * Detects the language of a given piece of text. | ||
5 | * | ||
6 | * Attempts to detect the language of a sample of text by correlating ranked | ||
7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. | ||
8 | * | ||
9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle | ||
10 | * (1994): "N-Gram-Based Text Categorization" | ||
11 | * | ||
12 | * PHP versions 4 and 5 | ||
13 | * | ||
14 | * @category Text | ||
15 | * @package Text_LanguageDetect | ||
16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | ||
17 | * @copyright 2005-2006 Nicholas Pisarro | ||
18 | * @license http://www.debian.org/misc/bsd.license BSD | ||
19 | * @version CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $ | ||
20 | * @link http://pear.php.net/package/Text_LanguageDetect/ | ||
21 | * @link http://langdetect.blogspot.com/ | ||
22 | */ | ||
23 | |||
24 | //require_once 'PEAR.php'; | ||
25 | require_once 'Parser.php'; | ||
26 | |||
27 | /** | ||
28 | * Language detection class | ||
29 | * | ||
30 | * Requires the language model database (lang.dat) that should have | ||
31 | * accompanied this class definition in order to be instantiated. | ||
32 | * | ||
33 | * Example usage: | ||
34 | * | ||
35 | * <code> | ||
36 | * require_once 'Text/LanguageDetect.php'; | ||
37 | * | ||
38 | * $l = new Text_LanguageDetect; | ||
39 | * | ||
40 | * $stdin = fopen('php://stdin', 'r'); | ||
41 | * | ||
42 | * echo "Supported languages:\n"; | ||
43 | * | ||
44 | * $langs = $l->getLanguages(); | ||
45 | * if (PEAR::isError($langs)) { | ||
46 | * die($langs->getMessage()); | ||
47 | * } | ||
48 | * | ||
49 | * sort($langs); | ||
50 | * echo join(', ', $langs); | ||
51 | * | ||
52 | * while ($line = fgets($stdin)) { | ||
53 | * print_r($l->detect($line, 4)); | ||
54 | * } | ||
55 | * </code> | ||
56 | * | ||
57 | * @category Text | ||
58 | * @package Text_LanguageDetect | ||
59 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | ||
60 | * @copyright 2005 Nicholas Pisarro | ||
61 | * @license http://www.debian.org/misc/bsd.license BSD | ||
62 | * @version Release: @package_version@ | ||
63 | * @todo allow users to generate their own language models | ||
64 | */ | ||
65 | |||
66 | class Text_LanguageDetect | ||
67 | { | ||
/**
 * The filename that stores the trigram data for the detector.
 *
 * The constructor overwrites this with its $db argument when one is given
 * and then passes the value to _readdb().
 *
 * NOTE(review): the original comment said a leading slash or dot makes
 * $this->_data_dir be ignored; in the visible code _get_data_loc() returns
 * the filename unchanged, so $_data_dir does not appear to be consulted.
 *
 * @var string
 * @access private
 */
var $_db_filename = './lang.dat';

/**
 * The filename that stores the unicode block definitions.
 *
 * The constructor overwrites this with its $unicode_db argument when one is
 * given; the value is handed to every Text_LanguageDetect_Parser that this
 * class creates.
 *
 * @var string
 * @access private
 */
var $_unicode_db_filename = './unicode_blocks.dat';

/**
 * The data directory.
 *
 * '@data_dir@' is a placeholder that should be replaced by the PEAR
 * installer.
 *
 * @var string
 * @access private
 */
var $_data_dir = '@data_dir@';

/**
 * The trigram data for comparison.
 *
 * Filled by the constructor from the 'trigram' entry of the database read
 * from $this->_db_filename. Keys are looked up with lowercased language
 * names; each value is used as the reference rank table in _distance().
 *
 * _setup_ok() throws an Exception when this is not a non-empty array.
 *
 * @var array
 * @access private
 */
var $_lang_db = array();

/**
 * Stores the map of the trigram data to unicode characters.
 *
 * Filled by the constructor from the 'trigram-unicodemap' entry of the
 * database when that entry exists. detect() indexes it by unicode block
 * name and treats the keys of each entry as language names.
 *
 * @access private
 * @var array
 */
var $_unicode_map;

/**
 * The size of the trigram data arrays.
 *
 * Used as the rank cut-off in _arr_rank(), as the distance charged for a
 * trigram missing from a language model in _distance(), and as a divisor
 * in _normalize_score().
 *
 * @var int
 * @access private
 */
var $_threshold = 300;

/**
 * The maximum possible score.
 *
 * Needed for score normalization. setPerlCompatible() sets it to
 * $this->_threshold when perl compatibility is switched on and to 0 when it
 * is switched off. detectSimple() and detectConfidence() return null when
 * the best score equals this value.
 *
 * @access private
 * @var int
 * @see setPerlCompatible()
 */
var $_max_score = 0;

/**
 * Whether or not to simulate perl's Language::Guess exactly.
 *
 * @access private
 * @var bool
 * @see setPerlCompatible()
 */
var $_perl_compatible = false;

/**
 * Whether to use the unicode block detection to speed up processing.
 *
 * @access private
 * @var bool
 * @see useUnicodeBlocks()
 */
var $_use_unicode_narrowing = true;

/**
 * Stores the result of the clustering operation.
 *
 * Filled by the constructor from the 'trigram-clusters' entry of the
 * database when that entry exists, and discarded by omitLanguages() whenever
 * languages are removed.
 *
 * @access private
 * @var array
 * @see clusterLanguages()
 */
var $_clusters;
166 | |||
/**
 * Constructor.
 *
 * Loads the language model database and stores its trigram table and, when
 * present, its unicode-block map and cluster cache on the object.
 *
 * Declared as __construct() because PHP 8 no longer treats a method named
 * after the class as a constructor; without it the database would never be
 * loaded on current PHP versions.
 *
 * @param string|null $db         path to the trigram database; null keeps
 *                                the default in $this->_db_filename
 * @param string|null $unicode_db path to the unicode block database; null
 *                                keeps the default in
 *                                $this->_unicode_db_filename
 * @throws Exception when _readdb() cannot find or read the database file
 */
function __construct($db = null, $unicode_db = null)
{
    if (isset($db)) {
        $this->_db_filename = $db;
    }
    if (isset($unicode_db)) {
        $this->_unicode_db_filename = $unicode_db;
    }

    $data = $this->_readdb($this->_db_filename);

    // unserialize() yields false for a corrupt file. Store a non-array value
    // in that case so that _setup_ok() reports the problem on first use,
    // instead of indexing into a boolean here.
    $this->_lang_db = (is_array($data) && isset($data['trigram']))
        ? $data['trigram']
        : null;

    if (is_array($data) && isset($data['trigram-unicodemap'])) {
        $this->_unicode_map = $data['trigram-unicodemap'];
    }

    // Not yet implemented:
    if (is_array($data) && isset($data['trigram-clusters'])) {
        $this->_clusters = $data['trigram-clusters'];
    }
}

/**
 * PHP 4 style constructor, kept so that existing code calling
 * Text_LanguageDetect() explicitly (e.g. from a subclass) keeps working.
 *
 * @param string|null $db         see __construct()
 * @param string|null $unicode_db see __construct()
 * @throws Exception see __construct()
 */
function Text_LanguageDetect($db = null, $unicode_db = null)
{
    $this->__construct($db, $unicode_db);
}
191 | |||
/**
 * Resolves the location of a database file.
 *
 * This implementation performs no lookup: the given filename is handed
 * back exactly as received.
 *
 * @access private
 * @param  string $fname filename as configured on the object
 * @return string path that _readdb() should open
 */
function _get_data_loc($fname)
{
    $location = $fname;

    return $location;
}
202 | |||
/**
 * Loads the language trigram database from a file.
 *
 * The trigram database should be a serialize()'d array. The unserialized
 * value is returned as-is; it is false when the file content cannot be
 * unserialized.
 *
 * @access private
 * @param  string $fname the filename where the data is stored
 * @return mixed  the language model data, or false if it cannot be decoded
 * @throws Exception if the file does not exist or cannot be read
 */
function _readdb($fname)
{
    // finds the correct data dir
    $fname = $this->_get_data_loc($fname);

    // input check
    if (!file_exists($fname)) {
        throw new Exception('Language database does not exist.');
    }
    if (!is_readable($fname)) {
        throw new Exception('Language database is not readable.');
    }

    // is_readable() can succeed and the read still fail (e.g. the path is
    // a directory), so check the result instead of unserializing false.
    $contents = file_get_contents($fname);
    if ($contents === false) {
        throw new Exception('Language database is not readable.');
    }

    // NOTE(review): unserialize() must only ever see a trusted file. The
    // model is expected to contain plain arrays, so object instantiation is
    // switched off where the runtime supports the option (PHP 7+).
    if (version_compare(PHP_VERSION, '7.0.0', '>=')) {
        return unserialize($contents, array('allowed_classes' => false));
    }

    return unserialize($contents);
}
237 | |||
238 | |||
/**
 * Checks if this object is ready to detect languages.
 *
 * Every failure is reported by throwing, so the only value ever returned
 * is true.
 *
 * @access private
 * @param  mixed &$err kept for signature compatibility; this method never
 *                     assigns to it
 * @return bool  true when the language database is a non-empty array
 * @throws Exception if the language database is not an array or is empty
 */
function _setup_ok(&$err)
{
    if (!is_array($this->_lang_db)) {
        // magic_quotes_runtime corrupts the serialized data while it is
        // being read, so point at it when it is switched on
        if (ini_get('magic_quotes_runtime')) {
            throw new Exception('Error loading database. Try turning magic_quotes_runtime off.');
        }

        throw new Exception('Language database is not an array.');
    }

    if (empty($this->_lang_db)) {
        throw new Exception('Language database has no elements.');
    }

    return true;
}
264 | |||
/**
 * Omits languages.
 *
 * Pass this function the name of, or an array of names of, languages that
 * you don't want considered. Names are matched case-insensitively in every
 * mode (they are lowercased before being compared with the database keys).
 *
 * If you're only expecting a limited set of languages, this can greatly
 * speed up processing.
 *
 * @access public
 * @param  mixed $omit_list    language name or array of names to omit
 * @param  bool  $include_only if true will include (rather than
 *                             exclude) only those in the list
 * @return int   number of languages successfully deleted
 * @throws Exception if the language database is not usable
 */
function omitLanguages($omit_list, $include_only = false)
{
    // setup check
    if (!$this->_setup_ok($err)) {
        return $err;
    }

    // accept a single name or a list, and case desensitize both forms the
    // same way
    $names = array();
    foreach (is_array($omit_list) ? $omit_list : array($omit_list) as $name) {
        $names[] = strtolower($name);
    }

    $deleted = 0;

    if (!$include_only) {
        // deleting the given languages
        foreach ($names as $name) {
            if (isset($this->_lang_db[$name])) {
                unset($this->_lang_db[$name]);
                $deleted++;
            }
        }
    } else {
        // deleting all except the given languages
        foreach (array_keys($this->_lang_db) as $lang) {
            if (!in_array($lang, $names)) {
                unset($this->_lang_db[$lang]);
                $deleted++;
            }
        }
    }

    // reset the cluster cache if the number of languages changes;
    // this will then have to be recalculated. Assigning null keeps the
    // declared property in place while still making isset() report false.
    if (isset($this->_clusters) && $deleted > 0) {
        $this->_clusters = null;
    }

    return $deleted;
}
335 | |||
336 | |||
/**
 * Reports how many language models are currently loaded.
 *
 * @access public
 * @return int number of entries in the language database
 * @throws Exception raised by _setup_ok() when the database is unusable
 */
function getLanguageCount()
{
    $ready = $this->_setup_ok($err);

    return $ready ? count($this->_lang_db) : $err;
}
352 | |||
/**
 * Tells whether a language model is loaded for the given name(s).
 *
 * Names are lowercased before the lookup. When an array is given, every
 * name in it has to be known for the result to be true.
 *
 * @access public
 * @param  mixed $lang language name or array of language names
 * @return bool  true if a model exists for the name (or for all names)
 * @throws Exception if $lang is neither a string nor an array, or if
 *                   _setup_ok() rejects the database
 */
function languageExists($lang)
{
    if (!$this->_setup_ok($err)) {
        return $err;
    }

    // single name
    if (is_string($lang)) {
        return isset($this->_lang_db[strtolower($lang)]);
    }

    // anything that is not a list of names is an error
    if (!is_array($lang)) {
        throw new Exception('Unknown type passed to languageExists()');
    }

    // list of names: the first unknown one settles the answer
    foreach ($lang as $candidate) {
        $known = isset($this->_lang_db[strtolower($candidate)]);
        if (!$known) {
            return false;
        }
    }

    return true;
}
387 | |||
/**
 * Lists the languages this object can currently detect.
 *
 * @access public
 * @return array keys of the language database, i.e. the language names
 * @throws Exception raised by _setup_ok() when the database is unusable
 */
function getLanguages()
{
    $ready = $this->_setup_ok($err);
    if (!$ready) {
        return $err;
    }

    return array_keys($this->_lang_db);
}
403 | |||
/**
 * Switches emulation of perl's Language::Guess on or off.
 *
 * Besides storing the flag, this adjusts the maximum score: it becomes
 * $this->_threshold when compatibility is on and 0 when it is off.
 * Non-boolean arguments are ignored.
 *
 * @access public
 * @param  bool $setting false to turn off perl compatibility
 */
function setPerlCompatible($setting = true)
{
    // input check: silently ignore anything that is not a real boolean
    if (!is_bool($setting)) {
        return;
    }

    $this->_perl_compatible = $setting;
    $this->_max_score       = $setting ? $this->_threshold : 0;
}
423 | |||
/**
 * Chooses whether detect() narrows the candidate languages by the unicode
 * blocks found in the sample.
 *
 * Should speed up most detections if turned on (default is on). In some
 * circumstances it may be slower, such as for large text samples (> 10K)
 * in languages that use latin scripts. Non-boolean arguments are ignored.
 *
 * @access public
 * @param  bool $setting false to turn off
 */
function useUnicodeBlocks($setting = true)
{
    if (!is_bool($setting)) {
        return;
    }

    $this->_use_unicode_narrowing = $setting;
}
441 | |||
/**
 * Converts a piece of text into trigram frequencies.
 *
 * Thin wrapper around Text_LanguageDetect_Parser: trigram analysis is
 * enabled, unicode analysis is disabled, and start padding is applied
 * unless perl compatibility is on.
 *
 * @access private
 * @param  string $text text to convert
 * @return array  trigram frequencies as reported by the parser
 */
function _trigram($text)
{
    $parser = new Text_LanguageDetect_Parser(
        $text,
        $this->_db_filename,
        $this->_unicode_db_filename
    );

    $parser->prepareTrigram();
    $parser->prepareUnicode(false);
    $parser->setPadStart(!$this->_perl_compatible);
    $parser->analyze();

    return $parser->getTrigramFreqs();
}
460 | |||
/**
 * Turns a trigram => frequency table into a trigram => rank table.
 *
 * The table is first ordered by _bub_sort() (which reorders the caller's
 * array, since it is passed by reference); ranks are then handed out
 * starting at 0, and the list is cut off once $this->_threshold entries
 * have been ranked.
 *
 * @access protected
 * @param  array &$arr trigram frequencies; reordered in place
 * @return array ranks of trigrams
 */
function _arr_rank(&$arr)
{
    // sorts alphabetically first as a standard way of breaking rank ties
    $this->_bub_sort($arr);

    $ordered = array_keys($arr);
    $total   = count($ordered);
    $ranks   = array();

    for ($position = 0; $position < $total; $position++) {
        $ranks[$ordered[$position]] = $position;

        // cut off at a standard threshold
        if ($position + 1 >= $this->_threshold) {
            break;
        }
    }

    return $ranks;
}
494 | |||
/**
 * Sorts an array by value, breaking ties by key.
 *
 * Mirrors this perl statement:
 * sort { $trigrams{$b} == $trigrams{$a} ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }
 *
 * Key and value have to take part in the comparison together, so the
 * array is turned into a list of (key, value) pairs, sorted with
 * _sort_func(), and rebuilt in the resulting order.
 *
 * @access private
 * @param  array &$arr the array to sort; replaced by the sorted copy
 */
function _bub_sort(&$arr)
{
    // pair every key with its value
    $pairs = array();
    foreach (array_keys($arr) as $original_key) {
        $pairs[] = array($original_key, $arr[$original_key]);
    }

    usort($pairs, array($this, '_sort_func'));

    // unpack the pairs again, now in sorted order
    $sorted = array();
    foreach ($pairs as $pair) {
        $sorted[$pair[0]] = $pair[1];
    }

    $arr = $sorted;
}
527 | |||
/**
 * Comparison callback handed to usort() by _bub_sort().
 *
 * Each argument is a (key, value) pair. Pairs are ordered by value,
 * largest first; pairs with equal values are ordered by strcmp() of their
 * keys.
 *
 * @access private
 * @param  array $a first (key, value) pair passed by usort()
 * @param  array $b second (key, value) pair passed by usort()
 * @return int   -1 if $a has the larger value, 1 if $b has; for equal
 *               values, the result of strcmp() on the two keys
 * @see    _bub_sort()
 */
function _sort_func($a, $b)
{
    $first_key    = $a[0];
    $first_value  = $a[1];
    $second_key   = $b[0];
    $second_value = $b[1];

    // different values: the larger one sorts first
    if ($first_value != $second_value) {
        return ($first_value > $second_value) ? -1 : 1;
    }

    // same value: fall back to the keys (0 only if the keys are identical)
    return strcmp($first_key, $second_key);
}
560 | |||
/**
 * Calculates a linear rank-order distance statistic between two sets of
 * ranked trigrams.
 *
 * For every trigram of the target set, adds the absolute difference
 * between its rank there and its rank in the reference set. A trigram
 * missing from the reference set costs $this->_threshold, which is
 * therefore the largest contribution any single trigram can make.
 *
 * This distance measure was proposed by Cavnar & Trenkle (1994).
 *
 * @access private
 * @param  array &$arr1 the reference set of trigram ranks
 * @param  array &$arr2 the target set of trigram ranks
 * @return int   the sum of the differences between the ranks of
 *               the two trigram sets
 */
function _distance(&$arr1, &$arr2)
{
    $total = 0;

    foreach ($arr2 as $trigram => $rank) {
        $total += isset($arr1[$trigram])
            ? abs($rank - $arr1[$trigram])
            : $this->_threshold;
    }

    // todo: there are other distance statistics to try, e.g. relative
    // entropy, but they're probably more costly to compute
    return $total;
}
598 | |||
/**
 * Normalizes the score returned by _distance().
 *
 * In perl compatible mode the result is the score divided by the trigram
 * count, rounded down. Otherwise that quotient is divided by
 * $this->_threshold and subtracted from 1.
 *
 * @access private
 * @param  int   $score      the score from _distance()
 * @param  int   $base_count the number of trigrams being considered;
 *                           defaults to $this->_threshold when null
 * @return float the normalized score
 * @see    _distance()
 */
function _normalize_score($score, $base_count = null)
{
    $count = ($base_count === null) ? $this->_threshold : $base_count;

    if ($this->_perl_compatible) {
        return floor($score / $count);
    }

    return 1 - ($score / $count / $this->_threshold);
}
622 | |||
623 | |||
/**
 * Detects the closeness of a sample of text to the known languages.
 *
 * Calculates the statistical difference between the text and
 * the trigrams for each language, normalizes the score, then
 * returns results for the candidate languages in sorted order.
 *
 * If perl compatible, the score is 300-0, 0 being most similar.
 * Otherwise, it's 0-1 with 1 being most similar.
 *
 * The $sample text should be at least a few sentences in length and
 * should be ascii-7 or utf8 encoded. If it is in another encoding and the
 * mbstring extension is present, a conversion to UTF-8 is attempted.
 * However, experience has shown that mb_detect_encoding() *does not work
 * very well* with at least some types of encoding.
 *
 * When unicode narrowing is enabled, only languages listed in
 * $this->_unicode_map for a unicode block present in the sample (and still
 * present in the language database) are scored.
 *
 * @access public
 * @param  string $sample a sample of text to compare.
 * @param  int    $limit  if specified, return an array of the most likely
 *                        $limit languages and their scores.
 * @return mixed  sorted array of language => score, or a blank array if no
 *                useable text was found
 * @see    _distance()
 * @throws Exception if the language database is unusable or the parser
 *                   does not return an array of unicode blocks
 */
function detect($sample, $limit = 0)
{
    // _setup_ok() as defined in this file throws instead of returning false
    if (!$this->_setup_ok($err)) {
        return $err;
    }

    // input check
    if (!Text_LanguageDetect_Parser::validateString($sample)) {
        return array();
    }

    // check char encoding
    // (only if mbstring extension is compiled and PHP > 4.0.6)
    if (function_exists('mb_detect_encoding')
        && function_exists('mb_convert_encoding')) {

        // mb_detect_encoding isn't very reliable, to say the least
        // detection should still work with a sufficient sample of ascii characters
        $encoding = mb_detect_encoding($sample);

        // mb_detect_encoding() will return FALSE if detection fails
        // don't attempt conversion if that's the case
        if ($encoding != 'ASCII' && $encoding != 'UTF-8' && $encoding !== false) {

            if (function_exists('mb_list_encodings')) {

                // verify the encoding exists in mb_list_encodings
                if (in_array($encoding, mb_list_encodings())) {
                    $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
                }

                // if the previous condition failed:
                // somehow we detected an encoding that we also don't support;
                // the sample is then analyzed unconverted

            } else {
                // php 4 doesn't have mb_list_encodings()
                // so attempt with error suppression
                $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding);
            }
        }
    }

    // analyze the sample: trigrams always, unicode blocks only if narrowing
    $sample_obj = new Text_LanguageDetect_Parser($sample, $this->_db_filename, $this->_unicode_db_filename);
    $sample_obj->prepareTrigram();
    if ($this->_use_unicode_narrowing) {
        $sample_obj->prepareUnicode();
    }
    $sample_obj->setPadStart(!$this->_perl_compatible);
    $sample_obj->analyze();

    // despite the variable name, these are the trigram ranks of the sample
    $trigram_freqs =& $sample_obj->getTrigramRanks();
    $trigram_count = count($trigram_freqs);

    // nothing to compare; this also keeps _normalize_score() from dividing
    // by a zero trigram count
    if ($trigram_count == 0) {
        return array();
    }

    $scores = array();

    // use unicode block detection to narrow down the possibilities
    if ($this->_use_unicode_narrowing) {
        $blocks =& $sample_obj->getUnicodeBlocks();

        if (is_array($blocks)) {
            $present_blocks = array_keys($blocks);
        } else {
            throw new Exception('Error during block detection');
        }

        $possible_langs = array();

        // union of all languages mapped to any block found in the sample
        foreach ($present_blocks as $blockname) {
            if (isset($this->_unicode_map[$blockname])) {

                $possible_langs = array_merge(
                    $possible_langs,
                    array_keys($this->_unicode_map[$blockname])
                );

                // todo: faster way to do this?
            }
        }

        // could also try an intersect operation rather than a union
        // in other words, choose languages whose trigrams contain
        // ALL of the unicode blocks found in this sample
        // would improve speed but would be completely thrown off by an
        // unexpected character, like an umlaut appearing in english text

        $possible_langs = array_intersect(
            array_keys($this->_lang_db),
            array_unique($possible_langs)
        );

        // needs to intersect it with the keys of _lang_db in case
        // languages have been omitted

    // or just try 'em all
    } else {
        $possible_langs = array_keys($this->_lang_db);
    }


    // score every candidate against the sample's trigram ranks
    foreach ($possible_langs as $lang) {
        $scores[$lang] =
            $this->_normalize_score(
                $this->_distance($this->_lang_db[$lang], $trigram_freqs),
                $trigram_count);
    }

    unset($sample_obj);

    // best match first: lowest score in perl mode, highest otherwise
    if ($this->_perl_compatible) {
        asort($scores);
    } else {
        arsort($scores);
    }

    // todo: drop languages with a score of $this->_max_score?

    // limit the number of returned scores
    if ($limit && is_numeric($limit)) {
        $limited_scores = array();

        $i = 0;

        foreach ($scores as $key => $value) {
            if ($i++ >= $limit) {
                break;
            }

            $limited_scores[$key] = $value;
        }

        return $limited_scores;
    } else {
        return $scores;
    }
}
789 | |||
/**
 * Returns only the most similar language to the text sample.
 *
 * Asks detect() for its single best result and returns that language name
 * with its first letter capitalized.
 *
 * @access public
 * @param  string $sample text to detect the language of
 * @return string the name of the most likely language
 *                or null if no language is similar
 * @see    detect()
 * @throws Exception propagated from detect()
 */
function detectSimple($sample)
{
    $top = $this->detect($sample, 1);

    if (!is_array($top) || empty($top)) {
        return null;
    }

    // if top language has the maximum possible score,
    // then the top score will have been picked at random
    if (current($top) == $this->_max_score) {
        return null;
    }

    return ucfirst(key($top));
}
818 | |||
/**
 * Returns an array containing the most similar language and a confidence
 * rating.
 *
 * The two best results of detect() are compared. In perl compatible mode
 * the confidence is the runner-up score minus the best score, divided by
 * $this->_max_score; otherwise it is the best score minus the runner-up
 * score. Languages that have closely related cousins (e.g. Norwegian and
 * Danish) should generally have lower confidence scores.
 *
 * The similarity score answers the question "How likely is the text the
 * returned language regardless of the other languages considered?" The
 * confidence score is one way of answering the question "how likely is the
 * text the detected language relative to the rest of the language model
 * set?"
 *
 * To see how similar languages are a priori, see languageSimilarity()
 *
 * @access public
 * @param  string $sample text for which language will be detected
 * @return array  'language', 'similarity' and 'confidence' entries
 *                ('confidence' is null when there is no runner-up),
 *                or null if no language is similar
 * @see    detect()
 * @throws Exception propagated from detect()
 */
function detectConfidence($sample)
{
    $scores = $this->detect($sample, 2);

    if (!is_array($scores) || empty($scores)) {
        return null;
    }

    // if most similar language has the max score, it
    // will have been picked at random
    $best = current($scores);
    if ($best == $this->_max_score) {
        return null;
    }

    $result = array(
        'language'   => ucfirst(key($scores)),
        'similarity' => $best,
    );

    // the goal is to return a higher value if the distance between
    // the similarity of the first score and the second score is high
    $runner_up = next($scores);

    if ($runner_up === false) {
        // false means there is no second element
        $result['confidence'] = null;
    } elseif ($this->_perl_compatible) {
        $result['confidence'] = ($runner_up - $best) / $this->_max_score;
    } else {
        $result['confidence'] = $best - $runner_up;
    }

    return $result;
}
880 | |||
/**
 * Returns the distribution of unicode blocks in a given utf8 string.
 *
 * For the block name of a single char, use unicodeBlockName().
 *
 * @access public
 * @param  string $str          input string. Must be ascii or utf8
 * @param  bool   $skip_symbols if true, skip ascii digits, symbols and
 *                              non-printing characters. Includes spaces,
 *                              newlines and common punctuation characters.
 * @return array  unicode blocks as reported by the parser
 * @throws Exception if $skip_symbols is not a boolean or $str is not a
 *                   string
 */
function detectUnicodeBlocks($str, $skip_symbols)
{
    // input check: the flag is validated before the text
    if (!is_bool($skip_symbols)) {
        throw new Exception('Second parameter must be boolean');
    }
    if (!is_string($str)) {
        throw new Exception('First parameter was not a string');
    }

    $parser = new Text_LanguageDetect_Parser(
        $str,
        $this->_db_filename,
        $this->_unicode_db_filename
    );

    // unicode analysis on, trigram analysis off
    $parser->prepareUnicode();
    $parser->prepareTrigram(false);
    $parser->setUnicodeSkipSymbols($skip_symbols);
    $parser->analyze();

    $found =& $parser->getUnicodeBlocks();
    unset($parser);

    return $found;
}
914 | |||
/**
 * Looks up the unicode block a single character belongs to
 *
 * A string argument is treated as one UTF8-encoded character and converted
 * to its numeric value first; an integer argument is used as the unicode
 * value directly. No other types are accepted.
 *
 * @access public
 * @param  mixed $unicode unicode value (int) or a single utf8 char (string)
 * @return mixed element 2 of the matching block database entry (the block
 *               name), or false if no block matches
 * @throws Exception if the input is neither string nor int, if a string
 *                   holds more than one character, or if the character
 *                   cannot be converted
 */
function unicodeBlockName($unicode)
{
    // reject everything that is neither a string nor an integer
    if (!is_string($unicode) && !is_int($unicode)) {
        throw new Exception('Input must be of type string or int.');
    }

    if (is_string($unicode)) {
        // a string must hold at most one (possibly multi-byte) character
        if ($this->utf8strlen($unicode) > 1) {
            throw new Exception('Pass this function only a single char');
        }

        $unicode = $this->_utf8char2unicode($unicode);

        // -1 is the converter's failure marker
        if ($unicode === -1) {
            throw new Exception('Malformatted char');
        }
    }

    $database =& $this->_read_unicode_block_db();
    $entry    = $this->_unicode_block_name($unicode, $database);

    // the search hands back -1 when nothing matched
    return ($entry === -1) ? false : $entry[2];
}
959 | |||
/**
 * Binary search over the unicode block database
 *
 * Finds the database entry whose range contains the given unicode value.
 * unicodeBlockName() is the public, input-checked front end; this function
 * performs no validation of its own.
 *
 * Each entry is read as array(lower bound, upper bound, ...); the entries
 * are presumably sorted by range, as a binary search requires.
 *
 * @access protected
 * @param  int   $unicode     the unicode value to look up
 * @param  array &$blocks     the block database
 * @param  int   $block_count number of entries in the database; pass it to
 *                            spare a count() call, or leave at -1
 * @return mixed the matching database entry, or -1 if none matches
 * @see    unicodeBlockName()
 */
function _unicode_block_name($unicode, &$blocks, $block_count = -1)
{
    // for a reference, see
    // http://www.unicode.org/Public/UNIDATA/Blocks.txt

    // entry 0 is tested on its own before searching: ascii is assumed to
    // be the most frequent case
    if ($unicode <= $blocks[0][1]) {
        return $blocks[0];
    }

    // only count the database when the caller did not supply its size
    $total = ($block_count != -1) ? $block_count : count($blocks);

    // entry 0 was handled above, so the search window starts at 1
    $lower = 1;
    $upper = $total - 1;

    while ($lower <= $upper) {
        $middle = (int) floor(($lower + $upper) / 2);

        if ($unicode < $blocks[$middle][0]) {
            // value lies below this entry: continue in the lower half
            $upper = $middle - 1;
            continue;
        }

        if ($unicode > $blocks[$middle][1]) {
            // value lies above this entry: continue in the upper half
            $lower = $middle + 1;
            continue;
        }

        // inside the entry's range
        return $blocks[$middle];
    }

    // nothing matched
    // todo: differentiate when it's out of range or when it falls
    // into an unassigned range?
    return -1;
}
1017 | |||
/**
 * Loads the unicode block database, reading it at most once
 *
 * The result of _readdb() is kept in a static variable and returned by
 * reference, so later calls reuse the already loaded data instead of
 * reading the file again.
 *
 * @access protected
 * @return array the database of unicode block definitions
 */
function &_read_unicode_block_db()
{
    // static: the block definitions never change, so one copy is enough
    static $cache = null;

    if ($cache === null) {
        $cache = $this->_readdb($this->_unicode_db_filename);
    }

    return $cache;
}
1037 | |||
/**
 * Calculate the similarities between the language models
 *
 * Use this function to see how similar languages are to each other.
 *
 * If passed 2 language names, returns the score of just those two.
 * If passed 1 language name, returns that language compared to all others,
 * sorted by score (asort).
 * If passed none, returns a two-dimensional array with every language
 * model compared to every other one.
 *
 * Language names are matched case-insensitively: both are lower-cased
 * before they are looked up in the language database.
 *
 * @access public
 * @param  string $lang1 the name of the first language to be compared
 * @param  string $lang2 the name of the second language to be compared
 * @return mixed  scores of every language compared,
 *                or the score of just the provided languages,
 *                or null if a supplied language does not exist or only
 *                the second language was supplied.
 *                If _setup_ok() reports a problem, the value it placed in
 *                $err is returned instead.
 */
function languageSimilarity($lang1 = null, $lang2 = null)
{
    if (!$this->_setup_ok($err)) {
        return $err;
    }

    if ($lang1 == null) {

        // can't only set the second param
        if ($lang2 != null) {
            return null;
        }

        // compare all languages to each other
        $return_arr = array();
        $names      = array_keys($this->_lang_db);

        foreach ($names as $first) {
            foreach ($names as $second) {

                // skip comparing languages to themselves
                if ($first == $second) {
                    continue;
                }

                if (isset($return_arr[$second][$first])) {
                    // don't re-calculate what's already been done:
                    // reuse the score of the mirrored pair
                    $return_arr[$first][$second] = $return_arr[$second][$first];
                } else {
                    $return_arr[$first][$second] = $this->_normalize_score(
                        $this->_distance(
                            $this->_lang_db[$first],
                            $this->_lang_db[$second]
                        )
                    );
                }
            }
        }

        return $return_arr;
    }

    // database keys are lower case, so normalize before the lookup
    $lang1 = strtolower($lang1);

    if (!isset($this->_lang_db[$lang1])) {
        return null;
    }

    if ($lang2 != null) {
        // normalize first, then check: checking the raw name would reject
        // e.g. "French" although "french" exists
        $lang2 = strtolower($lang2);

        if (!isset($this->_lang_db[$lang2])) {
            return null;
        }

        // compare just these two languages
        return $this->_normalize_score(
            $this->_distance(
                $this->_lang_db[$lang1],
                $this->_lang_db[$lang2]
            )
        );
    }

    // compare just $lang1 to all other languages
    $return_arr = array();
    foreach ($this->_lang_db as $key => $value) {
        if ($key != $lang1) { // don't compare a language to itself
            $return_arr[$key] = $this->_normalize_score(
                $this->_distance($this->_lang_db[$lang1], $value)
            );
        }
    }
    asort($return_arr);

    return $return_arr;
}
1140 | |||
/**
 * Cluster known languages according to languageSimilarity()
 *
 * WARNING: this method is EXPERIMENTAL. It is not recommended for common
 * use, and it may disappear or its functionality may change in future
 * releases without notice.
 *
 * Repeatedly merges the two most similar entries into one combined entry
 * ("a:b") until at most two entries are left, the similarity of the best
 * pair is 0, or 200 merge steps have been done. The result is cached in
 * $this->_clusters and returned from there on later calls.
 *
 * @access public
 * @return array language cluster data with the keys
 *               'open_forks' (entries left at the top level),
 *               'fork_data'  (one record per merge, keyed by merged name),
 *               'name_map'   (merged name => language whose model stands
 *                             in for the merged entry).
 *               'fork_data' and 'name_map' are empty arrays when no merge
 *               took place. If _setup_ok() reports a problem, the value it
 *               placed in $err is returned instead.
 * @throws Exception if a language is missing from the database or no best
 *                   pair can be determined
 * @see languageSimilarity()
 * @deprecated this function will eventually be removed and placed into
 *              the model generation class
 */
function clusterLanguages()
{
    // todo: set the maximum number of clusters

    // setup check
    if (!$this->_setup_ok($err)) {
        return $err;
    }

    // return cached result, if any
    if (isset($this->_clusters)) {
        return $this->_clusters;
    }

    $langs = array_keys($this->_lang_db);

    // full similarity matrix: $arr[a][b] is the score of a versus b
    $arr = $this->languageSimilarity();

    sort($langs);

    foreach ($langs as $lang) {
        if (!isset($this->_lang_db[$lang])) {
            throw new Exception("missing $lang!\n");
        }
    }

    // http://www.psychstat.missouristate.edu/multibook/mlt04m.html
    // re-key the list by language name so entries can be unset by name
    foreach ($langs as $old_key => $lang1) {
        $langs[$lang1] = $lang1;
        unset($langs[$old_key]);
    }

    // must exist even if the loop below never runs or stops on its first
    // pass; otherwise the result would be built from undefined variables
    $really_map  = array();
    $result_data = array();

    $i = 0;
    while (count($langs) > 2 && $i++ < 200) {
        // find the pair of remaining entries with the highest score
        $highest_score = -1;
        $highest_key1  = '';
        $highest_key2  = '';
        foreach ($langs as $lang1) {
            foreach ($langs as $lang2) {
                if ($lang1 != $lang2
                    && $arr[$lang1][$lang2] > $highest_score
                ) {
                    $highest_score = $arr[$lang1][$lang2];
                    $highest_key1  = $lang1;
                    $highest_key2  = $lang2;
                }
            }
        }

        if (!$highest_key1) {
            // should not ever happen
            throw new Exception("no highest key? (step: $i)");
        }

        if ($highest_score == 0) {
            // languages are perfectly dissimilar
            break;
        }

        // $highest_key1 and $highest_key2 are most similar
        $sum1 = array_sum($arr[$highest_key1]);
        $sum2 = array_sum($arr[$highest_key2]);

        // use the score for the one that is most similar to the rest of
        // the field as the score for the group
        // todo: could try averaging or "centroid" method instead
        // seems like that might make more sense
        // actually nearest neighbor may be better for binary searching

        // for "Complete Linkage"/"furthest neighbor"
        // sign should be <
        // for "Single Linkage"/"nearest neighbor" method
        // sign should be >
        // results seem to be pretty much the same with either method

        // figure out which to delete and which to replace
        if ($sum1 > $sum2) {
            $replaceme = $highest_key1;
            $deleteme  = $highest_key2;
        } else {
            $replaceme = $highest_key2;
            $deleteme  = $highest_key1;
        }

        $newkey = $replaceme . ':' . $deleteme;

        // $replaceme is most similar to remaining languages
        // replace $replaceme with '$newkey', deleting $deleteme

        // keep a record of which fork is really which language:
        // follow the chain of earlier merges down to a real language name
        $really_lang = $replaceme;
        while (isset($really_map[$really_lang])) {
            $really_lang = $really_map[$really_lang];
        }
        $really_map[$newkey] = $really_lang;

        // replace the best fitting key, delete the other
        // (foreach walks the arrays as they were before this pass, while
        // the writes below go to the live $arr)
        foreach ($arr as $key1 => $arr2) {
            foreach ($arr2 as $key2 => $value2) {
                if ($key2 == $replaceme) {
                    // replacing $arr[$key1][$key2] with $arr[$key1][$newkey]
                    $arr[$key1][$newkey] = $arr[$key1][$key2];
                    unset($arr[$key1][$key2]);
                }

                if ($key1 == $replaceme) {
                    // replacing $arr[$key1][$key2] with $arr[$newkey][$key2]
                    $arr[$newkey][$key2] = $arr[$key1][$key2];
                    unset($arr[$key1][$key2]);
                }

                if ($key1 == $deleteme || $key2 == $deleteme) {
                    // deleting $arr[$key1][$key2]
                    unset($arr[$key1][$key2]);
                }
            }
        }

        // the merged pair leaves the field, the combined entry joins it
        unset($langs[$highest_key1]);
        unset($langs[$highest_key2]);
        $langs[$newkey] = $newkey;

        // some of these may be overkill
        $result_data[$newkey] = array(
            'newkey'   => $newkey,
            'count'    => $i,
            'diff'     => abs($sum1 - $sum2),
            'score'    => $highest_score,
            'bestfit'  => $replaceme,
            'otherfit' => $deleteme,
            'really'   => $really_lang,
        );
    }

    $return_val = array(
        'open_forks' => $langs,
            // the top level of clusters
            // clusters that are mutually exclusive
            // or specified by a specific maximum

        'fork_data'  => $result_data,
            // data for each split

        'name_map'   => $really_map,
            // which cluster is really which language
            // using the nearest neighbor technique, the cluster
            // inherits all of the properties of its most-similar member
            // this keeps track
    );

    // saves the result in the object
    $this->_clusters = $return_val;

    return $return_val;
}
1317 | |||
1318 | |||
/**
 * Perform an intelligent detection based on clusterLanguages()
 *
 * WARNING: this method is EXPERIMENTAL. It is not recommended for common
 * use, and it may disappear or its functionality may change in future
 * releases without notice.
 *
 * This compares the sample text to the top level of clusters. It then
 * descends into the best matching cluster and compares the sample to the
 * two entries that were merged into it, and so on until it hits a leaf.
 *
 * This should find the language in considerably fewer compares
 * (the equivalent of a binary search), however clusterLanguages() is costly
 * and the loss of accuracy from this technique is significant.
 *
 * This method may need to be 'fuzzier' in order to become more accurate.
 *
 * This function could be more useful if the universe of possible languages
 * was very large, however in such cases some method of Bayesian inference
 * might be more helpful.
 *
 * @see clusterLanguages()
 * @access public
 * @param  string $str input string
 * @return array  language scores (only those compared); an empty array if
 *                the string fails validation or yields no trigrams
 */
function clusteredSearch($str)
{
    // input check
    if (!Text_LanguageDetect_Parser::validateString($str)) {
        return array();
    }

    // clusterLanguages() will return a cached result if possible
    // so it's safe to call it every time
    // NOTE(review): clusterLanguages() can also hand back the $err value of
    // _setup_ok(); that case is not handled here -- confirm what it holds
    $result = $this->clusterLanguages();

    $dendogram_start = $result['open_forks'];
    $dendogram_data  = $result['fork_data'];
    $dendogram_alias = $result['name_map'];

    $sample_obj = new Text_LanguageDetect_Parser(
        $str,
        $this->_db_filename,
        $this->_unicode_db_filename
    );
    $sample_obj->prepareTrigram();
    $sample_obj->setPadStart(!$this->_perl_compatible);
    $sample_obj->analyze();
    $sample_result = $sample_obj->getTrigramRanks();
    $sample_count  = count($sample_result);

    // input check
    if ($sample_count == 0) {
        return array();
    }

    // defined up front so an empty top level cannot leave it undefined
    $scores = array();

    // score the sample against every top level entry; merged entries are
    // represented by the language model recorded in the alias map
    foreach ($dendogram_start as $lang) {
        if (isset($dendogram_alias[$lang])) {
            $lang_key = $dendogram_alias[$lang];
        } else {
            $lang_key = $lang;
        }

        $scores[$lang] = $this->_normalize_score(
            $this->_distance($this->_lang_db[$lang_key], $sample_result),
            $sample_count
        );
    }

    if ($this->_perl_compatible) {
        asort($scores);
    } else {
        arsort($scores);
    }

    // of the starting forks, the first one after sorting is taken as the
    // most similar to the sample
    $cur_key = key($scores);

    // walk down the tree until an entry without merge data is reached
    while (isset($dendogram_data[$cur_key])) {
        $lang1 = $dendogram_data[$cur_key]['bestfit'];
        $lang2 = $dendogram_data[$cur_key]['otherfit'];

        foreach (array($lang1, $lang2) as $lang) {
            if (isset($dendogram_alias[$lang])) {
                $lang_key = $dendogram_alias[$lang];
            } else {
                $lang_key = $lang;
            }

            $scores[$lang] = $this->_normalize_score(
                $this->_distance($this->_lang_db[$lang_key], $sample_result),
                $sample_count
            );

            //todo: does not need to do same comparison again
        }

        // continue with whichever of the two scored higher
        // NOTE(review): the higher score wins even in perl compatible mode,
        // although the sorting above and the swap below treat lower scores
        // as better in that mode -- confirm against _normalize_score()
        if ($scores[$lang1] > $scores[$lang2]) {
            $cur_key = $lang1;
        } else {
            $cur_key = $lang2;
        }
    }

    // rather than sorting the result, preserve it so that you can see
    // which paths the algorithm decided to take along the tree

    // but sometimes the last item is only the second best; this needs at
    // least two scores, with a single one prev() would run off the array
    if (count($scores) > 1) {
        $last_score        = end($scores);
        $second_last_score = prev($scores); // pointer is now on this item

        if ($this->_perl_compatible) {
            $out_of_order = ($last_score > $second_last_score);
        } else {
            $out_of_order = ($last_score < $second_last_score);
        }

        if ($out_of_order) {
            $real_last_key = key($scores);

            // swaps the 2nd-to-last item for the last item
            unset($scores[$real_last_key]);
            $scores[$real_last_key] = $second_last_score;
        }
    }

    if (!$this->_perl_compatible) {
        $scores = array_reverse($scores, true);
        // second param requires php > 4.0.3
    }

    return $scores;
}
1461 | |||
/**
 * utf8-safe strlen()
 *
 * Returns the number of characters (not bytes) in a utf8 string.
 *
 * The count is taken with a byte-wise regular expression instead of
 * utf8_decode(), which is deprecated as of PHP 8.2. A lead byte together
 * with the continuation bytes (0x80-0xBF) that follow it counts as one
 * character; a continuation byte without a lead byte counts as one
 * character of its own, so malformed input is never counted as shorter
 * than one character per stray byte.
 *
 * @static
 * @access public
 * @param  string $str string to get the length of
 * @return int    number of chars
 */
function utf8strlen($str)
{
    // no "u" modifier: the pattern works on raw bytes on purpose, so
    // invalid utf8 cannot make the match fail
    $count = preg_match_all(
        '/[^\x80-\xBF][\x80-\xBF]*|[\x80-\xBF]/',
        (string) $str,
        $unused
    );

    // preg_match_all() yields false on an internal error; report 0 then
    return (int) $count;
}
1481 | |||
/**
 * Returns the unicode value of a utf8 char
 *
 * The byte length of the argument decides how it is decoded; the lead and
 * continuation bytes are not checked for validity. Bytes are read with
 * square brackets, as the curly brace offset syntax is no longer accepted
 * by PHP 8.
 *
 * @access protected
 * @param  string $char a utf8 (possibly multi-byte) char
 * @return int    unicode value, or -1 if the argument is not 1 to 4 bytes
 *                long
 */
function _utf8char2unicode($char)
{
    // strlen() here will actually get the binary length of a single char
    switch (strlen($char)) {

        // for a reference, see http://en.wikipedia.org/wiki/UTF-8

        case 1:
            // normal ASCII-7 byte
            // 0xxxxxxx --> 0xxxxxxx
            return ord($char[0]);

        case 2:
            // 2 byte unicode
            // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
            $z = (ord($char[0]) & 0x000001F) << 6;
            $x = (ord($char[1]) & 0x0000003F);

            return ($z | $x);

        case 3:
            // 3 byte unicode
            // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
            $z  = (ord($char[0]) & 0x0000000F) << 12;
            $x1 = (ord($char[1]) & 0x0000003F) << 6;
            $x2 = (ord($char[2]) & 0x0000003F);

            return ($z | $x1 | $x2);

        case 4:
            // 4 byte unicode
            // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
            // 000zzzzz xxxxxxxx xxxxxxxx
            $z1 = (ord($char[0]) & 0x00000007) << 18;
            $z2 = (ord($char[1]) & 0x0000003F) << 12;
            $x1 = (ord($char[2]) & 0x0000003F) << 6;
            $x2 = (ord($char[3]) & 0x0000003F);

            return ($z1 | $z2 | $x1 | $x2);

        default:
            // error: malformatted char? (empty or longer than 4 bytes)
            return -1;
    }
}
1534 | |||
/**
 * utf8-safe fast character iterator
 *
 * Gets the next character starting from $counter, which is then
 * incremented. For a multi-byte char the bytes are concatenated and
 * $counter is incremented by the number of bytes in the char.
 *
 * Bytes are read with square brackets, as the curly brace offset syntax
 * is no longer accepted by PHP 8.
 *
 * With $special_convert enabled:
 *  - ascii A-Z is lower-cased; every other ascii char except a-z, space
 *    and "'" becomes a space
 *  - two-byte chars with lead byte 195 (latin accented) or 208 (cyrillic)
 *    are lower-cased for the ranges handled below; lower-casing of
 *    non-ascii characters is still incomplete
 *
 * @access private
 * @param  string &$str            the string being iterated over
 * @param  int    &$counter        the iterator, will increment by reference
 * @param  bool   $special_convert whether to do special conversions
 * @return string|null the next (possibly multi-byte) char from $counter,
 *                     or null if the byte at $counter is not a valid lead
 *                     byte (only that single byte is consumed then)
 */
function _next_char(&$str, &$counter, $special_convert = false)
{
    $char = $str[$counter++];
    $ord  = ord($char);

    // for a description of the utf8 system see
    // http://www.phpclasses.org/browse/file/5131.html

    // normal ascii one byte char
    if ($ord <= 127) {

        // special conversions needed for this package
        // (that only apply to regular ascii characters)
        // lower case, and convert all non-alphanumeric characters
        // other than "'" to space
        if ($special_convert && $char != ' ' && $char != "'") {
            if ($ord >= 65 && $ord <= 90) { // A-Z
                $char = chr($ord + 32); // lower case
            } elseif ($ord < 97 || $ord > 122) { // NOT a-z
                $char = ' '; // convert to space
            }
        }

        return $char;

    // multi-byte chars
    } elseif ($ord >> 5 == 6) { // two-byte char
        $nextchar = $str[$counter++]; // get next byte

        // lower-casing of non-ascii characters is still incomplete

        if ($special_convert) {
            // lower case latin accented characters
            if ($ord == 195) {
                $nextord     = ord($nextchar);
                $nextord_adj = $nextord + 64;
                // for a reference, see
                // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html

                // À - Þ but not ×
                if ($nextord_adj >= 192
                    && $nextord_adj <= 222
                    && $nextord_adj != 215
                ) {
                    $nextchar = chr($nextord + 32);
                }

            // lower case cyrillic alphabet
            } elseif ($ord == 208) {
                $nextord = ord($nextchar);
                // if A - Pe
                if ($nextord >= 144 && $nextord <= 159) {
                    // lower case
                    $nextchar = chr($nextord + 32);

                // if Er - Ya
                } elseif ($nextord >= 160 && $nextord <= 175) {
                    // lower case: the lower case letters of this range
                    // have lead byte 209 instead of 208
                    $char     = chr(209); // == $ord++
                    $nextchar = chr($nextord - 32);
                }
            }
        }

        // tag on next byte
        return $char . $nextchar;

    } elseif ($ord >> 4 == 14) { // three-byte char

        // tag on next 2 bytes
        return $char . $str[$counter++] . $str[$counter++];

    } elseif ($ord >> 3 == 30) { // four-byte char

        // tag on next 3 bytes
        return $char . $str[$counter++] . $str[$counter++] . $str[$counter++];

    }

    // error? not a valid lead byte (e.g. a stray continuation byte);
    // nothing is returned for it, spelled out here instead of falling off
    // the end of the function
    return null;
}
1630 | |||
1631 | } | ||
1632 | |||
1633 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ | ||
1634 | |||
1635 | ?> | ||