aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc/3rdparty/libraries/language-detect
diff options
context:
space:
mode:
Diffstat (limited to 'inc/3rdparty/libraries/language-detect')
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect.php992
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php57
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php339
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php (renamed from inc/3rdparty/libraries/language-detect/Parser.php)19
4 files changed, 927 insertions, 480 deletions
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect.php b/inc/3rdparty/libraries/language-detect/LanguageDetect.php
index 09b11546..382d869c 100644
--- a/inc/3rdparty/libraries/language-detect/LanguageDetect.php
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect.php
@@ -6,23 +6,24 @@
6 * Attempts to detect the language of a sample of text by correlating ranked 6 * Attempts to detect the language of a sample of text by correlating ranked
7 * 3-gram frequencies to a table of 3-gram frequencies of known languages. 7 * 3-gram frequencies to a table of 3-gram frequencies of known languages.
8 * 8 *
9 * Implements a version of a technique originally proposed by Cavnar & Trenkle 9 * Implements a version of a technique originally proposed by Cavnar & Trenkle
10 * (1994): "N-Gram-Based Text Categorization" 10 * (1994): "N-Gram-Based Text Categorization"
11 * 11 *
12 * PHP versions 4 and 5 12 * PHP version 5
13 * 13 *
14 * @category Text 14 * @category Text
15 * @package Text_LanguageDetect 15 * @package Text_LanguageDetect
16 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> 16 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
17 * @copyright 2005-2006 Nicholas Pisarro 17 * @copyright 2005-2006 Nicholas Pisarro
18 * @license http://www.debian.org/misc/bsd.license BSD 18 * @license http://www.debian.org/misc/bsd.license BSD
19 * @version CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $ 19 * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $
20 * @link http://pear.php.net/package/Text_LanguageDetect/ 20 * @link http://pear.php.net/package/Text_LanguageDetect/
21 * @link http://langdetect.blogspot.com/ 21 * @link http://langdetect.blogspot.com/
22 */ 22 */
23 23
24//require_once 'PEAR.php'; 24require_once 'LanguageDetect/Exception.php';
25require_once 'Parser.php'; 25require_once 'LanguageDetect/Parser.php';
26require_once 'LanguageDetect/ISO639.php';
26 27
27/** 28/**
28 * Language detection class 29 * Language detection class
@@ -41,9 +42,10 @@ require_once 'Parser.php';
41 * 42 *
42 * echo "Supported languages:\n"; 43 * echo "Supported languages:\n";
43 * 44 *
44 * $langs = $l->getLanguages(); 45 * try {
45 * if (PEAR::isError($langs)) { 46 * $langs = $l->getLanguages();
46 * die($langs->getMessage()); 47 * } catch (Text_LanguageDetect_Exception $e) {
48 * die($e->getMessage());
47 * } 49 * }
48 * 50 *
49 * sort($langs); 51 * sort($langs);
@@ -54,38 +56,38 @@ require_once 'Parser.php';
54 * } 56 * }
55 * </code> 57 * </code>
56 * 58 *
57 * @category Text 59 * @category Text
58 * @package Text_LanguageDetect 60 * @package Text_LanguageDetect
59 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> 61 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
60 * @copyright 2005 Nicholas Pisarro 62 * @copyright 2005 Nicholas Pisarro
61 * @license http://www.debian.org/misc/bsd.license BSD 63 * @license http://www.debian.org/misc/bsd.license BSD
62 * @version Release: @package_version@ 64 * @version Release: @package_version@
63 * @todo allow users to generate their own language models 65 * @link http://pear.php.net/package/Text_LanguageDetect/
66 * @todo allow users to generate their own language models
64 */ 67 */
65
66class Text_LanguageDetect 68class Text_LanguageDetect
67{ 69{
68 /** 70 /**
69 * The filename that stores the trigram data for the detector 71 * The filename that stores the trigram data for the detector
70 * 72 *
71 * If this value starts with a slash (/) or a dot (.) the value of 73 * If this value starts with a slash (/) or a dot (.) the value of
72 * $this->_data_dir will be ignored 74 * $this->_data_dir will be ignored
73 * 75 *
74 * @var string 76 * @var string
75 * @access private 77 * @access private
76 */ 78 */
77 var $_db_filename = './lang.dat'; 79 var $_db_filename = 'lang.dat';
78 80
79 /** 81 /**
80 * The filename that stores the unicode block definitions 82 * The filename that stores the unicode block definitions
81 * 83 *
82 * If this value starts with a slash (/) or a dot (.) the value of 84 * If this value starts with a slash (/) or a dot (.) the value of
83 * $this->_data_dir will be ignored 85 * $this->_data_dir will be ignored
84 * 86 *
85 * @var string 87 * @var string
86 * @access private 88 * @access private
87 */ 89 */
88 var $_unicode_db_filename = './unicode_blocks.dat'; 90 var $_unicode_db_filename = 'unicode_blocks.dat';
89 91
90 /** 92 /**
91 * The data directory 93 * The data directory
@@ -99,11 +101,8 @@ class Text_LanguageDetect
99 101
100 /** 102 /**
101 * The trigram data for comparison 103 * The trigram data for comparison
102 *
103 * Will be loaded on start from $this->_db_filename
104 * 104 *
105 * May be set to a PEAR_Error object if there is an error during its 105 * Will be loaded on start from $this->_db_filename
106 * initialization
107 * 106 *
108 * @var array 107 * @var array
109 * @access private 108 * @access private
@@ -120,7 +119,7 @@ class Text_LanguageDetect
120 119
121 /** 120 /**
122 * The size of the trigram data arrays 121 * The size of the trigram data arrays
123 * 122 *
124 * @var int 123 * @var int
125 * @access private 124 * @access private
126 */ 125 */
@@ -140,7 +139,7 @@ class Text_LanguageDetect
140 139
141 /** 140 /**
142 * Whether or not to simulate perl's Language::Guess exactly 141 * Whether or not to simulate perl's Language::Guess exactly
143 * 142 *
144 * @access private 143 * @access private
145 * @var bool 144 * @var bool
146 * @see setPerlCompatible() 145 * @see setPerlCompatible()
@@ -165,18 +164,24 @@ class Text_LanguageDetect
165 var $_clusters; 164 var $_clusters;
166 165
167 /** 166 /**
167 * Which type of "language names" are accepted and returned:
168 *
169 * 0 - language name ("english")
170 * 2 - 2-letter ISO 639-1 code ("en")
171 * 3 - 3-letter ISO 639-2 code ("eng")
172 */
173 var $_name_mode = 0;
174
175 /**
168 * Constructor 176 * Constructor
169 * 177 *
170 * Will attempt to load the language database. If it fails, you will get 178 * Will attempt to load the language database. If it fails, you will get
171 * a PEAR_Error object returned when you try to use detect() 179 * an exception.
172 *
173 */ 180 */
174 function Text_LanguageDetect($db=null, $unicode_db=null) 181 function __construct()
175 { 182 {
176 if (isset($db)) $this->_db_filename = $db;
177 if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
178
179 $data = $this->_readdb($this->_db_filename); 183 $data = $this->_readdb($this->_db_filename);
184 $this->_checkTrigram($data['trigram']);
180 $this->_lang_db = $data['trigram']; 185 $this->_lang_db = $data['trigram'];
181 186
182 if (isset($data['trigram-unicodemap'])) { 187 if (isset($data['trigram-unicodemap'])) {
@@ -186,29 +191,32 @@ class Text_LanguageDetect
186 // Not yet implemented: 191 // Not yet implemented:
187 if (isset($data['trigram-clusters'])) { 192 if (isset($data['trigram-clusters'])) {
188 $this->_clusters = $data['trigram-clusters']; 193 $this->_clusters = $data['trigram-clusters'];
189 } 194 }
190 } 195 }
191 196
192 /** 197 /**
193 * Returns the path to the location of the database 198 * Returns the path to the location of the database
194 * 199 *
195 * @access private 200 * @param string $fname File name to load
196 * @return string expected path to the language model database 201 *
202 * @return string expected path to the language model database
203 * @access private
197 */ 204 */
198 function _get_data_loc($fname) 205 function _get_data_loc($fname)
199 { 206 {
200 return $fname; 207 return dirname(__FILE__).'/'.$fname;
201 } 208 }
202 209
203 /** 210 /**
204 * Loads the language trigram database from filename 211 * Loads the language trigram database from filename
205 * 212 *
206 * Trigram database should be a serialize()'d array 213 * Trigram database should be a serialize()'d array
207 * 214 *
208 * @access private 215 * @param string $fname the filename where the data is stored
209 * @param string $fname the filename where the data is stored 216 *
210 * @return array the language model data 217 * @return array the language model data
211 * @throws PEAR_Error 218 * @throws Text_LanguageDetect_Exception
219 * @access private
212 */ 220 */
213 function _readdb($fname) 221 function _readdb($fname)
214 { 222 {
@@ -217,79 +225,74 @@ class Text_LanguageDetect
217 225
218 // input check 226 // input check
219 if (!file_exists($fname)) { 227 if (!file_exists($fname)) {
220 throw new Exception('Language database does not exist.'); 228 throw new Text_LanguageDetect_Exception(
229 'Language database does not exist: ' . $fname,
230 Text_LanguageDetect_Exception::DB_NOT_FOUND
231 );
221 } elseif (!is_readable($fname)) { 232 } elseif (!is_readable($fname)) {
222 throw new Exception('Language database is not readable.'); 233 throw new Text_LanguageDetect_Exception(
234 'Language database is not readable: ' . $fname,
235 Text_LanguageDetect_Exception::DB_NOT_READABLE
236 );
223 } 237 }
224 238
225 if (function_exists('file_get_contents')) { 239 return unserialize(file_get_contents($fname));
226 return unserialize(file_get_contents($fname));
227 } else {
228 // if you don't have file_get_contents(),
229 // then this is the next fastest way
230 ob_start();
231 readfile($fname);
232 $contents = ob_get_contents();
233 ob_end_clean();
234 return unserialize($contents);
235 }
236 } 240 }
237 241
238 242
239 /** 243 /**
240 * Checks if this object is ready to detect languages 244 * Checks if this object is ready to detect languages
241 * 245 *
242 * @access private 246 * @param array $trigram Trigram data from database
243 * @param mixed &$err error object to be returned by reference, if any 247 *
244 * @return bool true if no errors 248 * @return void
249 * @access private
245 */ 250 */
246 function _setup_ok(&$err) 251 function _checkTrigram($trigram)
247 { 252 {
248 if (!is_array($this->_lang_db)) { 253 if (!is_array($trigram)) {
249 if (ini_get('magic_quotes_runtime')) { 254 if (ini_get('magic_quotes_runtime')) {
250 throw new Exception('Error loading database. Try turning magic_quotes_runtime off.'); 255 throw new Text_LanguageDetect_Exception(
251 } else { 256 'Error loading database. Try turning magic_quotes_runtime off.',
252 throw new Exception('Language database is not an array.'); 257 Text_LanguageDetect_Exception::MAGIC_QUOTES
258 );
253 } 259 }
254 return false; 260 throw new Text_LanguageDetect_Exception(
255 261 'Language database is not an array.',
256 } elseif (empty($this->_lang_db)) { 262 Text_LanguageDetect_Exception::DB_NOT_ARRAY
257 throw new Exception('Language database has no elements.'); 263 );
258 return false; 264 } elseif (empty($trigram)) {
259 265 throw new Text_LanguageDetect_Exception(
260 } else { 266 'Language database has no elements.',
261 return true; 267 Text_LanguageDetect_Exception::DB_EMPTY
268 );
262 } 269 }
263 } 270 }
264 271
265 /** 272 /**
266 * Omits languages 273 * Omits languages
267 * 274 *
268 * Pass this function the name of or an array of names of 275 * Pass this function the name of or an array of names of
269 * languages that you don't want considered 276 * languages that you don't want considered
270 * 277 *
271 * If you're only expecting a limited set of languages, this can greatly 278 * If you're only expecting a limited set of languages, this can greatly
272 * speed up processing 279 * speed up processing
273 * 280 *
274 * @access public 281 * @param mixed $omit_list language name or array of names to omit
275 * @param mixed $omit_list language name or array of names to omit 282 * @param bool $include_only if true will include (rather than
276 * @param bool $include_only if true will include (rather than 283 * exclude) only those in the list
277 * exclude) only those in the list 284 *
278 * @return int number of languages successfully deleted 285 * @return int number of languages successfully deleted
279 * @throws PEAR_Error 286 * @throws Text_LanguageDetect_Exception
280 */ 287 */
281 function omitLanguages($omit_list, $include_only = false) 288 public function omitLanguages($omit_list, $include_only = false)
282 { 289 {
283
284 // setup check
285 if (!$this->_setup_ok($err)) {
286 return $err;
287 }
288
289 $deleted = 0; 290 $deleted = 0;
290 291
291 // deleting the given languages 292 $omit_list = $this->_convertFromNameMode($omit_list);
293
292 if (!$include_only) { 294 if (!$include_only) {
295 // deleting the given languages
293 if (!is_array($omit_list)) { 296 if (!is_array($omit_list)) {
294 $omit_list = strtolower($omit_list); // case desensitize 297 $omit_list = strtolower($omit_list); // case desensitize
295 if (isset($this->_lang_db[$omit_list])) { 298 if (isset($this->_lang_db[$omit_list])) {
@@ -301,12 +304,12 @@ class Text_LanguageDetect
301 if (isset($this->_lang_db[$omit_lang])) { 304 if (isset($this->_lang_db[$omit_lang])) {
302 unset($this->_lang_db[$omit_lang]); 305 unset($this->_lang_db[$omit_lang]);
303 $deleted++; 306 $deleted++;
304 } 307 }
305 } 308 }
306 } 309 }
307 310
308 // deleting all except the given languages
309 } else { 311 } else {
312 // deleting all except the given languages
310 if (!is_array($omit_list)) { 313 if (!is_array($omit_list)) {
311 $omit_list = array($omit_list); 314 $omit_list = array($omit_list);
312 } 315 }
@@ -327,7 +330,7 @@ class Text_LanguageDetect
327 // reset the cluster cache if the number of languages changes 330 // reset the cluster cache if the number of languages changes
328 // this will then have to be recalculated 331 // this will then have to be recalculated
329 if (isset($this->_clusters) && $deleted > 0) { 332 if (isset($this->_clusters) && $deleted > 0) {
330 unset($this->_clusters); 333 $this->_clusters = null;
331 } 334 }
332 335
333 return $deleted; 336 return $deleted;
@@ -339,49 +342,40 @@ class Text_LanguageDetect
339 * 342 *
340 * @access public 343 * @access public
341 * @return int the number of languages 344 * @return int the number of languages
342 * @throws PEAR_Error 345 * @throws Text_LanguageDetect_Exception
343 */ 346 */
344 function getLanguageCount() 347 function getLanguageCount()
345 { 348 {
346 if (!$this->_setup_ok($err)) { 349 return count($this->_lang_db);
347 return $err;
348 } else {
349 return count($this->_lang_db);
350 }
351 } 350 }
352 351
353 /** 352 /**
354 * Returns true if a given language exists 353 * Checks if the language with the given name exists in the database
355 * 354 *
356 * If passed an array of names, will return true only if all exist 355 * @param mixed $lang Language name or array of language names
357 * 356 *
358 * @access public 357 * @return bool true if language model exists
359 * @param mixed $lang language name or array of language names
360 * @return bool true if language model exists
361 * @throws PEAR_Error
362 */ 358 */
363 function languageExists($lang) 359 public function languageExists($lang)
364 { 360 {
365 if (!$this->_setup_ok($err)) { 361 $lang = $this->_convertFromNameMode($lang);
366 return $err;
367 } else {
368 // string
369 if (is_string($lang)) {
370 return isset($this->_lang_db[strtolower($lang)]);
371
372 // array
373 } elseif (is_array($lang)) {
374 foreach ($lang as $test_lang) {
375 if (!isset($this->_lang_db[strtolower($test_lang)])) {
376 return false;
377 }
378 }
379 return true;
380 362
381 // other (error) 363 if (is_string($lang)) {
382 } else { 364 return isset($this->_lang_db[strtolower($lang)]);
383 throw new Exception('Unknown type passed to languageExists()'); 365
366 } elseif (is_array($lang)) {
367 foreach ($lang as $test_lang) {
368 if (!isset($this->_lang_db[strtolower($test_lang)])) {
369 return false;
370 }
384 } 371 }
372 return true;
373
374 } else {
375 throw new Text_LanguageDetect_Exception(
376 'Unsupported parameter type passed to languageExists()',
377 Text_LanguageDetect_Exception::PARAM_TYPE
378 );
385 } 379 }
386 } 380 }
387 381
@@ -389,25 +383,24 @@ class Text_LanguageDetect
389 * Returns the list of detectable languages 383 * Returns the list of detectable languages
390 * 384 *
391 * @access public 385 * @access public
392 * @return array the names of the languages known to this object 386 * @return array the names of the languages known to this object
393 * @throws PEAR_Error 387 * @throws Text_LanguageDetect_Exception
394 */ 388 */
395 function getLanguages() 389 function getLanguages()
396 { 390 {
397 if (!$this->_setup_ok($err)) { 391 return $this->_convertToNameMode(
398 return $err; 392 array_keys($this->_lang_db)
399 } else { 393 );
400 return array_keys($this->_lang_db);
401 }
402 } 394 }
403 395
404 /** 396 /**
405 * Make this object behave like Language::Guess 397 * Make this object behave like Language::Guess
406 * 398 *
407 * @access public 399 * @param bool $setting false to turn off perl compatibility
408 * @param bool $setting false to turn off perl compatibility 400 *
401 * @return void
409 */ 402 */
410 function setPerlCompatible($setting = true) 403 public function setPerlCompatible($setting = true)
411 { 404 {
412 if (is_bool($setting)) { // input check 405 if (is_bool($setting)) { // input check
413 $this->_perl_compatible = $setting; 406 $this->_perl_compatible = $setting;
@@ -422,6 +415,21 @@ class Text_LanguageDetect
422 } 415 }
423 416
424 /** 417 /**
418 * Sets the way how language names are accepted and returned.
419 *
420 * @param integer $name_mode One of the following modes:
421 * 0 - language name ("english")
422 * 2 - 2-letter ISO 639-1 code ("en")
423 * 3 - 3-letter ISO 639-2 code ("eng")
424 *
425 * @return void
426 */
427 function setNameMode($name_mode)
428 {
429 $this->_name_mode = $name_mode;
430 }
431
432 /**
425 * Whether to use unicode block ranges in detection 433 * Whether to use unicode block ranges in detection
426 * 434 *
427 * Should speed up most detections if turned on (default is on). In some 435 * Should speed up most detections if turned on (default is on). In some
@@ -429,10 +437,11 @@ class Text_LanguageDetect
429 * in languages that use latin scripts. In other cases it should speed up 437 * in languages that use latin scripts. In other cases it should speed up
430 * detection noticeably. 438 * detection noticeably.
431 * 439 *
432 * @access public 440 * @param bool $setting false to turn off
433 * @param bool $setting false to turn off 441 *
442 * @return void
434 */ 443 */
435 function useUnicodeBlocks($setting = true) 444 public function useUnicodeBlocks($setting = true)
436 { 445 {
437 if (is_bool($setting)) { 446 if (is_bool($setting)) {
438 $this->_use_unicode_narrowing = $setting; 447 $this->_use_unicode_narrowing = $setting;
@@ -442,15 +451,15 @@ class Text_LanguageDetect
442 /** 451 /**
443 * Converts a piece of text into trigrams 452 * Converts a piece of text into trigrams
444 * 453 *
445 * Superceded by the Text_LanguageDetect_Parser class 454 * @param string $text text to convert
446 * 455 *
447 * @access private 456 * @return array array of trigram frequencies
448 * @param string $text text to convert 457 * @access private
449 * @return array array of trigram frequencies 458 * @deprecated Superceded by the Text_LanguageDetect_Parser class
450 */ 459 */
451 function _trigram($text) 460 function _trigram($text)
452 { 461 {
453 $s = new Text_LanguageDetect_Parser($text, $this->_db_filename, $this->_unicode_db_filename); 462 $s = new Text_LanguageDetect_Parser($text);
454 $s->prepareTrigram(); 463 $s->prepareTrigram();
455 $s->prepareUnicode(false); 464 $s->prepareUnicode(false);
456 $s->setPadStart(!$this->_perl_compatible); 465 $s->setPadStart(!$this->_perl_compatible);
@@ -463,11 +472,12 @@ class Text_LanguageDetect
463 * 472 *
464 * Thresholds (cuts off) the list at $this->_threshold 473 * Thresholds (cuts off) the list at $this->_threshold
465 * 474 *
466 * @access protected 475 * @param array $arr array of trigram
467 * @param array $arr array of trgram 476 *
468 * @return array ranks of trigrams 477 * @return array ranks of trigrams
478 * @access protected
469 */ 479 */
470 function _arr_rank(&$arr) 480 function _arr_rank($arr)
471 { 481 {
472 482
473 // sorts alphabetically first as a standard way of breaking rank ties 483 // sorts alphabetically first as a standard way of breaking rank ties
@@ -494,14 +504,17 @@ class Text_LanguageDetect
494 504
495 /** 505 /**
496 * Sorts an array by value breaking ties alphabetically 506 * Sorts an array by value breaking ties alphabetically
497 * 507 *
498 * @access private 508 * @param array &$arr the array to sort
499 * @param array &$arr the array to sort 509 *
510 * @return void
511 * @access private
500 */ 512 */
501 function _bub_sort(&$arr) 513 function _bub_sort(&$arr)
502 { 514 {
503 // should do the same as this perl statement: 515 // should do the same as this perl statement:
504 // sort { $trigrams{$b} == $trigrams{$a} ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } 516 // sort { $trigrams{$b} == $trigrams{$a}
517 // ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }
505 518
506 // needs to sort by both key and value at once 519 // needs to sort by both key and value at once
507 // using the key to break ties for the value 520 // using the key to break ties for the value
@@ -528,13 +541,14 @@ class Text_LanguageDetect
528 /** 541 /**
529 * Sort function used by bubble sort 542 * Sort function used by bubble sort
530 * 543 *
531 * Callback function for usort(). 544 * Callback function for usort().
532 * 545 *
533 * @access private 546 * @param array $a first param passed by usort()
534 * @param array first param passed by usort() 547 * @param array $b second param passed by usort()
535 * @param array second param passed by usort() 548 *
536 * @return int 1 if $a is greater, -1 if not 549 * @return int 1 if $a is greater, -1 if not
537 * @see _bub_sort() 550 * @see _bub_sort()
551 * @access private
538 */ 552 */
539 function _sort_func($a, $b) 553 function _sort_func($a, $b)
540 { 554 {
@@ -542,12 +556,12 @@ class Text_LanguageDetect
542 list($a_key, $a_value) = $a; 556 list($a_key, $a_value) = $a;
543 list($b_key, $b_value) = $b; 557 list($b_key, $b_value) = $b;
544 558
545 // if the values are the same, break ties using the key
546 if ($a_value == $b_value) { 559 if ($a_value == $b_value) {
560 // if the values are the same, break ties using the key
547 return strcmp($a_key, $b_key); 561 return strcmp($a_key, $b_key);
548 562
549 // if not, just sort normally
550 } else { 563 } else {
564 // if not, just sort normally
551 if ($a_value > $b_value) { 565 if ($a_value > $b_value) {
552 return -1; 566 return -1;
553 } else { 567 } else {
@@ -559,23 +573,24 @@ class Text_LanguageDetect
559 } 573 }
560 574
561 /** 575 /**
562 * Calculates a linear rank-order distance statistic between two sets of 576 * Calculates a linear rank-order distance statistic between two sets of
563 * ranked trigrams 577 * ranked trigrams
564 * 578 *
565 * Sums the differences in rank for each trigram. If the trigram does not 579 * Sums the differences in rank for each trigram. If the trigram does not
566 * appear in both, consider it a difference of $this->_threshold. 580 * appear in both, consider it a difference of $this->_threshold.
567 * 581 *
568 * This distance measure was proposed by Cavnar & Trenkle (1994). Despite 582 * This distance measure was proposed by Cavnar & Trenkle (1994). Despite
569 * its simplicity it has been shown to be highly accurate for language 583 * its simplicity it has been shown to be highly accurate for language
570 * identification tasks. 584 * identification tasks.
571 * 585 *
572 * @access private 586 * @param array $arr1 the reference set of trigram ranks
573 * @param array $arr1 the reference set of trigram ranks 587 * @param array $arr2 the target set of trigram ranks
574 * @param array $arr2 the target set of trigram ranks 588 *
575 * @return int the sum of the differences between the ranks of 589 * @return int the sum of the differences between the ranks of
576 * the two trigram sets 590 * the two trigram sets
591 * @access private
577 */ 592 */
578 function _distance(&$arr1, &$arr2) 593 function _distance($arr1, $arr2)
579 { 594 {
580 $sumdist = 0; 595 $sumdist = 0;
581 596
@@ -598,14 +613,15 @@ class Text_LanguageDetect
598 613
599 /** 614 /**
600 * Normalizes the score returned by _distance() 615 * Normalizes the score returned by _distance()
601 * 616 *
602 * Different if perl compatible or not 617 * Different if perl compatible or not
603 * 618 *
604 * @access private 619 * @param int $score the score from _distance()
605 * @param int $score the score from _distance() 620 * @param int $base_count the number of trigrams being considered
606 * @param int $base_count the number of trigrams being considered 621 *
607 * @return float the normalized score 622 * @return float the normalized score
608 * @see _distance() 623 * @see _distance()
624 * @access private
609 */ 625 */
610 function _normalize_score($score, $base_count = null) 626 function _normalize_score($score, $base_count = null)
611 { 627 {
@@ -630,29 +646,24 @@ class Text_LanguageDetect
630 * 646 *
631 * If perl compatible, the score is 300-0, 0 being most similar. 647 * If perl compatible, the score is 300-0, 0 being most similar.
632 * Otherwise, it's 0-1 with 1 being most similar. 648 * Otherwise, it's 0-1 with 1 being most similar.
633 * 649 *
634 * The $sample text should be at least a few sentences in length; 650 * The $sample text should be at least a few sentences in length;
635 * should be ascii-7 or utf8 encoded, if another and the mbstring extension 651 * should be ascii-7 or utf8 encoded, if another and the mbstring extension
636 * is present it will try to detect and convert. However, experience has 652 * is present it will try to detect and convert. However, experience has
637 * shown that mb_detect_encoding() *does not work very well* with at least 653 * shown that mb_detect_encoding() *does not work very well* with at least
638 * some types of encoding. 654 * some types of encoding.
639 * 655 *
640 * @access public 656 * @param string $sample a sample of text to compare.
641 * @param string $sample a sample of text to compare. 657 * @param int $limit if specified, return an array of the most likely
642 * @param int $limit if specified, return an array of the most likely 658 * $limit languages and their scores.
643 * $limit languages and their scores. 659 *
644 * @return mixed sorted array of language scores, blank array if no 660 * @return mixed sorted array of language scores, blank array if no
645 * useable text was found, or PEAR_Error if error 661 * useable text was found
646 * with the object setup 662 * @see _distance()
647 * @see _distance() 663 * @throws Text_LanguageDetect_Exception
648 * @throws PEAR_Error
649 */ 664 */
650 function detect($sample, $limit = 0) 665 public function detect($sample, $limit = 0)
651 { 666 {
652 if (!$this->_setup_ok($err)) {
653 return $err;
654 }
655
656 // input check 667 // input check
657 if (!Text_LanguageDetect_Parser::validateString($sample)) { 668 if (!Text_LanguageDetect_Parser::validateString($sample)) {
658 return array(); 669 return array();
@@ -660,36 +671,27 @@ class Text_LanguageDetect
660 671
661 // check char encoding 672 // check char encoding
662 // (only if mbstring extension is compiled and PHP > 4.0.6) 673 // (only if mbstring extension is compiled and PHP > 4.0.6)
663 if (function_exists('mb_detect_encoding') 674 if (function_exists('mb_detect_encoding')
664 && function_exists('mb_convert_encoding')) { 675 && function_exists('mb_convert_encoding')
665 676 ) {
666 // mb_detect_encoding isn't very reliable, to say the least 677 // mb_detect_encoding isn't very reliable, to say the least
667 // detection should still work with a sufficient sample of ascii characters 678 // detection should still work with a sufficient sample
679 // of ascii characters
668 $encoding = mb_detect_encoding($sample); 680 $encoding = mb_detect_encoding($sample);
669 681
670 // mb_detect_encoding() will return FALSE if detection fails 682 // mb_detect_encoding() will return FALSE if detection fails
671 // don't attempt conversion if that's the case 683 // don't attempt conversion if that's the case
672 if ($encoding != 'ASCII' && $encoding != 'UTF-8' && $encoding !== false) { 684 if ($encoding != 'ASCII' && $encoding != 'UTF-8'
673 685 && $encoding !== false
674 if (function_exists('mb_list_encodings')) { 686 ) {
675 687 // verify the encoding exists in mb_list_encodings
676 // verify the encoding exists in mb_list_encodings 688 if (in_array($encoding, mb_list_encodings())) {
677 if (in_array($encoding, mb_list_encodings())) { 689 $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
678 $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
679 }
680
681 // if the previous condition failed:
682 // somehow we detected an encoding that also we don't support
683
684 } else {
685 // php 4 doesnt have mb_list_encodings()
686 // so attempt with error suppression
687 $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding);
688 } 690 }
689 } 691 }
690 } 692 }
691 693
692 $sample_obj = new Text_LanguageDetect_Parser($sample, $this->_db_filename, $this->_unicode_db_filename); 694 $sample_obj = new Text_LanguageDetect_Parser($sample);
693 $sample_obj->prepareTrigram(); 695 $sample_obj->prepareTrigram();
694 if ($this->_use_unicode_narrowing) { 696 if ($this->_use_unicode_narrowing) {
695 $sample_obj->prepareUnicode(); 697 $sample_obj->prepareUnicode();
@@ -713,7 +715,10 @@ class Text_LanguageDetect
713 if (is_array($blocks)) { 715 if (is_array($blocks)) {
714 $present_blocks = array_keys($blocks); 716 $present_blocks = array_keys($blocks);
715 } else { 717 } else {
716 throw new Exception('Error during block detection'); 718 throw new Text_LanguageDetect_Exception(
719 'Error during block detection',
720 Text_LanguageDetect_Exception::BLOCK_DETECTION
721 );
717 } 722 }
718 723
719 $possible_langs = array(); 724 $possible_langs = array();
@@ -731,30 +736,30 @@ class Text_LanguageDetect
731 } 736 }
732 737
733 // could also try an intersect operation rather than a union 738 // could also try an intersect operation rather than a union
734 // in other words, choose languages whose trigrams contain 739 // in other words, choose languages whose trigrams contain
735 // ALL of the unicode blocks found in this sample 740 // ALL of the unicode blocks found in this sample
736 // would improve speed but would be completely thrown off by an 741 // would improve speed but would be completely thrown off by an
737 // unexpected character, like an umlaut appearing in english text 742 // unexpected character, like an umlaut appearing in english text
738 743
739 $possible_langs = array_intersect( 744 $possible_langs = array_intersect(
740 array_keys($this->_lang_db), 745 array_keys($this->_lang_db),
741 array_unique($possible_langs) 746 array_unique($possible_langs)
742 ); 747 );
743 748
744 // needs to intersect it with the keys of _lang_db in case 749 // needs to intersect it with the keys of _lang_db in case
745 // languages have been omitted 750 // languages have been omitted
746 751
747 // or just try 'em all
748 } else { 752 } else {
753 // or just try 'em all
749 $possible_langs = array_keys($this->_lang_db); 754 $possible_langs = array_keys($this->_lang_db);
750 } 755 }
751 756
752 757
753 foreach ($possible_langs as $lang) { 758 foreach ($possible_langs as $lang) {
754 $scores[$lang] = 759 $scores[$lang] = $this->_normalize_score(
755 $this->_normalize_score( 760 $this->_distance($this->_lang_db[$lang], $trigram_freqs),
756 $this->_distance($this->_lang_db[$lang], $trigram_freqs), 761 $trigram_count
757 $trigram_count); 762 );
758 } 763 }
759 764
760 unset($sample_obj); 765 unset($sample_obj);
@@ -772,7 +777,6 @@ class Text_LanguageDetect
772 $limited_scores = array(); 777 $limited_scores = array();
773 778
774 $i = 0; 779 $i = 0;
775
776 foreach ($scores as $key => $value) { 780 foreach ($scores as $key => $value) {
777 if ($i++ >= $limit) { 781 if ($i++ >= $limit) {
778 break; 782 break;
@@ -781,9 +785,9 @@ class Text_LanguageDetect
781 $limited_scores[$key] = $value; 785 $limited_scores[$key] = $value;
782 } 786 }
783 787
784 return $limited_scores; 788 return $this->_convertToNameMode($limited_scores, true);
785 } else { 789 } else {
786 return $scores; 790 return $this->_convertToNameMode($scores, true);
787 } 791 }
788 } 792 }
789 793
@@ -791,35 +795,33 @@ class Text_LanguageDetect
791 * Returns only the most similar language to the text sample 795 * Returns only the most similar language to the text sample
792 * 796 *
793 * Calls $this->detect() and returns only the top result 797 * Calls $this->detect() and returns only the top result
794 * 798 *
795 * @access public 799 * @param string $sample text to detect the language of
796 * @param string $sample text to detect the language of 800 *
797 * @return string the name of the most likely language 801 * @return string the name of the most likely language
798 * or null if no language is similar 802 * or null if no language is similar
799 * @see detect() 803 * @see detect()
800 * @throws PEAR_Error 804 * @throws Text_LanguageDetect_Exception
801 */ 805 */
802 function detectSimple($sample) 806 public function detectSimple($sample)
803 { 807 {
804 $scores = $this->detect($sample, 1); 808 $scores = $this->detect($sample, 1);
805 809
806 // if top language has the maximum possible score, 810 // if top language has the maximum possible score,
807 // then the top score will have been picked at random 811 // then the top score will have been picked at random
808 if ( !is_array($scores) 812 if (!is_array($scores) || empty($scores)
809 || empty($scores) 813 || current($scores) == $this->_max_score
810 || current($scores) == $this->_max_score) { 814 ) {
811
812 return null; 815 return null;
813
814 } else { 816 } else {
815 return ucfirst(key($scores)); 817 return key($scores);
816 } 818 }
817 } 819 }
818 820
819 /** 821 /**
820 * Returns an array containing the most similar language and a confidence 822 * Returns an array containing the most similar language and a confidence
821 * rating 823 * rating
822 * 824 *
823 * Confidence is a simple measure calculated from the similarity score 825 * Confidence is a simple measure calculated from the similarity score
824 * minus the similarity score from the next most similar language 826 * minus the similarity score from the next most similar language
825 * divided by the highest possible score. Languages that have closely 827 * divided by the highest possible score. Languages that have closely
@@ -827,46 +829,43 @@ class Text_LanguageDetect
827 * confidence scores. 829 * confidence scores.
828 * 830 *
829 * The similarity score answers the question "How likely is the text the 831 * The similarity score answers the question "How likely is the text the
830 * returned language regardless of the other languages considered?" The 832 * returned language regardless of the other languages considered?" The
831 * confidence score is one way of answering the question "how likely is the 833 * confidence score is one way of answering the question "how likely is the
832 * text the detected language relative to the rest of the language model 834 * text the detected language relative to the rest of the language model
833 * set?" 835 * set?"
834 * 836 *
835 * To see how similar languages are a priori, see languageSimilarity() 837 * To see how similar languages are a priori, see languageSimilarity()
836 * 838 *
837 * @access public 839 * @param string $sample text for which language will be detected
838 * @param string $sample text for which language will be detected 840 *
839 * @return array most similar language, score and confidence rating 841 * @return array most similar language, score and confidence rating
840 * or null if no language is similar 842 * or null if no language is similar
841 * @see detect() 843 * @see detect()
842 * @throws PEAR_Error 844 * @throws Text_LanguageDetect_Exception
843 */ 845 */
844 function detectConfidence($sample) 846 public function detectConfidence($sample)
845 { 847 {
846 $scores = $this->detect($sample, 2); 848 $scores = $this->detect($sample, 2);
847 849
848 // if most similar language has the max score, it 850 // if most similar language has the max score, it
849 // will have been picked at random 851 // will have been picked at random
850 if ( !is_array($scores) 852 if (!is_array($scores) || empty($scores)
851 || empty($scores) 853 || current($scores) == $this->_max_score
852 || current($scores) == $this->_max_score) { 854 ) {
853
854 return null; 855 return null;
855 } 856 }
856 857
857 $arr['language'] = ucfirst(key($scores)); 858 $arr['language'] = key($scores);
858 $arr['similarity'] = current($scores); 859 $arr['similarity'] = current($scores);
859 if (next($scores) !== false) { // if false then no next element 860 if (next($scores) !== false) { // if false then no next element
860 // the goal is to return a higher value if the distance between 861 // the goal is to return a higher value if the distance between
861 // the similarity of the first score and the second score is high 862 // the similarity of the first score and the second score is high
862 863
863 if ($this->_perl_compatible) { 864 if ($this->_perl_compatible) {
864 865 $arr['confidence'] = (current($scores) - $arr['similarity'])
865 $arr['confidence'] = 866 / $this->_max_score;
866 (current($scores) - $arr['similarity']) / $this->_max_score;
867 867
868 } else { 868 } else {
869
870 $arr['confidence'] = $arr['similarity'] - current($scores); 869 $arr['confidence'] = $arr['similarity'] - current($scores);
871 870
872 } 871 }
@@ -882,32 +881,26 @@ class Text_LanguageDetect
882 * Returns the distribution of unicode blocks in a given utf8 string 881 * Returns the distribution of unicode blocks in a given utf8 string
883 * 882 *
884 * For the block name of a single char, use unicodeBlockName() 883 * For the block name of a single char, use unicodeBlockName()
885 * 884 *
886 * @access public 885 * @param string $str input string. Must be ascii or utf8
887 * @param string $str input string. Must be ascii or utf8 886 * @param bool $skip_symbols if true, skip ascii digits, symbols and
888 * @param bool $skip_symbols if true, skip ascii digits, symbols and 887 * non-printing characters. Includes spaces,
889 * non-printing characters. Includes spaces, 888 * newlines and common punctuation characters.
890 * newlines and common punctuation characters. 889 *
891 * @return array 890 * @return array
892 * @throws PEAR_Error 891 * @throws Text_LanguageDetect_Exception
893 */ 892 */
894 function detectUnicodeBlocks($str, $skip_symbols) 893 public function detectUnicodeBlocks($str, $skip_symbols)
895 { 894 {
896 // input check 895 $skip_symbols = (bool)$skip_symbols;
897 if (!is_bool($skip_symbols)) { 896 $str = (string)$str;
898 throw new Exception('Second parameter must be boolean');
899 }
900
901 if (!is_string($str)) {
902 throw new Exception('First parameter was not a string');
903 }
904 897
905 $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); 898 $sample_obj = new Text_LanguageDetect_Parser($str);
906 $sample_obj->prepareUnicode(); 899 $sample_obj->prepareUnicode();
907 $sample_obj->prepareTrigram(false); 900 $sample_obj->prepareTrigram(false);
908 $sample_obj->setUnicodeSkipSymbols($skip_symbols); 901 $sample_obj->setUnicodeSkipSymbols($skip_symbols);
909 $sample_obj->analyze(); 902 $sample_obj->analyze();
910 $blocks =& $sample_obj->getUnicodeBlocks(); 903 $blocks = $sample_obj->getUnicodeBlocks();
911 unset($sample_obj); 904 unset($sample_obj);
912 return $blocks; 905 return $blocks;
913 } 906 }
@@ -915,38 +908,37 @@ class Text_LanguageDetect
915 /** 908 /**
916 * Returns the block name for a given unicode value 909 * Returns the block name for a given unicode value
917 * 910 *
918 * If passed a string, will assume it is being passed a UTF8-formatted 911 * If passed a string, will assume it is being passed a UTF8-formatted
919 * character and will automatically convert. Otherwise it will assume it 912 * character and will automatically convert. Otherwise it will assume it
920 * is being passed a numeric unicode value. 913 * is being passed a numeric unicode value.
921 * 914 *
922 * Make sure input is of the correct type! 915 * Make sure input is of the correct type!
923 * 916 *
924 * @access public
925 * @param mixed $unicode unicode value or utf8 char 917 * @param mixed $unicode unicode value or utf8 char
918 *
926 * @return mixed the block name string or false if not found 919 * @return mixed the block name string or false if not found
927 * @throws PEAR_Error 920 * @throws Text_LanguageDetect_Exception
928 */ 921 */
929 function unicodeBlockName($unicode) { 922 public function unicodeBlockName($unicode)
923 {
930 if (is_string($unicode)) { 924 if (is_string($unicode)) {
931 // assume it is being passed a utf8 char, so convert it 925 // assume it is being passed a utf8 char, so convert it
932 926 if (self::utf8strlen($unicode) > 1) {
933 // input check 927 throw new Text_LanguageDetect_Exception(
934 if ($this->utf8strlen($unicode) > 1) { 928 'Pass a single char only to this method',
935 throw new Exception('Pass this function only a single char'); 929 Text_LanguageDetect_Exception::PARAM_TYPE
930 );
936 } 931 }
937
938 $unicode = $this->_utf8char2unicode($unicode); 932 $unicode = $this->_utf8char2unicode($unicode);
939 933
940 if ($unicode == -1) {
941 throw new Exception('Malformatted char');
942 }
943
944 // input check
945 } elseif (!is_int($unicode)) { 934 } elseif (!is_int($unicode)) {
946 throw new Exception('Input must be of type string or int.'); 935 throw new Text_LanguageDetect_Exception(
936 'Input must be of type string or int.',
937 Text_LanguageDetect_Exception::PARAM_TYPE
938 );
947 } 939 }
948 940
949 $blocks =& $this->_read_unicode_block_db(); 941 $blocks = $this->_read_unicode_block_db();
950 942
951 $result = $this->_unicode_block_name($unicode, $blocks); 943 $result = $this->_unicode_block_name($unicode, $blocks);
952 944
@@ -964,14 +956,17 @@ class Text_LanguageDetect
964 * the public interface for this function, which does input checks which 956 * the public interface for this function, which does input checks which
965 * this function omits for speed. 957 * this function omits for speed.
966 * 958 *
967 * @access protected 959 * @param int $unicode the unicode value
968 * @param int $unicode the unicode value 960 * @param array $blocks the block database
969 * @param array &$blocks the block database 961 * @param int $block_count the number of defined blocks in the database
970 * @param int $block_count the number of defined blocks in the database 962 *
971 * @see unicodeBlockName() 963 * @return mixed Block name, -1 if it failed
964 * @see unicodeBlockName()
965 * @access protected
972 */ 966 */
973 function _unicode_block_name($unicode, &$blocks, $block_count = -1) { 967 function _unicode_block_name($unicode, $blocks, $block_count = -1)
974 // for a reference, see 968 {
969 // for a reference, see
975 // http://www.unicode.org/Public/UNIDATA/Blocks.txt 970 // http://www.unicode.org/Public/UNIDATA/Blocks.txt
976 971
977 // assume that ascii characters are the most common 972 // assume that ascii characters are the most common
@@ -994,35 +989,36 @@ class Text_LanguageDetect
994 while ($low <= $high) { 989 while ($low <= $high) {
995 $mid = floor(($low + $high) / 2); 990 $mid = floor(($low + $high) / 2);
996 991
997 // if it's lower than the lower bound
998 if ($unicode < $blocks[$mid][0]) { 992 if ($unicode < $blocks[$mid][0]) {
993 // if it's lower than the lower bound
999 $high = $mid - 1; 994 $high = $mid - 1;
1000 995
1001 // if it's higher than the upper bound
1002 } elseif ($unicode > $blocks[$mid][1]) { 996 } elseif ($unicode > $blocks[$mid][1]) {
997 // if it's higher than the upper bound
1003 $low = $mid + 1; 998 $low = $mid + 1;
1004 999
1005 // found it
1006 } else { 1000 } else {
1001 // found it
1007 return $blocks[$mid]; 1002 return $blocks[$mid];
1008 } 1003 }
1009 } 1004 }
1010 1005
1011 // failed to find the block 1006 // failed to find the block
1012 return -1; 1007 return -1;
1013 1008
1014 // todo: differentiate when it's out of range or when it falls 1009 // todo: differentiate when it's out of range or when it falls
1015 // into an unassigned range? 1010 // into an unassigned range?
1016 } 1011 }
1017 1012
1018 /** 1013 /**
1019 * Brings up the unicode block database 1014 * Brings up the unicode block database
1020 * 1015 *
1021 * @access protected
1022 * @return array the database of unicode block definitions 1016 * @return array the database of unicode block definitions
1023 * @throws PEAR_Error 1017 * @throws Text_LanguageDetect_Exception
1018 * @access protected
1024 */ 1019 */
1025 function &_read_unicode_block_db() { 1020 function _read_unicode_block_db()
1021 {
1026 // since the unicode definitions are always going to be the same, 1022 // since the unicode definitions are always going to be the same,
1027 // might as well share the memory for the db with all other instances 1023 // might as well share the memory for the db with all other instances
1028 // of this class 1024 // of this class
@@ -1037,29 +1033,27 @@ class Text_LanguageDetect
1037 1033
1038 /** 1034 /**
1039 * Calculate the similarities between the language models 1035 * Calculate the similarities between the language models
1040 * 1036 *
1041 * Use this function to see how similar languages are to each other. 1037 * Use this function to see how similar languages are to each other.
1042 * 1038 *
1043 * If passed 2 language names, will return just those languages compared. 1039 * If passed 2 language names, will return just those languages compared.
1044 * If passed 1 language name, will return that language compared to 1040 * If passed 1 language name, will return that language compared to
1045 * all others. 1041 * all others.
1046 * If passed none, will return an array of every language model compared 1042 * If passed none, will return an array of every language model compared
1047 * to every other one. 1043 * to every other one.
1048 * 1044 *
1049 * @access public 1045 * @param string $lang1 the name of the first language to be compared
1050 * @param string $lang1 the name of the first language to be compared 1046 * @param string $lang2 the name of the second language to be compared
1051 * @param string $lang2 the name of the second language to be compared 1047 *
1052 * @return array scores of every language compared 1048 * @return array scores of every language compared
1053 * or the score of just the provided languages 1049 * or the score of just the provided languages
1054 * or null if one of the supplied languages does not exist 1050 * or null if one of the supplied languages does not exist
1055 * @throws PEAR_Error 1051 * @throws Text_LanguageDetect_Exception
1056 */ 1052 */
1057 function languageSimilarity($lang1 = null, $lang2 = null) 1053 public function languageSimilarity($lang1 = null, $lang2 = null)
1058 { 1054 {
1059 if (!$this->_setup_ok($err)) { 1055 $lang1 = $this->_convertFromNameMode($lang1);
1060 return $err; 1056 $lang2 = $this->_convertFromNameMode($lang2);
1061 }
1062
1063 if ($lang1 != null) { 1057 if ($lang1 != null) {
1064 $lang1 = strtolower($lang1); 1058 $lang1 = strtolower($lang1);
1065 1059
@@ -1069,12 +1063,8 @@ class Text_LanguageDetect
1069 } 1063 }
1070 1064
1071 if ($lang2 != null) { 1065 if ($lang2 != null) {
1072 1066 if (!isset($this->_lang_db[$lang2])) {
1073 // can't only set the second param 1067 // check if language model exists
1074 if ($lang1 == null) {
1075 return null;
1076 // check if language model exists
1077 } elseif (!isset($this->_lang_db[$lang2])) {
1078 return null; 1068 return null;
1079 } 1069 }
1080 1070
@@ -1088,14 +1078,15 @@ class Text_LanguageDetect
1088 ) 1078 )
1089 ); 1079 );
1090 1080
1091
1092 // compare just $lang1 to all languages
1093 } else { 1081 } else {
1082 // compare just $lang1 to all languages
1094 $return_arr = array(); 1083 $return_arr = array();
1095 foreach ($this->_lang_db as $key => $value) { 1084 foreach ($this->_lang_db as $key => $value) {
1096 if ($key != $lang1) { // don't compare a language to itself 1085 if ($key != $lang1) {
1086 // don't compare a language to itself
1097 $return_arr[$key] = $this->_normalize_score( 1087 $return_arr[$key] = $this->_normalize_score(
1098 $this->_distance($this->_lang_db[$lang1], $value)); 1088 $this->_distance($this->_lang_db[$lang1], $value)
1089 );
1099 } 1090 }
1100 } 1091 }
1101 asort($return_arr); 1092 asort($return_arr);
@@ -1104,30 +1095,27 @@ class Text_LanguageDetect
1104 } 1095 }
1105 1096
1106 1097
1107 // compare all languages to each other
1108 } else { 1098 } else {
1099 // compare all languages to each other
1109 $return_arr = array(); 1100 $return_arr = array();
1110 foreach (array_keys($this->_lang_db) as $lang1) { 1101 foreach (array_keys($this->_lang_db) as $lang1) {
1111 foreach (array_keys($this->_lang_db) as $lang2) { 1102 foreach (array_keys($this->_lang_db) as $lang2) {
1112
1113 // skip comparing languages to themselves 1103 // skip comparing languages to themselves
1114 if ($lang1 != $lang2) { 1104 if ($lang1 != $lang2) {
1115
1116 // don't re-calculate what's already been done
1117 if (isset($return_arr[$lang2][$lang1])) {
1118 1105
1119 $return_arr[$lang1][$lang2] = 1106 if (isset($return_arr[$lang2][$lang1])) {
1120 $return_arr[$lang2][$lang1]; 1107 // don't re-calculate what's already been done
1108 $return_arr[$lang1][$lang2]
1109 = $return_arr[$lang2][$lang1];
1121 1110
1122 // calculate
1123 } else { 1111 } else {
1124 1112 // calculate
1125 $return_arr[$lang1][$lang2] = 1113 $return_arr[$lang1][$lang2]
1126 $this->_normalize_score( 1114 = $this->_normalize_score(
1127 $this->_distance( 1115 $this->_distance(
1128 $this->_lang_db[$lang1], 1116 $this->_lang_db[$lang1],
1129 $this->_lang_db[$lang2] 1117 $this->_lang_db[$lang2]
1130 ) 1118 )
1131 ); 1119 );
1132 1120
1133 } 1121 }
@@ -1150,20 +1138,14 @@ class Text_LanguageDetect
1150 * 1138 *
1151 * @access public 1139 * @access public
1152 * @return array language cluster data 1140 * @return array language cluster data
1153 * @throws PEAR_Error 1141 * @throws Text_LanguageDetect_Exception
1154 * @see languageSimilarity() 1142 * @see languageSimilarity()
1155 * @deprecated this function will eventually be removed and placed into 1143 * @deprecated this function will eventually be removed and placed into
1156 * the model generation class 1144 * the model generation class
1157 */ 1145 */
1158 function clusterLanguages() 1146 function clusterLanguages()
1159 { 1147 {
1160 // todo: set the maximum number of clusters 1148 // todo: set the maximum number of clusters
1161
1162 // setup check
1163 if (!$this->_setup_ok($err)) {
1164 return $err;
1165 }
1166
1167 // return cached result, if any 1149 // return cached result, if any
1168 if (isset($this->_clusters)) { 1150 if (isset($this->_clusters)) {
1169 return $this->_clusters; 1151 return $this->_clusters;
@@ -1177,7 +1159,10 @@ class Text_LanguageDetect
1177 1159
1178 foreach ($langs as $lang) { 1160 foreach ($langs as $lang) {
1179 if (!isset($this->_lang_db[$lang])) { 1161 if (!isset($this->_lang_db[$lang])) {
1180 throw new Exception("missing $lang!\n"); 1162 throw new Text_LanguageDetect_Exception(
1163 "missing $lang!",
1164 Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
1165 );
1181 } 1166 }
1182 } 1167 }
1183 1168
@@ -1186,7 +1171,9 @@ class Text_LanguageDetect
1186 $langs[$lang1] = $lang1; 1171 $langs[$lang1] = $lang1;
1187 unset($langs[$old_key]); 1172 unset($langs[$old_key]);
1188 } 1173 }
1189 1174
1175 $result_data = $really_map = array();
1176
1190 $i = 0; 1177 $i = 0;
1191 while (count($langs) > 2 && $i++ < 200) { 1178 while (count($langs) > 2 && $i++ < 200) {
1192 $highest_score = -1; 1179 $highest_score = -1;
@@ -1194,18 +1181,22 @@ class Text_LanguageDetect
1194 $highest_key2 = ''; 1181 $highest_key2 = '';
1195 foreach ($langs as $lang1) { 1182 foreach ($langs as $lang1) {
1196 foreach ($langs as $lang2) { 1183 foreach ($langs as $lang2) {
1197 if ( $lang1 != $lang2 1184 if ($lang1 != $lang2
1198 && $arr[$lang1][$lang2] > $highest_score) { 1185 && $arr[$lang1][$lang2] > $highest_score
1186 ) {
1199 $highest_score = $arr[$lang1][$lang2]; 1187 $highest_score = $arr[$lang1][$lang2];
1200 $highest_key1 = $lang1; 1188 $highest_key1 = $lang1;
1201 $highest_key2 = $lang2; 1189 $highest_key2 = $lang2;
1202 } 1190 }
1203 } 1191 }
1204 } 1192 }
1205 1193
1206 if (!$highest_key1) { 1194 if (!$highest_key1) {
1207 // should not ever happen 1195 // should not ever happen
1208 throw new Exception("no highest key? (step: $i)"); 1196 throw new Text_LanguageDetect_Exception(
1197 "no highest key? (step: $i)",
1198 Text_LanguageDetect_Exception::NO_HIGHEST_KEY
1199 );
1209 } 1200 }
1210 1201
1211 if ($highest_score == 0) { 1202 if ($highest_score == 0) {
@@ -1217,7 +1208,7 @@ class Text_LanguageDetect
1217 $sum1 = array_sum($arr[$highest_key1]); 1208 $sum1 = array_sum($arr[$highest_key1]);
1218 $sum2 = array_sum($arr[$highest_key2]); 1209 $sum2 = array_sum($arr[$highest_key2]);
1219 1210
1220 // use the score for the one that is most similar to the rest of 1211 // use the score for the one that is most similar to the rest of
1221 // the field as the score for the group 1212 // the field as the score for the group
1222 // todo: could try averaging or "centroid" method instead 1213 // todo: could try averaging or "centroid" method instead
1223 // seems like that might make more sense 1214 // seems like that might make more sense
@@ -1248,7 +1239,7 @@ class Text_LanguageDetect
1248 $really_lang = $replaceme; 1239 $really_lang = $replaceme;
1249 while (isset($really_map[$really_lang])) { 1240 while (isset($really_map[$really_lang])) {
1250 $really_lang = $really_map[$really_lang]; 1241 $really_lang = $really_map[$really_lang];
1251 } 1242 }
1252 $really_map[$newkey] = $really_lang; 1243 $really_map[$newkey] = $really_lang;
1253 1244
1254 1245
@@ -1259,8 +1250,8 @@ class Text_LanguageDetect
1259 $arr[$key1][$newkey] = $arr[$key1][$key2]; 1250 $arr[$key1][$newkey] = $arr[$key1][$key2];
1260 unset($arr[$key1][$key2]); 1251 unset($arr[$key1][$key2]);
1261 // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] 1252 // replacing $arr[$key1][$key2] with $arr[$key1][$newkey]
1262 } 1253 }
1263 1254
1264 if ($key1 == $replaceme) { 1255 if ($key1 == $replaceme) {
1265 $arr[$newkey][$key2] = $arr[$key1][$key2]; 1256 $arr[$newkey][$key2] = $arr[$key1][$key2];
1266 unset($arr[$key1][$key2]); 1257 unset($arr[$key1][$key2]);
@@ -1273,7 +1264,7 @@ class Text_LanguageDetect
1273 } 1264 }
1274 } 1265 }
1275 } 1266 }
1276 1267
1277 1268
1278 unset($langs[$highest_key1]); 1269 unset($langs[$highest_key1]);
1279 unset($langs[$highest_key2]); 1270 unset($langs[$highest_key2]);
@@ -1293,7 +1284,7 @@ class Text_LanguageDetect
1293 } 1284 }
1294 1285
1295 $return_val = array( 1286 $return_val = array(
1296 'open_forks' => $langs, 1287 'open_forks' => $langs,
1297 // the top level of clusters 1288 // the top level of clusters
1298 // clusters that are mutually exclusive 1289 // clusters that are mutually exclusive
1299 // or specified by a specific maximum 1290 // or specified by a specific maximum
@@ -1323,11 +1314,11 @@ class Text_LanguageDetect
1323 * use, and it may disappear or its functionality may change in future 1314 * use, and it may disappear or its functionality may change in future
1324 * releases without notice. 1315 * releases without notice.
1325 * 1316 *
1326 * This compares the sample text to the top level of clusters. If the 1317 * This compares the sample text to the top level of clusters. If the
1327 * sample is similar to the cluster it will drop down and compare it to the 1318 * sample is similar to the cluster it will drop down and compare it to the
1328 * languages in the cluster, and so on until it hits a leaf node. 1319 * languages in the cluster, and so on until it hits a leaf node.
1329 * 1320 *
1330 * this should find the language in considerably fewer compares 1321 * this should find the language in considerably fewer compares
1331 * (the equivalent of a binary search), however clusterLanguages() is costly 1322 * (the equivalent of a binary search), however clusterLanguages() is costly
1332 * and the loss of accuracy from this technique is significant. 1323 * and the loss of accuracy from this technique is significant.
1333 * 1324 *
@@ -1337,15 +1328,14 @@ class Text_LanguageDetect
1337 * was very large, however in such cases some method of Bayesian inference 1328 * was very large, however in such cases some method of Bayesian inference
1338 * might be more helpful. 1329 * might be more helpful.
1339 * 1330 *
1340 * @see clusterLanguages() 1331 * @param string $str input string
1341 * @access public 1332 *
1342 * @param string $str input string 1333 * @return array language scores (only those compared)
1343 * @return array language scores (only those compared) 1334 * @throws Text_LanguageDetect_Exception
1344 * @throws PEAR_Error 1335 * @see clusterLanguages()
1345 */ 1336 */
1346 function clusteredSearch($str) 1337 public function clusteredSearch($str)
1347 { 1338 {
1348
1349 // input check 1339 // input check
1350 if (!Text_LanguageDetect_Parser::validateString($str)) { 1340 if (!Text_LanguageDetect_Parser::validateString($str)) {
1351 return array(); 1341 return array();
@@ -1359,7 +1349,7 @@ class Text_LanguageDetect
1359 $dendogram_data = $result['fork_data']; 1349 $dendogram_data = $result['fork_data'];
1360 $dendogram_alias = $result['name_map']; 1350 $dendogram_alias = $result['name_map'];
1361 1351
1362 $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); 1352 $sample_obj = new Text_LanguageDetect_Parser($str);
1363 $sample_obj->prepareTrigram(); 1353 $sample_obj->prepareTrigram();
1364 $sample_obj->setPadStart(!$this->_perl_compatible); 1354 $sample_obj->setPadStart(!$this->_perl_compatible);
1365 $sample_obj->analyze(); 1355 $sample_obj->analyze();
@@ -1372,7 +1362,7 @@ class Text_LanguageDetect
1372 } 1362 }
1373 1363
1374 $i = 0; // counts the number of steps 1364 $i = 0; // counts the number of steps
1375 1365
1376 foreach ($dendogram_start as $lang) { 1366 foreach ($dendogram_start as $lang) {
1377 if (isset($dendogram_alias[$lang])) { 1367 if (isset($dendogram_alias[$lang])) {
1378 $lang_key = $dendogram_alias[$lang]; 1368 $lang_key = $dendogram_alias[$lang];
@@ -1382,7 +1372,8 @@ class Text_LanguageDetect
1382 1372
1383 $scores[$lang] = $this->_normalize_score( 1373 $scores[$lang] = $this->_normalize_score(
1384 $this->_distance($this->_lang_db[$lang_key], $sample_result), 1374 $this->_distance($this->_lang_db[$lang_key], $sample_result),
1385 $sample_count); 1375 $sample_count
1376 );
1386 1377
1387 $i++; 1378 $i++;
1388 } 1379 }
@@ -1411,7 +1402,8 @@ class Text_LanguageDetect
1411 1402
1412 $scores[$lang] = $this->_normalize_score( 1403 $scores[$lang] = $this->_normalize_score(
1413 $this->_distance($this->_lang_db[$lang_key], $sample_result), 1404 $this->_distance($this->_lang_db[$lang_key], $sample_result),
1414 $sample_count); 1405 $sample_count
1406 );
1415 1407
1416 //todo: does not need to do same comparison again 1408 //todo: does not need to do same comparison again
1417 } 1409 }
@@ -1428,8 +1420,8 @@ class Text_LanguageDetect
1428 1420
1429 $diff = $scores[$cur_key] - $scores[$loser_key]; 1421 $diff = $scores[$cur_key] - $scores[$loser_key];
1430 1422
1431 // $cur_key ({$dendogram_alias[$cur_key]}) wins 1423 // $cur_key ({$dendogram_alias[$cur_key]}) wins
1432 // over $loser_key ({$dendogram_alias[$loser_key]}) 1424 // over $loser_key ({$dendogram_alias[$loser_key]})
1433 // with a difference of $diff 1425 // with a difference of $diff
1434 } 1426 }
1435 1427
@@ -1439,9 +1431,9 @@ class Text_LanguageDetect
1439 // which paths the algorithm decided to take along the tree 1431 // which paths the algorithm decided to take along the tree
1440 1432
1441 // but sometimes the last item is only the second highest 1433 // but sometimes the last item is only the second highest
1442 if ( ($this->_perl_compatible && (end($scores) > prev($scores))) 1434 if (($this->_perl_compatible && (end($scores) > prev($scores)))
1443 || (!$this->_perl_compatible && (end($scores) < prev($scores)))) { 1435 || (!$this->_perl_compatible && (end($scores) < prev($scores)))
1444 1436 ) {
1445 $real_last_score = current($scores); 1437 $real_last_score = current($scores);
1446 $real_last_key = key($scores); 1438 $real_last_key = key($scores);
1447 1439
@@ -1449,7 +1441,7 @@ class Text_LanguageDetect
1449 unset($scores[$real_last_key]); 1441 unset($scores[$real_last_key]);
1450 $scores[$real_last_key] = $real_last_score; 1442 $scores[$real_last_key] = $real_last_score;
1451 } 1443 }
1452 1444
1453 1445
1454 if (!$this->_perl_compatible) { 1446 if (!$this->_perl_compatible) {
1455 $scores = array_reverse($scores, true); 1447 $scores = array_reverse($scores, true);
@@ -1464,12 +1456,11 @@ class Text_LanguageDetect
1464 * 1456 *
1465 * Returns the number of characters (not bytes) in a utf8 string 1458 * Returns the number of characters (not bytes) in a utf8 string
1466 * 1458 *
1467 * @static 1459 * @param string $str string to get the length of
1468 * @access public 1460 *
1469 * @param string $str string to get the length of 1461 * @return int number of chars
1470 * @return int number of chars
1471 */ 1462 */
1472 function utf8strlen($str) 1463 public static function utf8strlen($str)
1473 { 1464 {
1474 // utf8_decode() will convert unknown chars to '?', which is actually 1465 // utf8_decode() will convert unknown chars to '?', which is actually
1475 // ideal for counting. 1466 // ideal for counting.
@@ -1482,53 +1473,45 @@ class Text_LanguageDetect
1482 /** 1473 /**
1483 * Returns the unicode value of a utf8 char 1474 * Returns the unicode value of a utf8 char
1484 * 1475 *
1485 * @access protected 1476 * @param string $char a utf8 (possibly multi-byte) char
1486 * @param string $char a utf8 (possibly multi-byte) char 1477 *
1487 * @return int unicode value or -1 if malformatted 1478 * @return int unicode value
1479 * @access protected
1480 * @link http://en.wikipedia.org/wiki/UTF-8
1488 */ 1481 */
1489 function _utf8char2unicode($char) { 1482 function _utf8char2unicode($char)
1490 1483 {
1491 // strlen() here will actually get the binary length of a single char 1484 // strlen() here will actually get the binary length of a single char
1492 switch (strlen($char)) { 1485 switch (strlen($char)) {
1493 1486 case 1:
1494 // for a reference, see http://en.wikipedia.org/wiki/UTF-8 1487 // normal ASCII-7 byte
1495 1488 // 0xxxxxxx --> 0xxxxxxx
1496 case 1: 1489 return ord($char{0});
1497 // normal ASCII-7 byte 1490
1498 // 0xxxxxxx --> 0xxxxxxx 1491 case 2:
1499 return ord($char{0}); 1492 // 2 byte unicode
1500 1493 // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
1501 case 2: 1494 $z = (ord($char{0}) & 0x000001F) << 6;
1502 // 2 byte unicode 1495 $x = (ord($char{1}) & 0x0000003F);
1503 // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx 1496 return ($z | $x);
1504 $z = (ord($char{0}) & 0x000001F) << 6; 1497
1505 $x = (ord($char{1}) & 0x0000003F); 1498 case 3:
1506 1499 // 3 byte unicode
1507 return ($z | $x); 1500 // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
1508 1501 $z = (ord($char{0}) & 0x0000000F) << 12;
1509 case 3: 1502 $x1 = (ord($char{1}) & 0x0000003F) << 6;
1510 // 3 byte unicode 1503 $x2 = (ord($char{2}) & 0x0000003F);
1511 // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx 1504 return ($z | $x1 | $x2);
1512 $z = (ord($char{0}) & 0x0000000F) << 12; 1505
1513 $x1 = (ord($char{1}) & 0x0000003F) << 6; 1506 case 4:
1514 $x2 = (ord($char{2}) & 0x0000003F); 1507 // 4 byte unicode
1515 1508 // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
1516 return ($z | $x1 | $x2); 1509 // 000zzzzz xxxxxxxx xxxxxxxx
1517 1510 $z1 = (ord($char{0}) & 0x00000007) << 18;
1518 case 4: 1511 $z2 = (ord($char{1}) & 0x0000003F) << 12;
1519 // 4 byte unicode 1512 $x1 = (ord($char{2}) & 0x0000003F) << 6;
1520 // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx --> 1513 $x2 = (ord($char{3}) & 0x0000003F);
1521 // 000zzzzz xxxxxxxx xxxxxxxx 1514 return ($z1 | $z2 | $x1 | $x2);
1522 $z1 = (ord($char{0}) & 0x00000007) << 18;
1523 $z2 = (ord($char{1}) & 0x0000003F) << 12;
1524 $x1 = (ord($char{2}) & 0x0000003F) << 6;
1525 $x2 = (ord($char{3}) & 0x0000003F);
1526
1527 return ($z1 | $z2 | $x1 | $x2);
1528
1529 default:
1530 // error: malformatted char?
1531 return -1;
1532 } 1515 }
1533 } 1516 }
1534 1517
@@ -1536,18 +1519,18 @@ class Text_LanguageDetect
1536 * utf8-safe fast character iterator 1519 * utf8-safe fast character iterator
1537 * 1520 *
1538 * Will get the next character starting from $counter, which will then be 1521 * Will get the next character starting from $counter, which will then be
1539 * incremented. If a multi-byte char the bytes will be concatenated and 1522 * incremented. If a multi-byte char the bytes will be concatenated and
1540 * $counter will be incremeted by the number of bytes in the char. 1523 * $counter will be incremeted by the number of bytes in the char.
1541 * 1524 *
1542 * @access private 1525 * @param string $str the string being iterated over
1543 * @param string &$str the string being iterated over 1526 * @param int &$counter the iterator, will increment by reference
1544 * @param int &$counter the iterator, will increment by reference 1527 * @param bool $special_convert whether to do special conversions
1545 * @param bool $special_convert whether to do special conversions 1528 *
1546 * @return char the next (possibly multi-byte) char from $counter 1529 * @return char the next (possibly multi-byte) char from $counter
1530 * @access private
1547 */ 1531 */
1548 function _next_char(&$str, &$counter, $special_convert = false) 1532 static function _next_char($str, &$counter, $special_convert = false)
1549 { 1533 {
1550
1551 $char = $str{$counter++}; 1534 $char = $str{$counter++};
1552 $ord = ord($char); 1535 $ord = ord($char);
1553 1536
@@ -1556,7 +1539,6 @@ class Text_LanguageDetect
1556 1539
1557 // normal ascii one byte char 1540 // normal ascii one byte char
1558 if ($ord <= 127) { 1541 if ($ord <= 127) {
1559
1560 // special conversions needed for this package 1542 // special conversions needed for this package
1561 // (that only apply to regular ascii characters) 1543 // (that only apply to regular ascii characters)
1562 // lower case, and convert all non-alphanumeric characters 1544 // lower case, and convert all non-alphanumeric characters
@@ -1571,8 +1553,8 @@ class Text_LanguageDetect
1571 1553
1572 return $char; 1554 return $char;
1573 1555
1574 // multi-byte chars
1575 } elseif ($ord >> 5 == 6) { // two-byte char 1556 } elseif ($ord >> 5 == 6) { // two-byte char
1557 // multi-byte chars
1576 $nextchar = $str{$counter++}; // get next byte 1558 $nextchar = $str{$counter++}; // get next byte
1577 1559
1578 // lower-casing of non-ascii characters is still incomplete 1560 // lower-casing of non-ascii characters is still incomplete
@@ -1582,27 +1564,27 @@ class Text_LanguageDetect
1582 if ($ord == 195) { 1564 if ($ord == 195) {
1583 $nextord = ord($nextchar); 1565 $nextord = ord($nextchar);
1584 $nextord_adj = $nextord + 64; 1566 $nextord_adj = $nextord + 64;
1585 // for a reference, see 1567 // for a reference, see
1586 // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html 1568 // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html
1587 1569
1588 // &Agrave; - &THORN; but not &times; 1570 // &Agrave; - &THORN; but not &times;
1589 if ( $nextord_adj >= 192 1571 if ($nextord_adj >= 192
1590 && $nextord_adj <= 222 1572 && $nextord_adj <= 222
1591 && $nextord_adj != 215) { 1573 && $nextord_adj != 215
1592 1574 ) {
1593 $nextchar = chr($nextord + 32); 1575 $nextchar = chr($nextord + 32);
1594 } 1576 }
1595 1577
1596 // lower case cyrillic alphabet
1597 } elseif ($ord == 208) { 1578 } elseif ($ord == 208) {
1579 // lower case cyrillic alphabet
1598 $nextord = ord($nextchar); 1580 $nextord = ord($nextchar);
1599 // if A - Pe 1581 // if A - Pe
1600 if ($nextord >= 144 && $nextord <= 159) { 1582 if ($nextord >= 144 && $nextord <= 159) {
1601 // lower case 1583 // lower case
1602 $nextchar = chr($nextord + 32); 1584 $nextchar = chr($nextord + 32);
1603 1585
1604 // if Er - Ya
1605 } elseif ($nextord >= 160 && $nextord <= 175) { 1586 } elseif ($nextord >= 160 && $nextord <= 175) {
1587 // if Er - Ya
1606 // lower case 1588 // lower case
1607 $char = chr(209); // == $ord++ 1589 $char = chr(209); // == $ord++
1608 $nextchar = chr($nextord - 32); 1590 $nextchar = chr($nextord - 32);
@@ -1611,12 +1593,11 @@ class Text_LanguageDetect
1611 } 1593 }
1612 1594
1613 // tag on next byte 1595 // tag on next byte
1614 return $char . $nextchar; 1596 return $char . $nextchar;
1615
1616 } elseif ($ord >> 4 == 14) { // three-byte char 1597 } elseif ($ord >> 4 == 14) { // three-byte char
1617 1598
1618 // tag on next 2 bytes 1599 // tag on next 2 bytes
1619 return $char . $str{$counter++} . $str{$counter++}; 1600 return $char . $str{$counter++} . $str{$counter++};
1620 1601
1621 } elseif ($ord >> 3 == 30) { // four-byte char 1602 } elseif ($ord >> 3 == 30) { // four-byte char
1622 1603
@@ -1628,8 +1609,85 @@ class Text_LanguageDetect
1628 } 1609 }
1629 } 1610 }
1630 1611
1631} 1612 /**
1613 * Converts a $lang input parameter from the configured name mode
1614 * to the language name that is used internally.
1615 *
1616 * Works for strings and arrays.
1617 *
1618 * @param string|array $lang A language description ("english"/"en"/"eng")
1619 * @param boolean $convertKey If $lang is an array, setting $convertKey
1620 * converts the keys instead of the values.
1621 *
1622 * @return string|array Language name
1623 */
function _convertFromNameMode($lang, $convertKey = false)
{
    // Name mode 0: values are passed through untouched.
    if ($this->_name_mode == 0) {
        return $lang;
    }

    // Name mode 2 selects the two-letter lookup; every other non-zero
    // mode uses the three-letter lookup.
    $lookup = ($this->_name_mode == 2) ? 'code2ToName' : 'code3ToName';

    // Scalar case: translate the single code. The (string) cast turns a
    // null lookup result into an empty string.
    if (is_string($lang)) {
        return (string)Text_LanguageDetect_ISO639::$lookup($lang);
    }

    // Array case: translate either every key or every value,
    // depending on $convertKey.
    $converted = array();
    foreach ($lang as $index => $item) {
        if (!$convertKey) {
            $converted[$index] = (string)Text_LanguageDetect_ISO639::$lookup($item);
            continue;
        }

        $name = (string)Text_LanguageDetect_ISO639::$lookup($index);
        $converted[$name] = $item;
    }

    return $converted;
}
1632 1651
1633/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ 1652 /**
1653 * Converts a $lang output parameter from the language name that is
1654 * used internally to the configured name mode.
1655 *
1656 * Works for strings and arrays.
1657 *
1658 * @param string|array $lang Internal language name(s), e.g. "english"
1659 * @param boolean $convertKey If $lang is an array, setting $convertKey
1660 * converts the keys instead of the values.
1661 *
1662 * @return string|array Language name or code in the configured mode
1663 */
function _convertToNameMode($lang, $convertKey = false)
{
    // Name mode 0: values are passed through untouched.
    if ($this->_name_mode == 0) {
        return $lang;
    }

    // Name mode 2 selects the two-letter lookup; every other non-zero
    // mode uses the three-letter lookup.
    $lookup = ($this->_name_mode == 2) ? 'nameToCode2' : 'nameToCode3';

    // Scalar case: the lookup result is returned as-is (no string cast).
    if (is_string($lang)) {
        return Text_LanguageDetect_ISO639::$lookup($lang);
    }

    // Array case: translate either every key or every value,
    // depending on $convertKey.
    $converted = array();
    foreach ($lang as $index => $item) {
        if (!$convertKey) {
            $converted[$index] = Text_LanguageDetect_ISO639::$lookup($item);
            continue;
        }

        // NOTE(review): if the lookup yields null for a name, PHP stores
        // the entry under the '' key, so several such names would
        // overwrite each other - confirm whether callers rely on this.
        $code = Text_LanguageDetect_ISO639::$lookup($index);
        $converted[$code] = $item;
    }

    return $converted;
}
1691}
1634 1692
1635?> 1693/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
new file mode 100644
index 00000000..196d994f
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
@@ -0,0 +1,57 @@
1<?php
class Text_LanguageDetect_Exception extends Exception
{
    // ---- Codes 10-14: language database / environment ----

    /** The database file could not be found. */
    const DB_NOT_FOUND = 10;

    /** The database file was found but is not readable. */
    const DB_NOT_READABLE = 11;

    /** The database file is empty. */
    const DB_EMPTY = 12;

    /** The database contents are not a PHP array. */
    const DB_NOT_ARRAY = 13;

    /** Magic quotes are activated. */
    const MAGIC_QUOTES = 14;

    // ---- Codes 20-21: invalid method parameters ----

    /** A parameter of an invalid type was passed to a method. */
    const PARAM_TYPE = 20;

    /** A character in a parameter is invalid. */
    const INVALID_CHAR = 21;

    // ---- Code 30: language lookup ----

    /** The language is not in the database. */
    const UNKNOWN_LANGUAGE = 30;

    // ---- Code 40: block detection ----

    /** An error occurred during block detection. */
    const BLOCK_DETECTION = 40;

    // ---- Code 50: clustering ----

    /** An error occurred while clustering languages. */
    const NO_HIGHEST_KEY = 50;
}
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
new file mode 100644
index 00000000..05b0590d
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
@@ -0,0 +1,339 @@
1<?php
2/**
3 * Part of Text_LanguageDetect
4 *
5 * PHP version 5
6 *
7 * @category Text
8 * @package Text_LanguageDetect
9 * @author Christian Weiske <cweiske@php.net>
10 * @copyright 2011 Christian Weiske <cweiske@php.net>
11 * @license http://www.debian.org/misc/bsd.license BSD
12 * @version SVN: $Id$
13 * @link http://pear.php.net/package/Text_LanguageDetect/
14 */
15
16/**
17 * Provides a mapping between the languages from lang.dat and the
18 * ISO 639-1 and ISO-639-2 codes.
19 *
20 * Note that this class contains only languages that exist in lang.dat.
21 *
22 * @category Text
23 * @package Text_LanguageDetect
24 * @author Christian Weiske <cweiske@php.net>
25 * @copyright 2011 Christian Weiske <cweiske@php.net>
26 * @license http://www.debian.org/misc/bsd.license BSD
27 * @link http://www.loc.gov/standards/iso639-2/php/code_list.php
28 */
class Text_LanguageDetect_ISO639
{
    /**
     * Maps all language names from the language database to the
     * ISO 639-1 2-letter language code.
     *
     * NULL indicates that there is no 2-letter code.
     *
     * @var array
     */
    public static $nameToCode2 = array(
        'albanian'   => 'sq',
        'arabic'     => 'ar',
        'azeri'      => 'az',
        'bengali'    => 'bn',
        'bulgarian'  => 'bg',
        'cebuano'    => null,
        'croatian'   => 'hr',
        'czech'      => 'cs',
        'danish'     => 'da',
        'dutch'      => 'nl',
        'english'    => 'en',
        'estonian'   => 'et',
        'farsi'      => 'fa',
        'finnish'    => 'fi',
        'french'     => 'fr',
        'german'     => 'de',
        'hausa'      => 'ha',
        'hawaiian'   => null,
        'hindi'      => 'hi',
        'hungarian'  => 'hu',
        'icelandic'  => 'is',
        'indonesian' => 'id',
        'italian'    => 'it',
        'kazakh'     => 'kk',
        'kyrgyz'     => 'ky',
        'latin'      => 'la',
        'latvian'    => 'lv',
        'lithuanian' => 'lt',
        'macedonian' => 'mk',
        'mongolian'  => 'mn',
        'nepali'     => 'ne',
        'norwegian'  => 'no',
        'pashto'     => 'ps',
        'pidgin'     => null,
        'polish'     => 'pl',
        'portuguese' => 'pt',
        'romanian'   => 'ro',
        'russian'    => 'ru',
        'serbian'    => 'sr',
        'slovak'     => 'sk',
        'slovene'    => 'sl',
        'somali'     => 'so',
        'spanish'    => 'es',
        'swahili'    => 'sw',
        'swedish'    => 'sv',
        'tagalog'    => 'tl',
        'turkish'    => 'tr',
        'ukrainian'  => 'uk',
        'urdu'       => 'ur',
        'uzbek'      => 'uz',
        'vietnamese' => 'vi',
        'welsh'      => 'cy',
    );

    /**
     * Maps all language names from the language database to the
     * ISO 639-2 3-letter language code.
     *
     * @var array
     */
    public static $nameToCode3 = array(
        'albanian'   => 'sqi',
        'arabic'     => 'ara',
        'azeri'      => 'aze',
        'bengali'    => 'ben',
        'bulgarian'  => 'bul',
        'cebuano'    => 'ceb',
        'croatian'   => 'hrv',
        'czech'      => 'ces',
        'danish'     => 'dan',
        'dutch'      => 'nld',
        'english'    => 'eng',
        'estonian'   => 'est',
        'farsi'      => 'fas',
        'finnish'    => 'fin',
        'french'     => 'fra',
        'german'     => 'deu',
        'hausa'      => 'hau',
        'hawaiian'   => 'haw',
        'hindi'      => 'hin',
        'hungarian'  => 'hun',
        'icelandic'  => 'isl',
        'indonesian' => 'ind',
        'italian'    => 'ita',
        'kazakh'     => 'kaz',
        'kyrgyz'     => 'kir',
        'latin'      => 'lat',
        'latvian'    => 'lav',
        'lithuanian' => 'lit',
        'macedonian' => 'mkd',
        'mongolian'  => 'mon',
        'nepali'     => 'nep',
        'norwegian'  => 'nor',
        'pashto'     => 'pus',
        'pidgin'     => 'crp',
        'polish'     => 'pol',
        'portuguese' => 'por',
        'romanian'   => 'ron',
        'russian'    => 'rus',
        'serbian'    => 'srp',
        'slovak'     => 'slk',
        'slovene'    => 'slv',
        'somali'     => 'som',
        'spanish'    => 'spa',
        'swahili'    => 'swa',
        'swedish'    => 'swe',
        'tagalog'    => 'tgl',
        'turkish'    => 'tur',
        'ukrainian'  => 'ukr',
        'urdu'       => 'urd',
        'uzbek'      => 'uzb',
        'vietnamese' => 'vie',
        'welsh'      => 'cym',
    );

    /**
     * Maps ISO 639-1 2-letter language codes to the language names
     * in the language database
     *
     * Not all languages have a 2 letter code, so some are missing
     *
     * @var array
     */
    public static $code2ToName = array(
        'ar' => 'arabic',
        'az' => 'azeri',
        'bg' => 'bulgarian',
        'bn' => 'bengali',
        'cs' => 'czech',
        'cy' => 'welsh',
        'da' => 'danish',
        'de' => 'german',
        'en' => 'english',
        'es' => 'spanish',
        'et' => 'estonian',
        'fa' => 'farsi',
        'fi' => 'finnish',
        'fr' => 'french',
        'ha' => 'hausa',
        'hi' => 'hindi',
        'hr' => 'croatian',
        'hu' => 'hungarian',
        'id' => 'indonesian',
        'is' => 'icelandic',
        'it' => 'italian',
        'kk' => 'kazakh',
        'ky' => 'kyrgyz',
        'la' => 'latin',
        'lt' => 'lithuanian',
        'lv' => 'latvian',
        'mk' => 'macedonian',
        'mn' => 'mongolian',
        'ne' => 'nepali',
        'nl' => 'dutch',
        'no' => 'norwegian',
        'pl' => 'polish',
        'ps' => 'pashto',
        'pt' => 'portuguese',
        'ro' => 'romanian',
        'ru' => 'russian',
        'sk' => 'slovak',
        'sl' => 'slovene',
        'so' => 'somali',
        'sq' => 'albanian',
        'sr' => 'serbian',
        'sv' => 'swedish',
        'sw' => 'swahili',
        'tl' => 'tagalog',
        'tr' => 'turkish',
        'uk' => 'ukrainian',
        'ur' => 'urdu',
        'uz' => 'uzbek',
        'vi' => 'vietnamese',
    );

    /**
     * Maps ISO 639-2 3-letter language codes to the language names
     * in the language database.
     *
     * Every code emitted by $nameToCode3 has an entry here, so a
     * name -> code -> name round trip always succeeds.
     *
     * @var array
     */
    public static $code3ToName = array(
        'ara' => 'arabic',
        'aze' => 'azeri',
        'ben' => 'bengali',
        'bul' => 'bulgarian',
        'ceb' => 'cebuano',
        'ces' => 'czech',
        'crp' => 'pidgin',
        'cym' => 'welsh',
        'dan' => 'danish',
        'deu' => 'german',
        'eng' => 'english',
        'est' => 'estonian',
        'fas' => 'farsi',
        'fin' => 'finnish',
        'fra' => 'french',
        'hau' => 'hausa',
        'haw' => 'hawaiian',
        'hin' => 'hindi',
        'hrv' => 'croatian',
        'hun' => 'hungarian',
        'ind' => 'indonesian',
        'isl' => 'icelandic',
        'ita' => 'italian',
        'kaz' => 'kazakh',
        'kir' => 'kyrgyz',
        'lat' => 'latin',
        'lav' => 'latvian',
        'lit' => 'lithuanian',
        'mkd' => 'macedonian',
        'mon' => 'mongolian',
        'nep' => 'nepali',
        'nld' => 'dutch',
        'nor' => 'norwegian',
        'pol' => 'polish',
        'por' => 'portuguese',
        'pus' => 'pashto',
        // Legacy alias: earlier revisions listed Romanian under 'rom'.
        // Kept so existing callers continue to resolve; 'ron' below is
        // the code that $nameToCode3 actually produces.
        'rom' => 'romanian',
        'ron' => 'romanian',
        'rus' => 'russian',
        'slk' => 'slovak',
        'slv' => 'slovene',
        'som' => 'somali',
        'spa' => 'spanish',
        'sqi' => 'albanian',
        'srp' => 'serbian',
        'swa' => 'swahili',
        'swe' => 'swedish',
        'tgl' => 'tagalog',
        'tur' => 'turkish',
        'ukr' => 'ukrainian',
        'urd' => 'urdu',
        'uzb' => 'uzbek',
        'vie' => 'vietnamese',
    );

    /**
     * Returns the 2-letter ISO 639-1 code for the given language name.
     *
     * The lookup is case-insensitive.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string|null Two-letter language code (e.g. "sv"), or NULL if
     *                     the name is unknown or has no 2-letter code
     */
    public static function nameToCode2($lang)
    {
        $lang = strtolower($lang);
        // isset() is also false for the explicit NULL entries, so
        // languages without a 2-letter code yield NULL here.
        if (!isset(self::$nameToCode2[$lang])) {
            return null;
        }
        return self::$nameToCode2[$lang];
    }

    /**
     * Returns the 3-letter ISO 639-2 code for the given language name.
     *
     * The lookup is case-insensitive.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string|null Three-letter language code (e.g. "swe"), or NULL
     *                     if not found
     */
    public static function nameToCode3($lang)
    {
        $lang = strtolower($lang);
        if (!isset(self::$nameToCode3[$lang])) {
            return null;
        }
        return self::$nameToCode3[$lang];
    }

    /**
     * Returns the language name for the given 2-letter ISO 639-1 code.
     *
     * The lookup is case-insensitive.
     *
     * @param string $code Two-letter language code (e.g. "sv")
     *
     * @return string|null English language name like "swedish", or NULL
     *                     if the code is unknown
     */
    public static function code2ToName($code)
    {
        // Normalise the code itself; previously the lower-cased value was
        // stored in an unused variable and the raw input was looked up,
        // so "SV" never matched.
        $code = strtolower($code);
        if (!isset(self::$code2ToName[$code])) {
            return null;
        }
        return self::$code2ToName[$code];
    }

    /**
     * Returns the language name for the given 3-letter ISO 639-2 code.
     *
     * The lookup is case-insensitive.
     *
     * @param string $code Three-letter language code (e.g. "swe")
     *
     * @return string|null English language name like "swedish", or NULL
     *                     if the code is unknown
     */
    public static function code3ToName($code)
    {
        // Normalise the code itself (see code2ToName() for the rationale).
        $code = strtolower($code);
        if (!isset(self::$code3ToName[$code])) {
            return null;
        }
        return self::$code3ToName[$code];
    }
}
diff --git a/inc/3rdparty/libraries/language-detect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
index 7f15fa98..fb0e1e20 100644
--- a/inc/3rdparty/libraries/language-detect/Parser.php
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
@@ -8,7 +8,7 @@
8 * @author Nicholas Pisarro 8 * @author Nicholas Pisarro
9 * @copyright 2006 9 * @copyright 2006
10 * @license BSD 10 * @license BSD
11 * @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $ 11 * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
12 * @link http://pear.php.net/package/Text_LanguageDetect/ 12 * @link http://pear.php.net/package/Text_LanguageDetect/
13 * @link http://langdetect.blogspot.com/ 13 * @link http://langdetect.blogspot.com/
14 */ 14 */
@@ -28,7 +28,7 @@
28 * @author Nicholas Pisarro 28 * @author Nicholas Pisarro
29 * @copyright 2006 29 * @copyright 2006
30 * @license BSD 30 * @license BSD
31 * @version release: 0.2.3 31 * @version release: 0.3.0
32 */ 32 */
33class Text_LanguageDetect_Parser extends Text_LanguageDetect 33class Text_LanguageDetect_Parser extends Text_LanguageDetect
34{ 34{
@@ -102,21 +102,17 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
102 * @access private 102 * @access private
103 * @param string $string string to be parsed 103 * @param string $string string to be parsed
104 */ 104 */
105 function Text_LanguageDetect_Parser($string, $db=null, $unicode_db=null) { 105 function Text_LanguageDetect_Parser($string) {
106 if (isset($db)) $this->_db_filename = $db;
107 if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
108 $this->_string = $string; 106 $this->_string = $string;
109 } 107 }
110 108
111 /** 109 /**
112 * Returns true if a string is suitable for parsing 110 * Returns true if a string is suitable for parsing
113 * 111 *
114 * @static
115 * @access public
116 * @param string $str input string to test 112 * @param string $str input string to test
117 * @return bool true if acceptable, false if not 113 * @return bool true if acceptable, false if not
118 */ 114 */
119 function validateString($str) { 115 public static function validateString($str) {
120 if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { 116 if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
121 return true; 117 return true;
122 } else { 118 } else {
@@ -222,8 +218,7 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
222 218
223 // unicode startup 219 // unicode startup
224 if ($this->_compile_unicode) { 220 if ($this->_compile_unicode) {
225 $blocks =& $this->_read_unicode_block_db(); 221 $blocks = $this->_read_unicode_block_db();
226
227 $block_count = count($blocks); 222 $block_count = count($blocks);
228 223
229 $skipped_count = 0; 224 $skipped_count = 0;
@@ -349,6 +344,4 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
349 } 344 }
350} 345}
351 346
352/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ 347/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file
353
354?>