diff options
author | Maryana Rozhankivska <mariroz@mr.lviv.ua> | 2014-05-22 17:16:38 +0300 |
---|---|---|
committer | Maryana Rozhankivska <mariroz@mr.lviv.ua> | 2014-05-22 17:16:38 +0300 |
commit | 3ec62cf95ab4436923d4c665fad7aef226cbb822 (patch) | |
tree | f657024faaaf4c0b33ae27f7aea999f2b18cc8ab /inc/3rdparty/libraries/language-detect | |
parent | ab157bbb75ba226917145c9bf906cbf764a85cd0 (diff) | |
download | wallabag-3ec62cf95ab4436923d4c665fad7aef226cbb822.tar.gz wallabag-3ec62cf95ab4436923d4c665fad7aef226cbb822.tar.zst wallabag-3ec62cf95ab4436923d4c665fad7aef226cbb822.zip |
update to 3.2 version of full-text-rss, issue #694
Diffstat (limited to 'inc/3rdparty/libraries/language-detect')
-rw-r--r-- | inc/3rdparty/libraries/language-detect/LanguageDetect.php | 992 |
1 files changed, 525 insertions, 467 deletions
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect.php b/inc/3rdparty/libraries/language-detect/LanguageDetect.php index 09b11546..382d869c 100644 --- a/inc/3rdparty/libraries/language-detect/LanguageDetect.php +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect.php | |||
@@ -6,23 +6,24 @@ | |||
6 | * Attempts to detect the language of a sample of text by correlating ranked | 6 | * Attempts to detect the language of a sample of text by correlating ranked |
7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. | 7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. |
8 | * | 8 | * |
9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle | 9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle |
10 | * (1994): "N-Gram-Based Text Categorization" | 10 | * (1994): "N-Gram-Based Text Categorization" |
11 | * | 11 | * |
12 | * PHP versions 4 and 5 | 12 | * PHP version 5 |
13 | * | 13 | * |
14 | * @category Text | 14 | * @category Text |
15 | * @package Text_LanguageDetect | 15 | * @package Text_LanguageDetect |
16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | 16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> |
17 | * @copyright 2005-2006 Nicholas Pisarro | 17 | * @copyright 2005-2006 Nicholas Pisarro |
18 | * @license http://www.debian.org/misc/bsd.license BSD | 18 | * @license http://www.debian.org/misc/bsd.license BSD |
19 | * @version CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $ | 19 | * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $ |
20 | * @link http://pear.php.net/package/Text_LanguageDetect/ | 20 | * @link http://pear.php.net/package/Text_LanguageDetect/ |
21 | * @link http://langdetect.blogspot.com/ | 21 | * @link http://langdetect.blogspot.com/ |
22 | */ | 22 | */ |
23 | 23 | ||
24 | //require_once 'PEAR.php'; | 24 | require_once 'LanguageDetect/Exception.php'; |
25 | require_once 'Parser.php'; | 25 | require_once 'LanguageDetect/Parser.php'; |
26 | require_once 'LanguageDetect/ISO639.php'; | ||
26 | 27 | ||
27 | /** | 28 | /** |
28 | * Language detection class | 29 | * Language detection class |
@@ -41,9 +42,10 @@ require_once 'Parser.php'; | |||
41 | * | 42 | * |
42 | * echo "Supported languages:\n"; | 43 | * echo "Supported languages:\n"; |
43 | * | 44 | * |
44 | * $langs = $l->getLanguages(); | 45 | * try { |
45 | * if (PEAR::isError($langs)) { | 46 | * $langs = $l->getLanguages(); |
46 | * die($langs->getMessage()); | 47 | * } catch (Text_LanguageDetect_Exception $e) { |
48 | * die($e->getMessage()); | ||
47 | * } | 49 | * } |
48 | * | 50 | * |
49 | * sort($langs); | 51 | * sort($langs); |
@@ -54,38 +56,38 @@ require_once 'Parser.php'; | |||
54 | * } | 56 | * } |
55 | * </code> | 57 | * </code> |
56 | * | 58 | * |
57 | * @category Text | 59 | * @category Text |
58 | * @package Text_LanguageDetect | 60 | * @package Text_LanguageDetect |
59 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | 61 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> |
60 | * @copyright 2005 Nicholas Pisarro | 62 | * @copyright 2005 Nicholas Pisarro |
61 | * @license http://www.debian.org/misc/bsd.license BSD | 63 | * @license http://www.debian.org/misc/bsd.license BSD |
62 | * @version Release: @package_version@ | 64 | * @version Release: @package_version@ |
63 | * @todo allow users to generate their own language models | 65 | * @link http://pear.php.net/package/Text_LanguageDetect/ |
66 | * @todo allow users to generate their own language models | ||
64 | */ | 67 | */ |
65 | |||
66 | class Text_LanguageDetect | 68 | class Text_LanguageDetect |
67 | { | 69 | { |
68 | /** | 70 | /** |
69 | * The filename that stores the trigram data for the detector | 71 | * The filename that stores the trigram data for the detector |
70 | * | 72 | * |
71 | * If this value starts with a slash (/) or a dot (.) the value of | 73 | * If this value starts with a slash (/) or a dot (.) the value of |
72 | * $this->_data_dir will be ignored | 74 | * $this->_data_dir will be ignored |
73 | * | 75 | * |
74 | * @var string | 76 | * @var string |
75 | * @access private | 77 | * @access private |
76 | */ | 78 | */ |
77 | var $_db_filename = './lang.dat'; | 79 | var $_db_filename = 'lang.dat'; |
78 | 80 | ||
79 | /** | 81 | /** |
80 | * The filename that stores the unicode block definitions | 82 | * The filename that stores the unicode block definitions |
81 | * | 83 | * |
82 | * If this value starts with a slash (/) or a dot (.) the value of | 84 | * If this value starts with a slash (/) or a dot (.) the value of |
83 | * $this->_data_dir will be ignored | 85 | * $this->_data_dir will be ignored |
84 | * | 86 | * |
85 | * @var string | 87 | * @var string |
86 | * @access private | 88 | * @access private |
87 | */ | 89 | */ |
88 | var $_unicode_db_filename = './unicode_blocks.dat'; | 90 | var $_unicode_db_filename = 'unicode_blocks.dat'; |
89 | 91 | ||
90 | /** | 92 | /** |
91 | * The data directory | 93 | * The data directory |
@@ -99,11 +101,8 @@ class Text_LanguageDetect | |||
99 | 101 | ||
100 | /** | 102 | /** |
101 | * The trigram data for comparison | 103 | * The trigram data for comparison |
102 | * | ||
103 | * Will be loaded on start from $this->_db_filename | ||
104 | * | 104 | * |
105 | * May be set to a PEAR_Error object if there is an error during its | 105 | * Will be loaded on start from $this->_db_filename |
106 | * initialization | ||
107 | * | 106 | * |
108 | * @var array | 107 | * @var array |
109 | * @access private | 108 | * @access private |
@@ -120,7 +119,7 @@ class Text_LanguageDetect | |||
120 | 119 | ||
121 | /** | 120 | /** |
122 | * The size of the trigram data arrays | 121 | * The size of the trigram data arrays |
123 | * | 122 | * |
124 | * @var int | 123 | * @var int |
125 | * @access private | 124 | * @access private |
126 | */ | 125 | */ |
@@ -140,7 +139,7 @@ class Text_LanguageDetect | |||
140 | 139 | ||
141 | /** | 140 | /** |
142 | * Whether or not to simulate perl's Language::Guess exactly | 141 | * Whether or not to simulate perl's Language::Guess exactly |
143 | * | 142 | * |
144 | * @access private | 143 | * @access private |
145 | * @var bool | 144 | * @var bool |
146 | * @see setPerlCompatible() | 145 | * @see setPerlCompatible() |
@@ -165,18 +164,24 @@ class Text_LanguageDetect | |||
165 | var $_clusters; | 164 | var $_clusters; |
166 | 165 | ||
167 | /** | 166 | /** |
167 | * Which type of "language names" are accepted and returned: | ||
168 | * | ||
169 | * 0 - language name ("english") | ||
170 | * 2 - 2-letter ISO 639-1 code ("en") | ||
171 | * 3 - 3-letter ISO 639-2 code ("eng") | ||
172 | */ | ||
173 | var $_name_mode = 0; | ||
174 | |||
175 | /** | ||
168 | * Constructor | 176 | * Constructor |
169 | * | 177 | * |
170 | * Will attempt to load the language database. If it fails, you will get | 178 | * Will attempt to load the language database. If it fails, you will get |
171 | * a PEAR_Error object returned when you try to use detect() | 179 | * an exception. |
172 | * | ||
173 | */ | 180 | */ |
174 | function Text_LanguageDetect($db=null, $unicode_db=null) | 181 | function __construct() |
175 | { | 182 | { |
176 | if (isset($db)) $this->_db_filename = $db; | ||
177 | if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db; | ||
178 | |||
179 | $data = $this->_readdb($this->_db_filename); | 183 | $data = $this->_readdb($this->_db_filename); |
184 | $this->_checkTrigram($data['trigram']); | ||
180 | $this->_lang_db = $data['trigram']; | 185 | $this->_lang_db = $data['trigram']; |
181 | 186 | ||
182 | if (isset($data['trigram-unicodemap'])) { | 187 | if (isset($data['trigram-unicodemap'])) { |
@@ -186,29 +191,32 @@ class Text_LanguageDetect | |||
186 | // Not yet implemented: | 191 | // Not yet implemented: |
187 | if (isset($data['trigram-clusters'])) { | 192 | if (isset($data['trigram-clusters'])) { |
188 | $this->_clusters = $data['trigram-clusters']; | 193 | $this->_clusters = $data['trigram-clusters']; |
189 | } | 194 | } |
190 | } | 195 | } |
191 | 196 | ||
192 | /** | 197 | /** |
193 | * Returns the path to the location of the database | 198 | * Returns the path to the location of the database |
194 | * | 199 | * |
195 | * @access private | 200 | * @param string $fname File name to load |
196 | * @return string expected path to the language model database | 201 | * |
202 | * @return string expected path to the language model database | ||
203 | * @access private | ||
197 | */ | 204 | */ |
198 | function _get_data_loc($fname) | 205 | function _get_data_loc($fname) |
199 | { | 206 | { |
200 | return $fname; | 207 | return dirname(__FILE__).'/'.$fname; |
201 | } | 208 | } |
202 | 209 | ||
203 | /** | 210 | /** |
204 | * Loads the language trigram database from filename | 211 | * Loads the language trigram database from filename |
205 | * | 212 | * |
206 | * Trigram datbase should be a serialize()'d array | 213 | * Trigram datbase should be a serialize()'d array |
207 | * | 214 | * |
208 | * @access private | 215 | * @param string $fname the filename where the data is stored |
209 | * @param string $fname the filename where the data is stored | 216 | * |
210 | * @return array the language model data | 217 | * @return array the language model data |
211 | * @throws PEAR_Error | 218 | * @throws Text_LanguageDetect_Exception |
219 | * @access private | ||
212 | */ | 220 | */ |
213 | function _readdb($fname) | 221 | function _readdb($fname) |
214 | { | 222 | { |
@@ -217,79 +225,74 @@ class Text_LanguageDetect | |||
217 | 225 | ||
218 | // input check | 226 | // input check |
219 | if (!file_exists($fname)) { | 227 | if (!file_exists($fname)) { |
220 | throw new Exception('Language database does not exist.'); | 228 | throw new Text_LanguageDetect_Exception( |
229 | 'Language database does not exist: ' . $fname, | ||
230 | Text_LanguageDetect_Exception::DB_NOT_FOUND | ||
231 | ); | ||
221 | } elseif (!is_readable($fname)) { | 232 | } elseif (!is_readable($fname)) { |
222 | throw new Exception('Language database is not readable.'); | 233 | throw new Text_LanguageDetect_Exception( |
234 | 'Language database is not readable: ' . $fname, | ||
235 | Text_LanguageDetect_Exception::DB_NOT_READABLE | ||
236 | ); | ||
223 | } | 237 | } |
224 | 238 | ||
225 | if (function_exists('file_get_contents')) { | 239 | return unserialize(file_get_contents($fname)); |
226 | return unserialize(file_get_contents($fname)); | ||
227 | } else { | ||
228 | // if you don't have file_get_contents(), | ||
229 | // then this is the next fastest way | ||
230 | ob_start(); | ||
231 | readfile($fname); | ||
232 | $contents = ob_get_contents(); | ||
233 | ob_end_clean(); | ||
234 | return unserialize($contents); | ||
235 | } | ||
236 | } | 240 | } |
237 | 241 | ||
238 | 242 | ||
239 | /** | 243 | /** |
240 | * Checks if this object is ready to detect languages | 244 | * Checks if this object is ready to detect languages |
241 | * | 245 | * |
242 | * @access private | 246 | * @param array $trigram Trigram data from database |
243 | * @param mixed &$err error object to be returned by reference, if any | 247 | * |
244 | * @return bool true if no errors | 248 | * @return void |
249 | * @access private | ||
245 | */ | 250 | */ |
246 | function _setup_ok(&$err) | 251 | function _checkTrigram($trigram) |
247 | { | 252 | { |
248 | if (!is_array($this->_lang_db)) { | 253 | if (!is_array($trigram)) { |
249 | if (ini_get('magic_quotes_runtime')) { | 254 | if (ini_get('magic_quotes_runtime')) { |
250 | throw new Exception('Error loading database. Try turning magic_quotes_runtime off.'); | 255 | throw new Text_LanguageDetect_Exception( |
251 | } else { | 256 | 'Error loading database. Try turning magic_quotes_runtime off.', |
252 | throw new Exception('Language database is not an array.'); | 257 | Text_LanguageDetect_Exception::MAGIC_QUOTES |
258 | ); | ||
253 | } | 259 | } |
254 | return false; | 260 | throw new Text_LanguageDetect_Exception( |
255 | 261 | 'Language database is not an array.', | |
256 | } elseif (empty($this->_lang_db)) { | 262 | Text_LanguageDetect_Exception::DB_NOT_ARRAY |
257 | throw new Exception('Language database has no elements.'); | 263 | ); |
258 | return false; | 264 | } elseif (empty($trigram)) { |
259 | 265 | throw new Text_LanguageDetect_Exception( | |
260 | } else { | 266 | 'Language database has no elements.', |
261 | return true; | 267 | Text_LanguageDetect_Exception::DB_EMPTY |
268 | ); | ||
262 | } | 269 | } |
263 | } | 270 | } |
264 | 271 | ||
265 | /** | 272 | /** |
266 | * Omits languages | 273 | * Omits languages |
267 | * | 274 | * |
268 | * Pass this function the name of or an array of names of | 275 | * Pass this function the name of or an array of names of |
269 | * languages that you don't want considered | 276 | * languages that you don't want considered |
270 | * | 277 | * |
271 | * If you're only expecting a limited set of languages, this can greatly | 278 | * If you're only expecting a limited set of languages, this can greatly |
272 | * speed up processing | 279 | * speed up processing |
273 | * | 280 | * |
274 | * @access public | 281 | * @param mixed $omit_list language name or array of names to omit |
275 | * @param mixed $omit_list language name or array of names to omit | 282 | * @param bool $include_only if true will include (rather than |
276 | * @param bool $include_only if true will include (rather than | 283 | * exclude) only those in the list |
277 | * exclude) only those in the list | 284 | * |
278 | * @return int number of languages successfully deleted | 285 | * @return int number of languages successfully deleted |
279 | * @throws PEAR_Error | 286 | * @throws Text_LanguageDetect_Exception |
280 | */ | 287 | */ |
281 | function omitLanguages($omit_list, $include_only = false) | 288 | public function omitLanguages($omit_list, $include_only = false) |
282 | { | 289 | { |
283 | |||
284 | // setup check | ||
285 | if (!$this->_setup_ok($err)) { | ||
286 | return $err; | ||
287 | } | ||
288 | |||
289 | $deleted = 0; | 290 | $deleted = 0; |
290 | 291 | ||
291 | // deleting the given languages | 292 | $omit_list = $this->_convertFromNameMode($omit_list); |
293 | |||
292 | if (!$include_only) { | 294 | if (!$include_only) { |
295 | // deleting the given languages | ||
293 | if (!is_array($omit_list)) { | 296 | if (!is_array($omit_list)) { |
294 | $omit_list = strtolower($omit_list); // case desensitize | 297 | $omit_list = strtolower($omit_list); // case desensitize |
295 | if (isset($this->_lang_db[$omit_list])) { | 298 | if (isset($this->_lang_db[$omit_list])) { |
@@ -301,12 +304,12 @@ class Text_LanguageDetect | |||
301 | if (isset($this->_lang_db[$omit_lang])) { | 304 | if (isset($this->_lang_db[$omit_lang])) { |
302 | unset($this->_lang_db[$omit_lang]); | 305 | unset($this->_lang_db[$omit_lang]); |
303 | $deleted++; | 306 | $deleted++; |
304 | } | 307 | } |
305 | } | 308 | } |
306 | } | 309 | } |
307 | 310 | ||
308 | // deleting all except the given languages | ||
309 | } else { | 311 | } else { |
312 | // deleting all except the given languages | ||
310 | if (!is_array($omit_list)) { | 313 | if (!is_array($omit_list)) { |
311 | $omit_list = array($omit_list); | 314 | $omit_list = array($omit_list); |
312 | } | 315 | } |
@@ -327,7 +330,7 @@ class Text_LanguageDetect | |||
327 | // reset the cluster cache if the number of languages changes | 330 | // reset the cluster cache if the number of languages changes |
328 | // this will then have to be recalculated | 331 | // this will then have to be recalculated |
329 | if (isset($this->_clusters) && $deleted > 0) { | 332 | if (isset($this->_clusters) && $deleted > 0) { |
330 | unset($this->_clusters); | 333 | $this->_clusters = null; |
331 | } | 334 | } |
332 | 335 | ||
333 | return $deleted; | 336 | return $deleted; |
@@ -339,49 +342,40 @@ class Text_LanguageDetect | |||
339 | * | 342 | * |
340 | * @access public | 343 | * @access public |
341 | * @return int the number of languages | 344 | * @return int the number of languages |
342 | * @throws PEAR_Error | 345 | * @throws Text_LanguageDetect_Exception |
343 | */ | 346 | */ |
344 | function getLanguageCount() | 347 | function getLanguageCount() |
345 | { | 348 | { |
346 | if (!$this->_setup_ok($err)) { | 349 | return count($this->_lang_db); |
347 | return $err; | ||
348 | } else { | ||
349 | return count($this->_lang_db); | ||
350 | } | ||
351 | } | 350 | } |
352 | 351 | ||
353 | /** | 352 | /** |
354 | * Returns true if a given language exists | 353 | * Checks if the language with the given name exists in the database |
355 | * | 354 | * |
356 | * If passed an array of names, will return true only if all exist | 355 | * @param mixed $lang Language name or array of language names |
357 | * | 356 | * |
358 | * @access public | 357 | * @return bool true if language model exists |
359 | * @param mixed $lang language name or array of language names | ||
360 | * @return bool true if language model exists | ||
361 | * @throws PEAR_Error | ||
362 | */ | 358 | */ |
363 | function languageExists($lang) | 359 | public function languageExists($lang) |
364 | { | 360 | { |
365 | if (!$this->_setup_ok($err)) { | 361 | $lang = $this->_convertFromNameMode($lang); |
366 | return $err; | ||
367 | } else { | ||
368 | // string | ||
369 | if (is_string($lang)) { | ||
370 | return isset($this->_lang_db[strtolower($lang)]); | ||
371 | |||
372 | // array | ||
373 | } elseif (is_array($lang)) { | ||
374 | foreach ($lang as $test_lang) { | ||
375 | if (!isset($this->_lang_db[strtolower($test_lang)])) { | ||
376 | return false; | ||
377 | } | ||
378 | } | ||
379 | return true; | ||
380 | 362 | ||
381 | // other (error) | 363 | if (is_string($lang)) { |
382 | } else { | 364 | return isset($this->_lang_db[strtolower($lang)]); |
383 | throw new Exception('Unknown type passed to languageExists()'); | 365 | |
366 | } elseif (is_array($lang)) { | ||
367 | foreach ($lang as $test_lang) { | ||
368 | if (!isset($this->_lang_db[strtolower($test_lang)])) { | ||
369 | return false; | ||
370 | } | ||
384 | } | 371 | } |
372 | return true; | ||
373 | |||
374 | } else { | ||
375 | throw new Text_LanguageDetect_Exception( | ||
376 | 'Unsupported parameter type passed to languageExists()', | ||
377 | Text_LanguageDetect_Exception::PARAM_TYPE | ||
378 | ); | ||
385 | } | 379 | } |
386 | } | 380 | } |
387 | 381 | ||
@@ -389,25 +383,24 @@ class Text_LanguageDetect | |||
389 | * Returns the list of detectable languages | 383 | * Returns the list of detectable languages |
390 | * | 384 | * |
391 | * @access public | 385 | * @access public |
392 | * @return array the names of the languages known to this object | 386 | * @return array the names of the languages known to this object<<<<<<< |
393 | * @throws PEAR_Error | 387 | * @throws Text_LanguageDetect_Exception |
394 | */ | 388 | */ |
395 | function getLanguages() | 389 | function getLanguages() |
396 | { | 390 | { |
397 | if (!$this->_setup_ok($err)) { | 391 | return $this->_convertToNameMode( |
398 | return $err; | 392 | array_keys($this->_lang_db) |
399 | } else { | 393 | ); |
400 | return array_keys($this->_lang_db); | ||
401 | } | ||
402 | } | 394 | } |
403 | 395 | ||
404 | /** | 396 | /** |
405 | * Make this object behave like Language::Guess | 397 | * Make this object behave like Language::Guess |
406 | * | 398 | * |
407 | * @access public | 399 | * @param bool $setting false to turn off perl compatibility |
408 | * @param bool $setting false to turn off perl compatibility | 400 | * |
401 | * @return void | ||
409 | */ | 402 | */ |
410 | function setPerlCompatible($setting = true) | 403 | public function setPerlCompatible($setting = true) |
411 | { | 404 | { |
412 | if (is_bool($setting)) { // input check | 405 | if (is_bool($setting)) { // input check |
413 | $this->_perl_compatible = $setting; | 406 | $this->_perl_compatible = $setting; |
@@ -422,6 +415,21 @@ class Text_LanguageDetect | |||
422 | } | 415 | } |
423 | 416 | ||
424 | /** | 417 | /** |
418 | * Sets the way how language names are accepted and returned. | ||
419 | * | ||
420 | * @param integer $name_mode One of the following modes: | ||
421 | * 0 - language name ("english") | ||
422 | * 2 - 2-letter ISO 639-1 code ("en") | ||
423 | * 3 - 3-letter ISO 639-2 code ("eng") | ||
424 | * | ||
425 | * @return void | ||
426 | */ | ||
427 | function setNameMode($name_mode) | ||
428 | { | ||
429 | $this->_name_mode = $name_mode; | ||
430 | } | ||
431 | |||
432 | /** | ||
425 | * Whether to use unicode block ranges in detection | 433 | * Whether to use unicode block ranges in detection |
426 | * | 434 | * |
427 | * Should speed up most detections if turned on (detault is on). In some | 435 | * Should speed up most detections if turned on (detault is on). In some |
@@ -429,10 +437,11 @@ class Text_LanguageDetect | |||
429 | * in languages that use latin scripts. In other cases it should speed up | 437 | * in languages that use latin scripts. In other cases it should speed up |
430 | * detection noticeably. | 438 | * detection noticeably. |
431 | * | 439 | * |
432 | * @access public | 440 | * @param bool $setting false to turn off |
433 | * @param bool $setting false to turn off | 441 | * |
442 | * @return void | ||
434 | */ | 443 | */ |
435 | function useUnicodeBlocks($setting = true) | 444 | public function useUnicodeBlocks($setting = true) |
436 | { | 445 | { |
437 | if (is_bool($setting)) { | 446 | if (is_bool($setting)) { |
438 | $this->_use_unicode_narrowing = $setting; | 447 | $this->_use_unicode_narrowing = $setting; |
@@ -442,15 +451,15 @@ class Text_LanguageDetect | |||
442 | /** | 451 | /** |
443 | * Converts a piece of text into trigrams | 452 | * Converts a piece of text into trigrams |
444 | * | 453 | * |
445 | * Superceded by the Text_LanguageDetect_Parser class | 454 | * @param string $text text to convert |
446 | * | 455 | * |
447 | * @access private | 456 | * @return array array of trigram frequencies |
448 | * @param string $text text to convert | 457 | * @access private |
449 | * @return array array of trigram frequencies | 458 | * @deprecated Superceded by the Text_LanguageDetect_Parser class |
450 | */ | 459 | */ |
451 | function _trigram($text) | 460 | function _trigram($text) |
452 | { | 461 | { |
453 | $s = new Text_LanguageDetect_Parser($text, $this->_db_filename, $this->_unicode_db_filename); | 462 | $s = new Text_LanguageDetect_Parser($text); |
454 | $s->prepareTrigram(); | 463 | $s->prepareTrigram(); |
455 | $s->prepareUnicode(false); | 464 | $s->prepareUnicode(false); |
456 | $s->setPadStart(!$this->_perl_compatible); | 465 | $s->setPadStart(!$this->_perl_compatible); |
@@ -463,11 +472,12 @@ class Text_LanguageDetect | |||
463 | * | 472 | * |
464 | * Thresholds (cuts off) the list at $this->_threshold | 473 | * Thresholds (cuts off) the list at $this->_threshold |
465 | * | 474 | * |
466 | * @access protected | 475 | * @param array $arr array of trigram |
467 | * @param array $arr array of trgram | 476 | * |
468 | * @return array ranks of trigrams | 477 | * @return array ranks of trigrams |
478 | * @access protected | ||
469 | */ | 479 | */ |
470 | function _arr_rank(&$arr) | 480 | function _arr_rank($arr) |
471 | { | 481 | { |
472 | 482 | ||
473 | // sorts alphabetically first as a standard way of breaking rank ties | 483 | // sorts alphabetically first as a standard way of breaking rank ties |
@@ -494,14 +504,17 @@ class Text_LanguageDetect | |||
494 | 504 | ||
495 | /** | 505 | /** |
496 | * Sorts an array by value breaking ties alphabetically | 506 | * Sorts an array by value breaking ties alphabetically |
497 | * | 507 | * |
498 | * @access private | 508 | * @param array &$arr the array to sort |
499 | * @param array &$arr the array to sort | 509 | * |
510 | * @return void | ||
511 | * @access private | ||
500 | */ | 512 | */ |
501 | function _bub_sort(&$arr) | 513 | function _bub_sort(&$arr) |
502 | { | 514 | { |
503 | // should do the same as this perl statement: | 515 | // should do the same as this perl statement: |
504 | // sort { $trigrams{$b} == $trigrams{$a} ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } | 516 | // sort { $trigrams{$b} == $trigrams{$a} |
517 | // ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } | ||
505 | 518 | ||
506 | // needs to sort by both key and value at once | 519 | // needs to sort by both key and value at once |
507 | // using the key to break ties for the value | 520 | // using the key to break ties for the value |
@@ -528,13 +541,14 @@ class Text_LanguageDetect | |||
528 | /** | 541 | /** |
529 | * Sort function used by bubble sort | 542 | * Sort function used by bubble sort |
530 | * | 543 | * |
531 | * Callback function for usort(). | 544 | * Callback function for usort(). |
532 | * | 545 | * |
533 | * @access private | 546 | * @param array $a first param passed by usort() |
534 | * @param array first param passed by usort() | 547 | * @param array $b second param passed by usort() |
535 | * @param array second param passed by usort() | 548 | * |
536 | * @return int 1 if $a is greater, -1 if not | 549 | * @return int 1 if $a is greater, -1 if not |
537 | * @see _bub_sort() | 550 | * @see _bub_sort() |
551 | * @access private | ||
538 | */ | 552 | */ |
539 | function _sort_func($a, $b) | 553 | function _sort_func($a, $b) |
540 | { | 554 | { |
@@ -542,12 +556,12 @@ class Text_LanguageDetect | |||
542 | list($a_key, $a_value) = $a; | 556 | list($a_key, $a_value) = $a; |
543 | list($b_key, $b_value) = $b; | 557 | list($b_key, $b_value) = $b; |
544 | 558 | ||
545 | // if the values are the same, break ties using the key | ||
546 | if ($a_value == $b_value) { | 559 | if ($a_value == $b_value) { |
560 | // if the values are the same, break ties using the key | ||
547 | return strcmp($a_key, $b_key); | 561 | return strcmp($a_key, $b_key); |
548 | 562 | ||
549 | // if not, just sort normally | ||
550 | } else { | 563 | } else { |
564 | // if not, just sort normally | ||
551 | if ($a_value > $b_value) { | 565 | if ($a_value > $b_value) { |
552 | return -1; | 566 | return -1; |
553 | } else { | 567 | } else { |
@@ -559,23 +573,24 @@ class Text_LanguageDetect | |||
559 | } | 573 | } |
560 | 574 | ||
561 | /** | 575 | /** |
562 | * Calculates a linear rank-order distance statistic between two sets of | 576 | * Calculates a linear rank-order distance statistic between two sets of |
563 | * ranked trigrams | 577 | * ranked trigrams |
564 | * | 578 | * |
565 | * Sums the differences in rank for each trigram. If the trigram does not | 579 | * Sums the differences in rank for each trigram. If the trigram does not |
566 | * appear in both, consider it a difference of $this->_threshold. | 580 | * appear in both, consider it a difference of $this->_threshold. |
567 | * | 581 | * |
568 | * This distance measure was proposed by Cavnar & Trenkle (1994). Despite | 582 | * This distance measure was proposed by Cavnar & Trenkle (1994). Despite |
569 | * its simplicity it has been shown to be highly accurate for language | 583 | * its simplicity it has been shown to be highly accurate for language |
570 | * identification tasks. | 584 | * identification tasks. |
571 | * | 585 | * |
572 | * @access private | 586 | * @param array $arr1 the reference set of trigram ranks |
573 | * @param array $arr1 the reference set of trigram ranks | 587 | * @param array $arr2 the target set of trigram ranks |
574 | * @param array $arr2 the target set of trigram ranks | 588 | * |
575 | * @return int the sum of the differences between the ranks of | 589 | * @return int the sum of the differences between the ranks of |
576 | * the two trigram sets | 590 | * the two trigram sets |
591 | * @access private | ||
577 | */ | 592 | */ |
578 | function _distance(&$arr1, &$arr2) | 593 | function _distance($arr1, $arr2) |
579 | { | 594 | { |
580 | $sumdist = 0; | 595 | $sumdist = 0; |
581 | 596 | ||
@@ -598,14 +613,15 @@ class Text_LanguageDetect | |||
598 | 613 | ||
599 | /** | 614 | /** |
600 | * Normalizes the score returned by _distance() | 615 | * Normalizes the score returned by _distance() |
601 | * | 616 | * |
602 | * Different if perl compatible or not | 617 | * Different if perl compatible or not |
603 | * | 618 | * |
604 | * @access private | 619 | * @param int $score the score from _distance() |
605 | * @param int $score the score from _distance() | 620 | * @param int $base_count the number of trigrams being considered |
606 | * @param int $base_count the number of trigrams being considered | 621 | * |
607 | * @return float the normalized score | 622 | * @return float the normalized score |
608 | * @see _distance() | 623 | * @see _distance() |
624 | * @access private | ||
609 | */ | 625 | */ |
610 | function _normalize_score($score, $base_count = null) | 626 | function _normalize_score($score, $base_count = null) |
611 | { | 627 | { |
@@ -630,29 +646,24 @@ class Text_LanguageDetect | |||
630 | * | 646 | * |
631 | * If perl compatible, the score is 300-0, 0 being most similar. | 647 | * If perl compatible, the score is 300-0, 0 being most similar. |
632 | * Otherwise, it's 0-1 with 1 being most similar. | 648 | * Otherwise, it's 0-1 with 1 being most similar. |
633 | * | 649 | * |
634 | * The $sample text should be at least a few sentences in length; | 650 | * The $sample text should be at least a few sentences in length; |
635 | * should be ascii-7 or utf8 encoded, if another and the mbstring extension | 651 | * should be ascii-7 or utf8 encoded, if another and the mbstring extension |
636 | * is present it will try to detect and convert. However, experience has | 652 | * is present it will try to detect and convert. However, experience has |
637 | * shown that mb_detect_encoding() *does not work very well* with at least | 653 | * shown that mb_detect_encoding() *does not work very well* with at least |
638 | * some types of encoding. | 654 | * some types of encoding. |
639 | * | 655 | * |
640 | * @access public | 656 | * @param string $sample a sample of text to compare. |
641 | * @param string $sample a sample of text to compare. | 657 | * @param int $limit if specified, return an array of the most likely |
642 | * @param int $limit if specified, return an array of the most likely | 658 | * $limit languages and their scores. |
643 | * $limit languages and their scores. | 659 | * |
644 | * @return mixed sorted array of language scores, blank array if no | 660 | * @return mixed sorted array of language scores, blank array if no |
645 | * useable text was found, or PEAR_Error if error | 661 | * useable text was found |
646 | * with the object setup | 662 | * @see _distance() |
647 | * @see _distance() | 663 | * @throws Text_LanguageDetect_Exception |
648 | * @throws PEAR_Error | ||
649 | */ | 664 | */ |
650 | function detect($sample, $limit = 0) | 665 | public function detect($sample, $limit = 0) |
651 | { | 666 | { |
652 | if (!$this->_setup_ok($err)) { | ||
653 | return $err; | ||
654 | } | ||
655 | |||
656 | // input check | 667 | // input check |
657 | if (!Text_LanguageDetect_Parser::validateString($sample)) { | 668 | if (!Text_LanguageDetect_Parser::validateString($sample)) { |
658 | return array(); | 669 | return array(); |
@@ -660,36 +671,27 @@ class Text_LanguageDetect | |||
660 | 671 | ||
661 | // check char encoding | 672 | // check char encoding |
662 | // (only if mbstring extension is compiled and PHP > 4.0.6) | 673 | // (only if mbstring extension is compiled and PHP > 4.0.6) |
663 | if (function_exists('mb_detect_encoding') | 674 | if (function_exists('mb_detect_encoding') |
664 | && function_exists('mb_convert_encoding')) { | 675 | && function_exists('mb_convert_encoding') |
665 | 676 | ) { | |
666 | // mb_detect_encoding isn't very reliable, to say the least | 677 | // mb_detect_encoding isn't very reliable, to say the least |
667 | // detection should still work with a sufficient sample of ascii characters | 678 | // detection should still work with a sufficient sample |
679 | // of ascii characters | ||
668 | $encoding = mb_detect_encoding($sample); | 680 | $encoding = mb_detect_encoding($sample); |
669 | 681 | ||
670 | // mb_detect_encoding() will return FALSE if detection fails | 682 | // mb_detect_encoding() will return FALSE if detection fails |
671 | // don't attempt conversion if that's the case | 683 | // don't attempt conversion if that's the case |
672 | if ($encoding != 'ASCII' && $encoding != 'UTF-8' && $encoding !== false) { | 684 | if ($encoding != 'ASCII' && $encoding != 'UTF-8' |
673 | 685 | && $encoding !== false | |
674 | if (function_exists('mb_list_encodings')) { | 686 | ) { |
675 | 687 | // verify the encoding exists in mb_list_encodings | |
676 | // verify the encoding exists in mb_list_encodings | 688 | if (in_array($encoding, mb_list_encodings())) { |
677 | if (in_array($encoding, mb_list_encodings())) { | 689 | $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); |
678 | $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); | ||
679 | } | ||
680 | |||
681 | // if the previous condition failed: | ||
682 | // somehow we detected an encoding that also we don't support | ||
683 | |||
684 | } else { | ||
685 | // php 4 doesnt have mb_list_encodings() | ||
686 | // so attempt with error suppression | ||
687 | $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding); | ||
688 | } | 690 | } |
689 | } | 691 | } |
690 | } | 692 | } |
691 | 693 | ||
692 | $sample_obj = new Text_LanguageDetect_Parser($sample, $this->_db_filename, $this->_unicode_db_filename); | 694 | $sample_obj = new Text_LanguageDetect_Parser($sample); |
693 | $sample_obj->prepareTrigram(); | 695 | $sample_obj->prepareTrigram(); |
694 | if ($this->_use_unicode_narrowing) { | 696 | if ($this->_use_unicode_narrowing) { |
695 | $sample_obj->prepareUnicode(); | 697 | $sample_obj->prepareUnicode(); |
@@ -713,7 +715,10 @@ class Text_LanguageDetect | |||
713 | if (is_array($blocks)) { | 715 | if (is_array($blocks)) { |
714 | $present_blocks = array_keys($blocks); | 716 | $present_blocks = array_keys($blocks); |
715 | } else { | 717 | } else { |
716 | throw new Exception('Error during block detection'); | 718 | throw new Text_LanguageDetect_Exception( |
719 | 'Error during block detection', | ||
720 | Text_LanguageDetect_Exception::BLOCK_DETECTION | ||
721 | ); | ||
717 | } | 722 | } |
718 | 723 | ||
719 | $possible_langs = array(); | 724 | $possible_langs = array(); |
@@ -731,30 +736,30 @@ class Text_LanguageDetect | |||
731 | } | 736 | } |
732 | 737 | ||
733 | // could also try an intersect operation rather than a union | 738 | // could also try an intersect operation rather than a union |
734 | // in other words, choose languages whose trigrams contain | 739 | // in other words, choose languages whose trigrams contain |
735 | // ALL of the unicode blocks found in this sample | 740 | // ALL of the unicode blocks found in this sample |
736 | // would improve speed but would be completely thrown off by an | 741 | // would improve speed but would be completely thrown off by an |
737 | // unexpected character, like an umlaut appearing in english text | 742 | // unexpected character, like an umlaut appearing in english text |
738 | 743 | ||
739 | $possible_langs = array_intersect( | 744 | $possible_langs = array_intersect( |
740 | array_keys($this->_lang_db), | 745 | array_keys($this->_lang_db), |
741 | array_unique($possible_langs) | 746 | array_unique($possible_langs) |
742 | ); | 747 | ); |
743 | 748 | ||
744 | // needs to intersect it with the keys of _lang_db in case | 749 | // needs to intersect it with the keys of _lang_db in case |
745 | // languages have been omitted | 750 | // languages have been omitted |
746 | 751 | ||
747 | // or just try 'em all | ||
748 | } else { | 752 | } else { |
753 | // or just try 'em all | ||
749 | $possible_langs = array_keys($this->_lang_db); | 754 | $possible_langs = array_keys($this->_lang_db); |
750 | } | 755 | } |
751 | 756 | ||
752 | 757 | ||
753 | foreach ($possible_langs as $lang) { | 758 | foreach ($possible_langs as $lang) { |
754 | $scores[$lang] = | 759 | $scores[$lang] = $this->_normalize_score( |
755 | $this->_normalize_score( | 760 | $this->_distance($this->_lang_db[$lang], $trigram_freqs), |
756 | $this->_distance($this->_lang_db[$lang], $trigram_freqs), | 761 | $trigram_count |
757 | $trigram_count); | 762 | ); |
758 | } | 763 | } |
759 | 764 | ||
760 | unset($sample_obj); | 765 | unset($sample_obj); |
@@ -772,7 +777,6 @@ class Text_LanguageDetect | |||
772 | $limited_scores = array(); | 777 | $limited_scores = array(); |
773 | 778 | ||
774 | $i = 0; | 779 | $i = 0; |
775 | |||
776 | foreach ($scores as $key => $value) { | 780 | foreach ($scores as $key => $value) { |
777 | if ($i++ >= $limit) { | 781 | if ($i++ >= $limit) { |
778 | break; | 782 | break; |
@@ -781,9 +785,9 @@ class Text_LanguageDetect | |||
781 | $limited_scores[$key] = $value; | 785 | $limited_scores[$key] = $value; |
782 | } | 786 | } |
783 | 787 | ||
784 | return $limited_scores; | 788 | return $this->_convertToNameMode($limited_scores, true); |
785 | } else { | 789 | } else { |
786 | return $scores; | 790 | return $this->_convertToNameMode($scores, true); |
787 | } | 791 | } |
788 | } | 792 | } |
789 | 793 | ||
@@ -791,35 +795,33 @@ class Text_LanguageDetect | |||
791 | * Returns only the most similar language to the text sample | 795 | * Returns only the most similar language to the text sample |
792 | * | 796 | * |
793 | * Calls $this->detect() and returns only the top result | 797 | * Calls $this->detect() and returns only the top result |
794 | * | 798 | * |
795 | * @access public | 799 | * @param string $sample text to detect the language of |
796 | * @param string $sample text to detect the language of | 800 | * |
797 | * @return string the name of the most likely language | 801 | * @return string the name of the most likely language |
798 | * or null if no language is similar | 802 | * or null if no language is similar |
799 | * @see detect() | 803 | * @see detect() |
800 | * @throws PEAR_Error | 804 | * @throws Text_LanguageDetect_Exception |
801 | */ | 805 | */ |
802 | function detectSimple($sample) | 806 | public function detectSimple($sample) |
803 | { | 807 | { |
804 | $scores = $this->detect($sample, 1); | 808 | $scores = $this->detect($sample, 1); |
805 | 809 | ||
806 | // if top language has the maximum possible score, | 810 | // if top language has the maximum possible score, |
807 | // then the top score will have been picked at random | 811 | // then the top score will have been picked at random |
808 | if ( !is_array($scores) | 812 | if (!is_array($scores) || empty($scores) |
809 | || empty($scores) | 813 | || current($scores) == $this->_max_score |
810 | || current($scores) == $this->_max_score) { | 814 | ) { |
811 | |||
812 | return null; | 815 | return null; |
813 | |||
814 | } else { | 816 | } else { |
815 | return ucfirst(key($scores)); | 817 | return key($scores); |
816 | } | 818 | } |
817 | } | 819 | } |
818 | 820 | ||
819 | /** | 821 | /** |
820 | * Returns an array containing the most similar language and a confidence | 822 | * Returns an array containing the most similar language and a confidence |
821 | * rating | 823 | * rating |
822 | * | 824 | * |
823 | * Confidence is a simple measure calculated from the similarity score | 825 | * Confidence is a simple measure calculated from the similarity score |
824 | * minus the similarity score from the next most similar language | 826 | * minus the similarity score from the next most similar language |
825 | * divided by the highest possible score. Languages that have closely | 827 | * divided by the highest possible score. Languages that have closely |
@@ -827,46 +829,43 @@ class Text_LanguageDetect | |||
827 | * confidence scores. | 829 | * confidence scores. |
828 | * | 830 | * |
829 | * The similarity score answers the question "How likely is the text the | 831 | * The similarity score answers the question "How likely is the text the |
830 | * returned language regardless of the other languages considered?" The | 832 | * returned language regardless of the other languages considered?" The |
831 | * confidence score is one way of answering the question "how likely is the | 833 | * confidence score is one way of answering the question "how likely is the |
832 | * text the detected language relative to the rest of the language model | 834 | * text the detected language relative to the rest of the language model |
833 | * set?" | 835 | * set?" |
834 | * | 836 | * |
835 | * To see how similar languages are a priori, see languageSimilarity() | 837 | * To see how similar languages are a priori, see languageSimilarity() |
836 | * | 838 | * |
837 | * @access public | 839 | * @param string $sample text for which language will be detected |
838 | * @param string $sample text for which language will be detected | 840 | * |
839 | * @return array most similar language, score and confidence rating | 841 | * @return array most similar language, score and confidence rating |
840 | * or null if no language is similar | 842 | * or null if no language is similar |
841 | * @see detect() | 843 | * @see detect() |
842 | * @throws PEAR_Error | 844 | * @throws Text_LanguageDetect_Exception |
843 | */ | 845 | */ |
844 | function detectConfidence($sample) | 846 | public function detectConfidence($sample) |
845 | { | 847 | { |
846 | $scores = $this->detect($sample, 2); | 848 | $scores = $this->detect($sample, 2); |
847 | 849 | ||
848 | // if most similar language has the max score, it | 850 | // if most similar language has the max score, it |
849 | // will have been picked at random | 851 | // will have been picked at random |
850 | if ( !is_array($scores) | 852 | if (!is_array($scores) || empty($scores) |
851 | || empty($scores) | 853 | || current($scores) == $this->_max_score |
852 | || current($scores) == $this->_max_score) { | 854 | ) { |
853 | |||
854 | return null; | 855 | return null; |
855 | } | 856 | } |
856 | 857 | ||
857 | $arr['language'] = ucfirst(key($scores)); | 858 | $arr['language'] = key($scores); |
858 | $arr['similarity'] = current($scores); | 859 | $arr['similarity'] = current($scores); |
859 | if (next($scores) !== false) { // if false then no next element | 860 | if (next($scores) !== false) { // if false then no next element |
860 | // the goal is to return a higher value if the distance between | 861 | // the goal is to return a higher value if the distance between |
861 | // the similarity of the first score and the second score is high | 862 | // the similarity of the first score and the second score is high |
862 | 863 | ||
863 | if ($this->_perl_compatible) { | 864 | if ($this->_perl_compatible) { |
864 | 865 | $arr['confidence'] = (current($scores) - $arr['similarity']) | |
865 | $arr['confidence'] = | 866 | / $this->_max_score; |
866 | (current($scores) - $arr['similarity']) / $this->_max_score; | ||
867 | 867 | ||
868 | } else { | 868 | } else { |
869 | |||
870 | $arr['confidence'] = $arr['similarity'] - current($scores); | 869 | $arr['confidence'] = $arr['similarity'] - current($scores); |
871 | 870 | ||
872 | } | 871 | } |
@@ -882,32 +881,26 @@ class Text_LanguageDetect | |||
882 | * Returns the distribution of unicode blocks in a given utf8 string | 881 | * Returns the distribution of unicode blocks in a given utf8 string |
883 | * | 882 | * |
884 | * For the block name of a single char, use unicodeBlockName() | 883 | * For the block name of a single char, use unicodeBlockName() |
885 | * | 884 | * |
886 | * @access public | 885 | * @param string $str input string. Must be ascii or utf8 |
887 | * @param string $str input string. Must be ascii or utf8 | 886 | * @param bool $skip_symbols if true, skip ascii digits, symbols and |
888 | * @param bool $skip_symbols if true, skip ascii digits, symbols and | 887 | * non-printing characters. Includes spaces, |
889 | * non-printing characters. Includes spaces, | 888 | * newlines and common punctutation characters. |
890 | * newlines and common punctutation characters. | 889 | * |
891 | * @return array | 890 | * @return array |
892 | * @throws PEAR_Error | 891 | * @throws Text_LanguageDetect_Exception |
893 | */ | 892 | */ |
894 | function detectUnicodeBlocks($str, $skip_symbols) | 893 | public function detectUnicodeBlocks($str, $skip_symbols) |
895 | { | 894 | { |
896 | // input check | 895 | $skip_symbols = (bool)$skip_symbols; |
897 | if (!is_bool($skip_symbols)) { | 896 | $str = (string)$str; |
898 | throw new Exception('Second parameter must be boolean'); | ||
899 | } | ||
900 | |||
901 | if (!is_string($str)) { | ||
902 | throw new Exception('First parameter was not a string'); | ||
903 | } | ||
904 | 897 | ||
905 | $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); | 898 | $sample_obj = new Text_LanguageDetect_Parser($str); |
906 | $sample_obj->prepareUnicode(); | 899 | $sample_obj->prepareUnicode(); |
907 | $sample_obj->prepareTrigram(false); | 900 | $sample_obj->prepareTrigram(false); |
908 | $sample_obj->setUnicodeSkipSymbols($skip_symbols); | 901 | $sample_obj->setUnicodeSkipSymbols($skip_symbols); |
909 | $sample_obj->analyze(); | 902 | $sample_obj->analyze(); |
910 | $blocks =& $sample_obj->getUnicodeBlocks(); | 903 | $blocks = $sample_obj->getUnicodeBlocks(); |
911 | unset($sample_obj); | 904 | unset($sample_obj); |
912 | return $blocks; | 905 | return $blocks; |
913 | } | 906 | } |
@@ -915,38 +908,37 @@ class Text_LanguageDetect | |||
915 | /** | 908 | /** |
916 | * Returns the block name for a given unicode value | 909 | * Returns the block name for a given unicode value |
917 | * | 910 | * |
918 | * If passed a string, will assume it is being passed a UTF8-formatted | 911 | * If passed a string, will assume it is being passed a UTF8-formatted |
919 | * character and will automatically convert. Otherwise it will assume it | 912 | * character and will automatically convert. Otherwise it will assume it |
920 | * is being passed a numeric unicode value. | 913 | * is being passed a numeric unicode value. |
921 | * | 914 | * |
922 | * Make sure input is of the correct type! | 915 | * Make sure input is of the correct type! |
923 | * | 916 | * |
924 | * @access public | ||
925 | * @param mixed $unicode unicode value or utf8 char | 917 | * @param mixed $unicode unicode value or utf8 char |
918 | * | ||
926 | * @return mixed the block name string or false if not found | 919 | * @return mixed the block name string or false if not found |
927 | * @throws PEAR_Error | 920 | * @throws Text_LanguageDetect_Exception |
928 | */ | 921 | */ |
929 | function unicodeBlockName($unicode) { | 922 | public function unicodeBlockName($unicode) |
923 | { | ||
930 | if (is_string($unicode)) { | 924 | if (is_string($unicode)) { |
931 | // assume it is being passed a utf8 char, so convert it | 925 | // assume it is being passed a utf8 char, so convert it |
932 | 926 | if (self::utf8strlen($unicode) > 1) { | |
933 | // input check | 927 | throw new Text_LanguageDetect_Exception( |
934 | if ($this->utf8strlen($unicode) > 1) { | 928 | 'Pass a single char only to this method', |
935 | throw new Exception('Pass this function only a single char'); | 929 | Text_LanguageDetect_Exception::PARAM_TYPE |
930 | ); | ||
936 | } | 931 | } |
937 | |||
938 | $unicode = $this->_utf8char2unicode($unicode); | 932 | $unicode = $this->_utf8char2unicode($unicode); |
939 | 933 | ||
940 | if ($unicode == -1) { | ||
941 | throw new Exception('Malformatted char'); | ||
942 | } | ||
943 | |||
944 | // input check | ||
945 | } elseif (!is_int($unicode)) { | 934 | } elseif (!is_int($unicode)) { |
946 | throw new Exception('Input must be of type string or int.'); | 935 | throw new Text_LanguageDetect_Exception( |
936 | 'Input must be of type string or int.', | ||
937 | Text_LanguageDetect_Exception::PARAM_TYPE | ||
938 | ); | ||
947 | } | 939 | } |
948 | 940 | ||
949 | $blocks =& $this->_read_unicode_block_db(); | 941 | $blocks = $this->_read_unicode_block_db(); |
950 | 942 | ||
951 | $result = $this->_unicode_block_name($unicode, $blocks); | 943 | $result = $this->_unicode_block_name($unicode, $blocks); |
952 | 944 | ||
@@ -964,14 +956,17 @@ class Text_LanguageDetect | |||
964 | * the public interface for this function, which does input checks which | 956 | * the public interface for this function, which does input checks which |
965 | * this function omits for speed. | 957 | * this function omits for speed. |
966 | * | 958 | * |
967 | * @access protected | 959 | * @param int $unicode the unicode value |
968 | * @param int $unicode the unicode value | 960 | * @param array $blocks the block database |
969 | * @param array &$blocks the block database | 961 | * @param int $block_count the number of defined blocks in the database |
970 | * @param int $block_count the number of defined blocks in the database | 962 | * |
971 | * @see unicodeBlockName() | 963 | * @return mixed Block name, -1 if it failed |
964 | * @see unicodeBlockName() | ||
965 | * @access protected | ||
972 | */ | 966 | */ |
973 | function _unicode_block_name($unicode, &$blocks, $block_count = -1) { | 967 | function _unicode_block_name($unicode, $blocks, $block_count = -1) |
974 | // for a reference, see | 968 | { |
969 | // for a reference, see | ||
975 | // http://www.unicode.org/Public/UNIDATA/Blocks.txt | 970 | // http://www.unicode.org/Public/UNIDATA/Blocks.txt |
976 | 971 | ||
977 | // assume that ascii characters are the most common | 972 | // assume that ascii characters are the most common |
@@ -994,35 +989,36 @@ class Text_LanguageDetect | |||
994 | while ($low <= $high) { | 989 | while ($low <= $high) { |
995 | $mid = floor(($low + $high) / 2); | 990 | $mid = floor(($low + $high) / 2); |
996 | 991 | ||
997 | // if it's lower than the lower bound | ||
998 | if ($unicode < $blocks[$mid][0]) { | 992 | if ($unicode < $blocks[$mid][0]) { |
993 | // if it's lower than the lower bound | ||
999 | $high = $mid - 1; | 994 | $high = $mid - 1; |
1000 | 995 | ||
1001 | // if it's higher than the upper bound | ||
1002 | } elseif ($unicode > $blocks[$mid][1]) { | 996 | } elseif ($unicode > $blocks[$mid][1]) { |
997 | // if it's higher than the upper bound | ||
1003 | $low = $mid + 1; | 998 | $low = $mid + 1; |
1004 | 999 | ||
1005 | // found it | ||
1006 | } else { | 1000 | } else { |
1001 | // found it | ||
1007 | return $blocks[$mid]; | 1002 | return $blocks[$mid]; |
1008 | } | 1003 | } |
1009 | } | 1004 | } |
1010 | 1005 | ||
1011 | // failed to find the block | 1006 | // failed to find the block |
1012 | return -1; | 1007 | return -1; |
1013 | 1008 | ||
1014 | // todo: differentiate when it's out of range or when it falls | 1009 | // todo: differentiate when it's out of range or when it falls |
1015 | // into an unassigned range? | 1010 | // into an unassigned range? |
1016 | } | 1011 | } |
1017 | 1012 | ||
1018 | /** | 1013 | /** |
1019 | * Brings up the unicode block database | 1014 | * Brings up the unicode block database |
1020 | * | 1015 | * |
1021 | * @access protected | ||
1022 | * @return array the database of unicode block definitions | 1016 | * @return array the database of unicode block definitions |
1023 | * @throws PEAR_Error | 1017 | * @throws Text_LanguageDetect_Exception |
1018 | * @access protected | ||
1024 | */ | 1019 | */ |
1025 | function &_read_unicode_block_db() { | 1020 | function _read_unicode_block_db() |
1021 | { | ||
1026 | // since the unicode definitions are always going to be the same, | 1022 | // since the unicode definitions are always going to be the same, |
1027 | // might as well share the memory for the db with all other instances | 1023 | // might as well share the memory for the db with all other instances |
1028 | // of this class | 1024 | // of this class |
@@ -1037,29 +1033,27 @@ class Text_LanguageDetect | |||
1037 | 1033 | ||
1038 | /** | 1034 | /** |
1039 | * Calculate the similarities between the language models | 1035 | * Calculate the similarities between the language models |
1040 | * | 1036 | * |
1041 | * Use this function to see how similar languages are to each other. | 1037 | * Use this function to see how similar languages are to each other. |
1042 | * | 1038 | * |
1043 | * If passed 2 language names, will return just those languages compared. | 1039 | * If passed 2 language names, will return just those languages compared. |
1044 | * If passed 1 language name, will return that language compared to | 1040 | * If passed 1 language name, will return that language compared to |
1045 | * all others. | 1041 | * all others. |
1046 | * If passed none, will return an array of every language model compared | 1042 | * If passed none, will return an array of every language model compared |
1047 | * to every other one. | 1043 | * to every other one. |
1048 | * | 1044 | * |
1049 | * @access public | 1045 | * @param string $lang1 the name of the first language to be compared |
1050 | * @param string $lang1 the name of the first language to be compared | 1046 | * @param string $lang2 the name of the second language to be compared |
1051 | * @param string $lang2 the name of the second language to be compared | 1047 | * |
1052 | * @return array scores of every language compared | 1048 | * @return array scores of every language compared |
1053 | * or the score of just the provided languages | 1049 | * or the score of just the provided languages |
1054 | * or null if one of the supplied languages does not exist | 1050 | * or null if one of the supplied languages does not exist |
1055 | * @throws PEAR_Error | 1051 | * @throws Text_LanguageDetect_Exception |
1056 | */ | 1052 | */ |
1057 | function languageSimilarity($lang1 = null, $lang2 = null) | 1053 | public function languageSimilarity($lang1 = null, $lang2 = null) |
1058 | { | 1054 | { |
1059 | if (!$this->_setup_ok($err)) { | 1055 | $lang1 = $this->_convertFromNameMode($lang1); |
1060 | return $err; | 1056 | $lang2 = $this->_convertFromNameMode($lang2); |
1061 | } | ||
1062 | |||
1063 | if ($lang1 != null) { | 1057 | if ($lang1 != null) { |
1064 | $lang1 = strtolower($lang1); | 1058 | $lang1 = strtolower($lang1); |
1065 | 1059 | ||
@@ -1069,12 +1063,8 @@ class Text_LanguageDetect | |||
1069 | } | 1063 | } |
1070 | 1064 | ||
1071 | if ($lang2 != null) { | 1065 | if ($lang2 != null) { |
1072 | 1066 | if (!isset($this->_lang_db[$lang2])) { | |
1073 | // can't only set the second param | 1067 | // check if language model exists |
1074 | if ($lang1 == null) { | ||
1075 | return null; | ||
1076 | // check if language model exists | ||
1077 | } elseif (!isset($this->_lang_db[$lang2])) { | ||
1078 | return null; | 1068 | return null; |
1079 | } | 1069 | } |
1080 | 1070 | ||
@@ -1088,14 +1078,15 @@ class Text_LanguageDetect | |||
1088 | ) | 1078 | ) |
1089 | ); | 1079 | ); |
1090 | 1080 | ||
1091 | |||
1092 | // compare just $lang1 to all languages | ||
1093 | } else { | 1081 | } else { |
1082 | // compare just $lang1 to all languages | ||
1094 | $return_arr = array(); | 1083 | $return_arr = array(); |
1095 | foreach ($this->_lang_db as $key => $value) { | 1084 | foreach ($this->_lang_db as $key => $value) { |
1096 | if ($key != $lang1) { // don't compare a language to itself | 1085 | if ($key != $lang1) { |
1086 | // don't compare a language to itself | ||
1097 | $return_arr[$key] = $this->_normalize_score( | 1087 | $return_arr[$key] = $this->_normalize_score( |
1098 | $this->_distance($this->_lang_db[$lang1], $value)); | 1088 | $this->_distance($this->_lang_db[$lang1], $value) |
1089 | ); | ||
1099 | } | 1090 | } |
1100 | } | 1091 | } |
1101 | asort($return_arr); | 1092 | asort($return_arr); |
@@ -1104,30 +1095,27 @@ class Text_LanguageDetect | |||
1104 | } | 1095 | } |
1105 | 1096 | ||
1106 | 1097 | ||
1107 | // compare all languages to each other | ||
1108 | } else { | 1098 | } else { |
1099 | // compare all languages to each other | ||
1109 | $return_arr = array(); | 1100 | $return_arr = array(); |
1110 | foreach (array_keys($this->_lang_db) as $lang1) { | 1101 | foreach (array_keys($this->_lang_db) as $lang1) { |
1111 | foreach (array_keys($this->_lang_db) as $lang2) { | 1102 | foreach (array_keys($this->_lang_db) as $lang2) { |
1112 | |||
1113 | // skip comparing languages to themselves | 1103 | // skip comparing languages to themselves |
1114 | if ($lang1 != $lang2) { | 1104 | if ($lang1 != $lang2) { |
1115 | |||
1116 | // don't re-calculate what's already been done | ||
1117 | if (isset($return_arr[$lang2][$lang1])) { | ||
1118 | 1105 | ||
1119 | $return_arr[$lang1][$lang2] = | 1106 | if (isset($return_arr[$lang2][$lang1])) { |
1120 | $return_arr[$lang2][$lang1]; | 1107 | // don't re-calculate what's already been done |
1108 | $return_arr[$lang1][$lang2] | ||
1109 | = $return_arr[$lang2][$lang1]; | ||
1121 | 1110 | ||
1122 | // calculate | ||
1123 | } else { | 1111 | } else { |
1124 | 1112 | // calculate | |
1125 | $return_arr[$lang1][$lang2] = | 1113 | $return_arr[$lang1][$lang2] |
1126 | $this->_normalize_score( | 1114 | = $this->_normalize_score( |
1127 | $this->_distance( | 1115 | $this->_distance( |
1128 | $this->_lang_db[$lang1], | 1116 | $this->_lang_db[$lang1], |
1129 | $this->_lang_db[$lang2] | 1117 | $this->_lang_db[$lang2] |
1130 | ) | 1118 | ) |
1131 | ); | 1119 | ); |
1132 | 1120 | ||
1133 | } | 1121 | } |
@@ -1150,20 +1138,14 @@ class Text_LanguageDetect | |||
1150 | * | 1138 | * |
1151 | * @access public | 1139 | * @access public |
1152 | * @return array language cluster data | 1140 | * @return array language cluster data |
1153 | * @throws PEAR_Error | 1141 | * @throws Text_LanguageDetect_Exception |
1154 | * @see languageSimilarity() | 1142 | * @see languageSimilarity() |
1155 | * @deprecated this function will eventually be removed and placed into | 1143 | * @deprecated this function will eventually be removed and placed into |
1156 | * the model generation class | 1144 | * the model generation class |
1157 | */ | 1145 | */ |
1158 | function clusterLanguages() | 1146 | function clusterLanguages() |
1159 | { | 1147 | { |
1160 | // todo: set the maximum number of clusters | 1148 | // todo: set the maximum number of clusters |
1161 | |||
1162 | // setup check | ||
1163 | if (!$this->_setup_ok($err)) { | ||
1164 | return $err; | ||
1165 | } | ||
1166 | |||
1167 | // return cached result, if any | 1149 | // return cached result, if any |
1168 | if (isset($this->_clusters)) { | 1150 | if (isset($this->_clusters)) { |
1169 | return $this->_clusters; | 1151 | return $this->_clusters; |
@@ -1177,7 +1159,10 @@ class Text_LanguageDetect | |||
1177 | 1159 | ||
1178 | foreach ($langs as $lang) { | 1160 | foreach ($langs as $lang) { |
1179 | if (!isset($this->_lang_db[$lang])) { | 1161 | if (!isset($this->_lang_db[$lang])) { |
1180 | throw new Exception("missing $lang!\n"); | 1162 | throw new Text_LanguageDetect_Exception( |
1163 | "missing $lang!", | ||
1164 | Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE | ||
1165 | ); | ||
1181 | } | 1166 | } |
1182 | } | 1167 | } |
1183 | 1168 | ||
@@ -1186,7 +1171,9 @@ class Text_LanguageDetect | |||
1186 | $langs[$lang1] = $lang1; | 1171 | $langs[$lang1] = $lang1; |
1187 | unset($langs[$old_key]); | 1172 | unset($langs[$old_key]); |
1188 | } | 1173 | } |
1189 | 1174 | ||
1175 | $result_data = $really_map = array(); | ||
1176 | |||
1190 | $i = 0; | 1177 | $i = 0; |
1191 | while (count($langs) > 2 && $i++ < 200) { | 1178 | while (count($langs) > 2 && $i++ < 200) { |
1192 | $highest_score = -1; | 1179 | $highest_score = -1; |
@@ -1194,18 +1181,22 @@ class Text_LanguageDetect | |||
1194 | $highest_key2 = ''; | 1181 | $highest_key2 = ''; |
1195 | foreach ($langs as $lang1) { | 1182 | foreach ($langs as $lang1) { |
1196 | foreach ($langs as $lang2) { | 1183 | foreach ($langs as $lang2) { |
1197 | if ( $lang1 != $lang2 | 1184 | if ($lang1 != $lang2 |
1198 | && $arr[$lang1][$lang2] > $highest_score) { | 1185 | && $arr[$lang1][$lang2] > $highest_score |
1186 | ) { | ||
1199 | $highest_score = $arr[$lang1][$lang2]; | 1187 | $highest_score = $arr[$lang1][$lang2]; |
1200 | $highest_key1 = $lang1; | 1188 | $highest_key1 = $lang1; |
1201 | $highest_key2 = $lang2; | 1189 | $highest_key2 = $lang2; |
1202 | } | 1190 | } |
1203 | } | 1191 | } |
1204 | } | 1192 | } |
1205 | 1193 | ||
1206 | if (!$highest_key1) { | 1194 | if (!$highest_key1) { |
1207 | // should not ever happen | 1195 | // should not ever happen |
1208 | throw new Exception("no highest key? (step: $i)"); | 1196 | throw new Text_LanguageDetect_Exception( |
1197 | "no highest key? (step: $i)", | ||
1198 | Text_LanguageDetect_Exception::NO_HIGHEST_KEY | ||
1199 | ); | ||
1209 | } | 1200 | } |
1210 | 1201 | ||
1211 | if ($highest_score == 0) { | 1202 | if ($highest_score == 0) { |
@@ -1217,7 +1208,7 @@ class Text_LanguageDetect | |||
1217 | $sum1 = array_sum($arr[$highest_key1]); | 1208 | $sum1 = array_sum($arr[$highest_key1]); |
1218 | $sum2 = array_sum($arr[$highest_key2]); | 1209 | $sum2 = array_sum($arr[$highest_key2]); |
1219 | 1210 | ||
1220 | // use the score for the one that is most similar to the rest of | 1211 | // use the score for the one that is most similar to the rest of |
1221 | // the field as the score for the group | 1212 | // the field as the score for the group |
1222 | // todo: could try averaging or "centroid" method instead | 1213 | // todo: could try averaging or "centroid" method instead |
1223 | // seems like that might make more sense | 1214 | // seems like that might make more sense |
@@ -1248,7 +1239,7 @@ class Text_LanguageDetect | |||
1248 | $really_lang = $replaceme; | 1239 | $really_lang = $replaceme; |
1249 | while (isset($really_map[$really_lang])) { | 1240 | while (isset($really_map[$really_lang])) { |
1250 | $really_lang = $really_map[$really_lang]; | 1241 | $really_lang = $really_map[$really_lang]; |
1251 | } | 1242 | } |
1252 | $really_map[$newkey] = $really_lang; | 1243 | $really_map[$newkey] = $really_lang; |
1253 | 1244 | ||
1254 | 1245 | ||
@@ -1259,8 +1250,8 @@ class Text_LanguageDetect | |||
1259 | $arr[$key1][$newkey] = $arr[$key1][$key2]; | 1250 | $arr[$key1][$newkey] = $arr[$key1][$key2]; |
1260 | unset($arr[$key1][$key2]); | 1251 | unset($arr[$key1][$key2]); |
1261 | // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] | 1252 | // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] |
1262 | } | 1253 | } |
1263 | 1254 | ||
1264 | if ($key1 == $replaceme) { | 1255 | if ($key1 == $replaceme) { |
1265 | $arr[$newkey][$key2] = $arr[$key1][$key2]; | 1256 | $arr[$newkey][$key2] = $arr[$key1][$key2]; |
1266 | unset($arr[$key1][$key2]); | 1257 | unset($arr[$key1][$key2]); |
@@ -1273,7 +1264,7 @@ class Text_LanguageDetect | |||
1273 | } | 1264 | } |
1274 | } | 1265 | } |
1275 | } | 1266 | } |
1276 | 1267 | ||
1277 | 1268 | ||
1278 | unset($langs[$highest_key1]); | 1269 | unset($langs[$highest_key1]); |
1279 | unset($langs[$highest_key2]); | 1270 | unset($langs[$highest_key2]); |
@@ -1293,7 +1284,7 @@ class Text_LanguageDetect | |||
1293 | } | 1284 | } |
1294 | 1285 | ||
1295 | $return_val = array( | 1286 | $return_val = array( |
1296 | 'open_forks' => $langs, | 1287 | 'open_forks' => $langs, |
1297 | // the top level of clusters | 1288 | // the top level of clusters |
1298 | // clusters that are mutually exclusive | 1289 | // clusters that are mutually exclusive |
1299 | // or specified by a specific maximum | 1290 | // or specified by a specific maximum |
@@ -1323,11 +1314,11 @@ class Text_LanguageDetect | |||
1323 | * use, and it may disappear or its functionality may change in future | 1314 | * use, and it may disappear or its functionality may change in future |
1324 | * releases without notice. | 1315 | * releases without notice. |
1325 | * | 1316 | * |
1326 | * This compares the sample text to top the top level of clusters. If the | 1317 | * This compares the sample text to top the top level of clusters. If the |
1327 | * sample is similar to the cluster it will drop down and compare it to the | 1318 | * sample is similar to the cluster it will drop down and compare it to the |
1328 | * languages in the cluster, and so on until it hits a leaf node. | 1319 | * languages in the cluster, and so on until it hits a leaf node. |
1329 | * | 1320 | * |
1330 | * this should find the language in considerably fewer compares | 1321 | * this should find the language in considerably fewer compares |
1331 | * (the equivalent of a binary search), however clusterLanguages() is costly | 1322 | * (the equivalent of a binary search), however clusterLanguages() is costly |
1332 | * and the loss of accuracy from this technique is significant. | 1323 | * and the loss of accuracy from this technique is significant. |
1333 | * | 1324 | * |
@@ -1337,15 +1328,14 @@ class Text_LanguageDetect | |||
1337 | * was very large, however in such cases some method of Bayesian inference | 1328 | * was very large, however in such cases some method of Bayesian inference |
1338 | * might be more helpful. | 1329 | * might be more helpful. |
1339 | * | 1330 | * |
1340 | * @see clusterLanguages() | 1331 | * @param string $str input string |
1341 | * @access public | 1332 | * |
1342 | * @param string $str input string | 1333 | * @return array language scores (only those compared) |
1343 | * @return array language scores (only those compared) | 1334 | * @throws Text_LanguageDetect_Exception |
1344 | * @throws PEAR_Error | 1335 | * @see clusterLanguages() |
1345 | */ | 1336 | */ |
1346 | function clusteredSearch($str) | 1337 | public function clusteredSearch($str) |
1347 | { | 1338 | { |
1348 | |||
1349 | // input check | 1339 | // input check |
1350 | if (!Text_LanguageDetect_Parser::validateString($str)) { | 1340 | if (!Text_LanguageDetect_Parser::validateString($str)) { |
1351 | return array(); | 1341 | return array(); |
@@ -1359,7 +1349,7 @@ class Text_LanguageDetect | |||
1359 | $dendogram_data = $result['fork_data']; | 1349 | $dendogram_data = $result['fork_data']; |
1360 | $dendogram_alias = $result['name_map']; | 1350 | $dendogram_alias = $result['name_map']; |
1361 | 1351 | ||
1362 | $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); | 1352 | $sample_obj = new Text_LanguageDetect_Parser($str); |
1363 | $sample_obj->prepareTrigram(); | 1353 | $sample_obj->prepareTrigram(); |
1364 | $sample_obj->setPadStart(!$this->_perl_compatible); | 1354 | $sample_obj->setPadStart(!$this->_perl_compatible); |
1365 | $sample_obj->analyze(); | 1355 | $sample_obj->analyze(); |
@@ -1372,7 +1362,7 @@ class Text_LanguageDetect | |||
1372 | } | 1362 | } |
1373 | 1363 | ||
1374 | $i = 0; // counts the number of steps | 1364 | $i = 0; // counts the number of steps |
1375 | 1365 | ||
1376 | foreach ($dendogram_start as $lang) { | 1366 | foreach ($dendogram_start as $lang) { |
1377 | if (isset($dendogram_alias[$lang])) { | 1367 | if (isset($dendogram_alias[$lang])) { |
1378 | $lang_key = $dendogram_alias[$lang]; | 1368 | $lang_key = $dendogram_alias[$lang]; |
@@ -1382,7 +1372,8 @@ class Text_LanguageDetect | |||
1382 | 1372 | ||
1383 | $scores[$lang] = $this->_normalize_score( | 1373 | $scores[$lang] = $this->_normalize_score( |
1384 | $this->_distance($this->_lang_db[$lang_key], $sample_result), | 1374 | $this->_distance($this->_lang_db[$lang_key], $sample_result), |
1385 | $sample_count); | 1375 | $sample_count |
1376 | ); | ||
1386 | 1377 | ||
1387 | $i++; | 1378 | $i++; |
1388 | } | 1379 | } |
@@ -1411,7 +1402,8 @@ class Text_LanguageDetect | |||
1411 | 1402 | ||
1412 | $scores[$lang] = $this->_normalize_score( | 1403 | $scores[$lang] = $this->_normalize_score( |
1413 | $this->_distance($this->_lang_db[$lang_key], $sample_result), | 1404 | $this->_distance($this->_lang_db[$lang_key], $sample_result), |
1414 | $sample_count); | 1405 | $sample_count |
1406 | ); | ||
1415 | 1407 | ||
1416 | //todo: does not need to do same comparison again | 1408 | //todo: does not need to do same comparison again |
1417 | } | 1409 | } |
@@ -1428,8 +1420,8 @@ class Text_LanguageDetect | |||
1428 | 1420 | ||
1429 | $diff = $scores[$cur_key] - $scores[$loser_key]; | 1421 | $diff = $scores[$cur_key] - $scores[$loser_key]; |
1430 | 1422 | ||
1431 | // $cur_key ({$dendogram_alias[$cur_key]}) wins | 1423 | // $cur_key ({$dendogram_alias[$cur_key]}) wins |
1432 | // over $loser_key ({$dendogram_alias[$loser_key]}) | 1424 | // over $loser_key ({$dendogram_alias[$loser_key]}) |
1433 | // with a difference of $diff | 1425 | // with a difference of $diff |
1434 | } | 1426 | } |
1435 | 1427 | ||
@@ -1439,9 +1431,9 @@ class Text_LanguageDetect | |||
1439 | // which paths the algorithm decided to take along the tree | 1431 | // which paths the algorithm decided to take along the tree |
1440 | 1432 | ||
1441 | // but sometimes the last item is only the second highest | 1433 | // but sometimes the last item is only the second highest |
1442 | if ( ($this->_perl_compatible && (end($scores) > prev($scores))) | 1434 | if (($this->_perl_compatible && (end($scores) > prev($scores))) |
1443 | || (!$this->_perl_compatible && (end($scores) < prev($scores)))) { | 1435 | || (!$this->_perl_compatible && (end($scores) < prev($scores))) |
1444 | 1436 | ) { | |
1445 | $real_last_score = current($scores); | 1437 | $real_last_score = current($scores); |
1446 | $real_last_key = key($scores); | 1438 | $real_last_key = key($scores); |
1447 | 1439 | ||
@@ -1449,7 +1441,7 @@ class Text_LanguageDetect | |||
1449 | unset($scores[$real_last_key]); | 1441 | unset($scores[$real_last_key]); |
1450 | $scores[$real_last_key] = $real_last_score; | 1442 | $scores[$real_last_key] = $real_last_score; |
1451 | } | 1443 | } |
1452 | 1444 | ||
1453 | 1445 | ||
1454 | if (!$this->_perl_compatible) { | 1446 | if (!$this->_perl_compatible) { |
1455 | $scores = array_reverse($scores, true); | 1447 | $scores = array_reverse($scores, true); |
@@ -1464,12 +1456,11 @@ class Text_LanguageDetect | |||
1464 | * | 1456 | * |
1465 | * Returns the numbers of characters (not bytes) in a utf8 string | 1457 | * Returns the numbers of characters (not bytes) in a utf8 string |
1466 | * | 1458 | * |
1467 | * @static | 1459 | * @param string $str string to get the length of |
1468 | * @access public | 1460 | * |
1469 | * @param string $str string to get the length of | 1461 | * @return int number of chars |
1470 | * @return int number of chars | ||
1471 | */ | 1462 | */ |
1472 | function utf8strlen($str) | 1463 | public static function utf8strlen($str) |
1473 | { | 1464 | { |
1474 | // utf8_decode() will convert unknown chars to '?', which is actually | 1465 | // utf8_decode() will convert unknown chars to '?', which is actually |
1475 | // ideal for counting. | 1466 | // ideal for counting. |
@@ -1482,53 +1473,45 @@ class Text_LanguageDetect | |||
1482 | /** | 1473 | /** |
1483 | * Returns the unicode value of a utf8 char | 1474 | * Returns the unicode value of a utf8 char |
1484 | * | 1475 | * |
1485 | * @access protected | 1476 | * @param string $char a utf8 (possibly multi-byte) char |
1486 | * @param string $char a utf8 (possibly multi-byte) char | 1477 | * |
1487 | * @return int unicode value or -1 if malformatted | 1478 | * @return int unicode value |
1479 | * @access protected | ||
1480 | * @link http://en.wikipedia.org/wiki/UTF-8 | ||
1488 | */ | 1481 | */ |
1489 | function _utf8char2unicode($char) { | 1482 | function _utf8char2unicode($char) |
1490 | 1483 | { | |
1491 | // strlen() here will actually get the binary length of a single char | 1484 | // strlen() here will actually get the binary length of a single char |
1492 | switch (strlen($char)) { | 1485 | switch (strlen($char)) { |
1493 | 1486 | case 1: | |
1494 | // for a reference, see http://en.wikipedia.org/wiki/UTF-8 | 1487 | // normal ASCII-7 byte |
1495 | 1488 | // 0xxxxxxx --> 0xxxxxxx | |
1496 | case 1: | 1489 | return ord($char{0}); |
1497 | // normal ASCII-7 byte | 1490 | |
1498 | // 0xxxxxxx --> 0xxxxxxx | 1491 | case 2: |
1499 | return ord($char{0}); | 1492 | // 2 byte unicode |
1500 | 1493 | // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx | |
1501 | case 2: | 1494 | $z = (ord($char{0}) & 0x000001F) << 6; |
1502 | // 2 byte unicode | 1495 | $x = (ord($char{1}) & 0x0000003F); |
1503 | // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx | 1496 | return ($z | $x); |
1504 | $z = (ord($char{0}) & 0x000001F) << 6; | 1497 | |
1505 | $x = (ord($char{1}) & 0x0000003F); | 1498 | case 3: |
1506 | 1499 | // 3 byte unicode | |
1507 | return ($z | $x); | 1500 | // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx |
1508 | 1501 | $z = (ord($char{0}) & 0x0000000F) << 12; | |
1509 | case 3: | 1502 | $x1 = (ord($char{1}) & 0x0000003F) << 6; |
1510 | // 3 byte unicode | 1503 | $x2 = (ord($char{2}) & 0x0000003F); |
1511 | // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx | 1504 | return ($z | $x1 | $x2); |
1512 | $z = (ord($char{0}) & 0x0000000F) << 12; | 1505 | |
1513 | $x1 = (ord($char{1}) & 0x0000003F) << 6; | 1506 | case 4: |
1514 | $x2 = (ord($char{2}) & 0x0000003F); | 1507 | // 4 byte unicode |
1515 | 1508 | // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx --> | |
1516 | return ($z | $x1 | $x2); | 1509 | // 000zzzzz xxxxxxxx xxxxxxxx |
1517 | 1510 | $z1 = (ord($char{0}) & 0x00000007) << 18; | |
1518 | case 4: | 1511 | $z2 = (ord($char{1}) & 0x0000003F) << 12; |
1519 | // 4 byte unicode | 1512 | $x1 = (ord($char{2}) & 0x0000003F) << 6; |
1520 | // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx --> | 1513 | $x2 = (ord($char{3}) & 0x0000003F); |
1521 | // 000zzzzz xxxxxxxx xxxxxxxx | 1514 | return ($z1 | $z2 | $x1 | $x2); |
1522 | $z1 = (ord($char{0}) & 0x00000007) << 18; | ||
1523 | $z2 = (ord($char{1}) & 0x0000003F) << 12; | ||
1524 | $x1 = (ord($char{2}) & 0x0000003F) << 6; | ||
1525 | $x2 = (ord($char{3}) & 0x0000003F); | ||
1526 | |||
1527 | return ($z1 | $z2 | $x1 | $x2); | ||
1528 | |||
1529 | default: | ||
1530 | // error: malformatted char? | ||
1531 | return -1; | ||
1532 | } | 1515 | } |
1533 | } | 1516 | } |
1534 | 1517 | ||
@@ -1536,18 +1519,18 @@ class Text_LanguageDetect | |||
1536 | * utf8-safe fast character iterator | 1519 | * utf8-safe fast character iterator |
1537 | * | 1520 | * |
1538 | * Will get the next character starting from $counter, which will then be | 1521 | * Will get the next character starting from $counter, which will then be |
1539 | * incremented. If a multi-byte char the bytes will be concatenated and | 1522 | * incremented. If a multi-byte char the bytes will be concatenated and |
1540 | * $counter will be incremeted by the number of bytes in the char. | 1523 | * $counter will be incremeted by the number of bytes in the char. |
1541 | * | 1524 | * |
1542 | * @access private | 1525 | * @param string $str the string being iterated over |
1543 | * @param string &$str the string being iterated over | 1526 | * @param int &$counter the iterator, will increment by reference |
1544 | * @param int &$counter the iterator, will increment by reference | 1527 | * @param bool $special_convert whether to do special conversions |
1545 | * @param bool $special_convert whether to do special conversions | 1528 | * |
1546 | * @return char the next (possibly multi-byte) char from $counter | 1529 | * @return char the next (possibly multi-byte) char from $counter |
1530 | * @access private | ||
1547 | */ | 1531 | */ |
1548 | function _next_char(&$str, &$counter, $special_convert = false) | 1532 | static function _next_char($str, &$counter, $special_convert = false) |
1549 | { | 1533 | { |
1550 | |||
1551 | $char = $str{$counter++}; | 1534 | $char = $str{$counter++}; |
1552 | $ord = ord($char); | 1535 | $ord = ord($char); |
1553 | 1536 | ||
@@ -1556,7 +1539,6 @@ class Text_LanguageDetect | |||
1556 | 1539 | ||
1557 | // normal ascii one byte char | 1540 | // normal ascii one byte char |
1558 | if ($ord <= 127) { | 1541 | if ($ord <= 127) { |
1559 | |||
1560 | // special conversions needed for this package | 1542 | // special conversions needed for this package |
1561 | // (that only apply to regular ascii characters) | 1543 | // (that only apply to regular ascii characters) |
1562 | // lower case, and convert all non-alphanumeric characters | 1544 | // lower case, and convert all non-alphanumeric characters |
@@ -1571,8 +1553,8 @@ class Text_LanguageDetect | |||
1571 | 1553 | ||
1572 | return $char; | 1554 | return $char; |
1573 | 1555 | ||
1574 | // multi-byte chars | ||
1575 | } elseif ($ord >> 5 == 6) { // two-byte char | 1556 | } elseif ($ord >> 5 == 6) { // two-byte char |
1557 | // multi-byte chars | ||
1576 | $nextchar = $str{$counter++}; // get next byte | 1558 | $nextchar = $str{$counter++}; // get next byte |
1577 | 1559 | ||
1578 | // lower-casing of non-ascii characters is still incomplete | 1560 | // lower-casing of non-ascii characters is still incomplete |
@@ -1582,27 +1564,27 @@ class Text_LanguageDetect | |||
1582 | if ($ord == 195) { | 1564 | if ($ord == 195) { |
1583 | $nextord = ord($nextchar); | 1565 | $nextord = ord($nextchar); |
1584 | $nextord_adj = $nextord + 64; | 1566 | $nextord_adj = $nextord + 64; |
1585 | // for a reference, see | 1567 | // for a reference, see |
1586 | // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html | 1568 | // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html |
1587 | 1569 | ||
1588 | // À - Þ but not × | 1570 | // À - Þ but not × |
1589 | if ( $nextord_adj >= 192 | 1571 | if ($nextord_adj >= 192 |
1590 | && $nextord_adj <= 222 | 1572 | && $nextord_adj <= 222 |
1591 | && $nextord_adj != 215) { | 1573 | && $nextord_adj != 215 |
1592 | 1574 | ) { | |
1593 | $nextchar = chr($nextord + 32); | 1575 | $nextchar = chr($nextord + 32); |
1594 | } | 1576 | } |
1595 | 1577 | ||
1596 | // lower case cyrillic alphabet | ||
1597 | } elseif ($ord == 208) { | 1578 | } elseif ($ord == 208) { |
1579 | // lower case cyrillic alphabet | ||
1598 | $nextord = ord($nextchar); | 1580 | $nextord = ord($nextchar); |
1599 | // if A - Pe | 1581 | // if A - Pe |
1600 | if ($nextord >= 144 && $nextord <= 159) { | 1582 | if ($nextord >= 144 && $nextord <= 159) { |
1601 | // lower case | 1583 | // lower case |
1602 | $nextchar = chr($nextord + 32); | 1584 | $nextchar = chr($nextord + 32); |
1603 | 1585 | ||
1604 | // if Er - Ya | ||
1605 | } elseif ($nextord >= 160 && $nextord <= 175) { | 1586 | } elseif ($nextord >= 160 && $nextord <= 175) { |
1587 | // if Er - Ya | ||
1606 | // lower case | 1588 | // lower case |
1607 | $char = chr(209); // == $ord++ | 1589 | $char = chr(209); // == $ord++ |
1608 | $nextchar = chr($nextord - 32); | 1590 | $nextchar = chr($nextord - 32); |
@@ -1611,12 +1593,11 @@ class Text_LanguageDetect | |||
1611 | } | 1593 | } |
1612 | 1594 | ||
1613 | // tag on next byte | 1595 | // tag on next byte |
1614 | return $char . $nextchar; | 1596 | return $char . $nextchar; |
1615 | |||
1616 | } elseif ($ord >> 4 == 14) { // three-byte char | 1597 | } elseif ($ord >> 4 == 14) { // three-byte char |
1617 | 1598 | ||
1618 | // tag on next 2 bytes | 1599 | // tag on next 2 bytes |
1619 | return $char . $str{$counter++} . $str{$counter++}; | 1600 | return $char . $str{$counter++} . $str{$counter++}; |
1620 | 1601 | ||
1621 | } elseif ($ord >> 3 == 30) { // four-byte char | 1602 | } elseif ($ord >> 3 == 30) { // four-byte char |
1622 | 1603 | ||
@@ -1628,8 +1609,85 @@ class Text_LanguageDetect | |||
1628 | } | 1609 | } |
1629 | } | 1610 | } |
1630 | 1611 | ||
1631 | } | 1612 | /** |
1613 | * Converts an $language input parameter from the configured mode | ||
1614 | * to the language name that is used internally. | ||
1615 | * | ||
1616 | * Works for strings and arrays. | ||
1617 | * | ||
1618 | * @param string|array $lang A language description ("english"/"en"/"eng") | ||
1619 | * @param boolean $convertKey If $lang is an array, setting $key | ||
1620 | * converts the keys to the language name. | ||
1621 | * | ||
1622 | * @return string|array Language name | ||
1623 | */ | ||
1624 | function _convertFromNameMode($lang, $convertKey = false) | ||
1625 | { | ||
1626 | if ($this->_name_mode == 0) { | ||
1627 | return $lang; | ||
1628 | } | ||
1629 | |||
1630 | if ($this->_name_mode == 2) { | ||
1631 | $method = 'code2ToName'; | ||
1632 | } else { | ||
1633 | $method = 'code3ToName'; | ||
1634 | } | ||
1635 | |||
1636 | if (is_string($lang)) { | ||
1637 | return (string)Text_LanguageDetect_ISO639::$method($lang); | ||
1638 | } | ||
1639 | |||
1640 | $newlang = array(); | ||
1641 | foreach ($lang as $key => $val) { | ||
1642 | if ($convertKey) { | ||
1643 | $newkey = (string)Text_LanguageDetect_ISO639::$method($key); | ||
1644 | $newlang[$newkey] = $val; | ||
1645 | } else { | ||
1646 | $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val); | ||
1647 | } | ||
1648 | } | ||
1649 | return $newlang; | ||
1650 | } | ||
1632 | 1651 | ||
1633 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ | 1652 | /** |
1653 | * Converts an $language output parameter from the language name that is | ||
1654 | * used internally to the configured mode. | ||
1655 | * | ||
1656 | * Works for strings and arrays. | ||
1657 | * | ||
1658 | * @param string|array $lang A language description ("english"/"en"/"eng") | ||
1659 | * @param boolean $convertKey If $lang is an array, setting $key | ||
1660 | * converts the keys to the language name. | ||
1661 | * | ||
1662 | * @return string|array Language name | ||
1663 | */ | ||
1664 | function _convertToNameMode($lang, $convertKey = false) | ||
1665 | { | ||
1666 | if ($this->_name_mode == 0) { | ||
1667 | return $lang; | ||
1668 | } | ||
1669 | |||
1670 | if ($this->_name_mode == 2) { | ||
1671 | $method = 'nameToCode2'; | ||
1672 | } else { | ||
1673 | $method = 'nameToCode3'; | ||
1674 | } | ||
1675 | |||
1676 | if (is_string($lang)) { | ||
1677 | return Text_LanguageDetect_ISO639::$method($lang); | ||
1678 | } | ||
1679 | |||
1680 | $newlang = array(); | ||
1681 | foreach ($lang as $key => $val) { | ||
1682 | if ($convertKey) { | ||
1683 | $newkey = Text_LanguageDetect_ISO639::$method($key); | ||
1684 | $newlang[$newkey] = $val; | ||
1685 | } else { | ||
1686 | $newlang[$key] = Text_LanguageDetect_ISO639::$method($val); | ||
1687 | } | ||
1688 | } | ||
1689 | return $newlang; | ||
1690 | } | ||
1691 | } | ||
1634 | 1692 | ||
1635 | ?> | 1693 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file |