aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc/3rdparty/libraries/language-detect/LanguageDetect
diff options
context:
space:
mode:
author Nicolas Lœuillet <nicolas@loeuillet.org> 2014-05-29 18:54:06 +0200
committer Nicolas Lœuillet <nicolas@loeuillet.org> 2014-05-29 18:54:06 +0200
commita9f5e572dde4f986a498d2fbe92a38a1b22f9595 (patch)
tree80b5bfc9836ae92cc4929a4d72ae0b2730e568bc /inc/3rdparty/libraries/language-detect/LanguageDetect
parent96834a47b09985e1c82b82857fc108f20e8b8f2b (diff)
parent8038b38802769031e050c753fc0a388a2276629e (diff)
downloadwallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.gz
wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.zst
wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.zip
Merge pull request #712 from wallabag/dev1.7.0
1.7, call me "Premium version"
Diffstat (limited to 'inc/3rdparty/libraries/language-detect/LanguageDetect')
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php57
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php339
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php347
3 files changed, 743 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
new file mode 100644
index 00000000..196d994f
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
@@ -0,0 +1,57 @@
1<?php
/**
 * Exception subclass that only defines numeric error-code constants.
 *
 * The class adds no methods or properties to Exception. The codes are
 * grouped by tens: 1x database/environment, 2x parameters, 30 unknown
 * language, 40 block detection, 50 clustering.
 */
class Text_LanguageDetect_Exception extends Exception
{
    /* ---- 1x: database / environment ---------------------------------- */

    /**
     * The database file does not exist
     */
    const DB_NOT_FOUND = 10;

    /**
     * The database file exists but cannot be read
     */
    const DB_NOT_READABLE = 11;

    /**
     * The database file has no content
     */
    const DB_EMPTY = 12;

    /**
     * The database content is not a PHP array
     */
    const DB_NOT_ARRAY = 13;

    /**
     * Magic quotes are switched on
     */
    const MAGIC_QUOTES = 14;

    /* ---- 2x: invalid parameters -------------------------------------- */

    /**
     * A method received a parameter of the wrong type
     */
    const PARAM_TYPE = 20;

    /**
     * A parameter contains an invalid character
     */
    const INVALID_CHAR = 21;

    /* ---- 30: language lookup ----------------------------------------- */

    /**
     * The language is not part of the database
     */
    const UNKNOWN_LANGUAGE = 30;

    /* ---- 40: unicode block detection --------------------------------- */

    /**
     * Block detection failed
     */
    const BLOCK_DETECTION = 40;

    /* ---- 50: clustering ---------------------------------------------- */

    /**
     * Clustering languages failed
     */
    const NO_HIGHEST_KEY = 50;
}
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
new file mode 100644
index 00000000..05b0590d
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
@@ -0,0 +1,339 @@
1<?php
2/**
3 * Part of Text_LanguageDetect
4 *
5 * PHP version 5
6 *
7 * @category Text
8 * @package Text_LanguageDetect
9 * @author Christian Weiske <cweiske@php.net>
10 * @copyright 2011 Christian Weiske <cweiske@php.net>
11 * @license http://www.debian.org/misc/bsd.license BSD
12 * @version SVN: $Id$
13 * @link http://pear.php.net/package/Text_LanguageDetect/
14 */
15
16/**
17 * Provides a mapping between the languages from lang.dat and the
18 * ISO 639-1 and ISO-639-2 codes.
19 *
20 * Note that this class contains only languages that exist in lang.dat.
21 *
22 * @category Text
23 * @package Text_LanguageDetect
24 * @author Christian Weiske <cweiske@php.net>
25 * @copyright 2011 Christian Weiske <cweiske@php.net>
26 * @license http://www.debian.org/misc/bsd.license BSD
27 * @link http://www.loc.gov/standards/iso639-2/php/code_list.php
28 */
class Text_LanguageDetect_ISO639
{
    /**
     * Maps all language names from the language database to the
     * ISO 639-1 2-letter language code.
     *
     * NULL indicates that there is no 2-letter code.
     *
     * @var array
     */
    public static $nameToCode2 = array(
        'albanian'   => 'sq',
        'arabic'     => 'ar',
        'azeri'      => 'az',
        'bengali'    => 'bn',
        'bulgarian'  => 'bg',
        'cebuano'    => null,
        'croatian'   => 'hr',
        'czech'      => 'cs',
        'danish'     => 'da',
        'dutch'      => 'nl',
        'english'    => 'en',
        'estonian'   => 'et',
        'farsi'      => 'fa',
        'finnish'    => 'fi',
        'french'     => 'fr',
        'german'     => 'de',
        'hausa'      => 'ha',
        'hawaiian'   => null,
        'hindi'      => 'hi',
        'hungarian'  => 'hu',
        'icelandic'  => 'is',
        'indonesian' => 'id',
        'italian'    => 'it',
        'kazakh'     => 'kk',
        'kyrgyz'     => 'ky',
        'latin'      => 'la',
        'latvian'    => 'lv',
        'lithuanian' => 'lt',
        'macedonian' => 'mk',
        'mongolian'  => 'mn',
        'nepali'     => 'ne',
        'norwegian'  => 'no',
        'pashto'     => 'ps',
        'pidgin'     => null,
        'polish'     => 'pl',
        'portuguese' => 'pt',
        'romanian'   => 'ro',
        'russian'    => 'ru',
        'serbian'    => 'sr',
        'slovak'     => 'sk',
        'slovene'    => 'sl',
        'somali'     => 'so',
        'spanish'    => 'es',
        'swahili'    => 'sw',
        'swedish'    => 'sv',
        'tagalog'    => 'tl',
        'turkish'    => 'tr',
        'ukrainian'  => 'uk',
        'urdu'       => 'ur',
        'uzbek'      => 'uz',
        'vietnamese' => 'vi',
        'welsh'      => 'cy',
    );

    /**
     * Maps all language names from the language database to the
     * ISO 639-2 3-letter language code.
     *
     * @var array
     */
    public static $nameToCode3 = array(
        'albanian'   => 'sqi',
        'arabic'     => 'ara',
        'azeri'      => 'aze',
        'bengali'    => 'ben',
        'bulgarian'  => 'bul',
        'cebuano'    => 'ceb',
        'croatian'   => 'hrv',
        'czech'      => 'ces',
        'danish'     => 'dan',
        'dutch'      => 'nld',
        'english'    => 'eng',
        'estonian'   => 'est',
        'farsi'      => 'fas',
        'finnish'    => 'fin',
        'french'     => 'fra',
        'german'     => 'deu',
        'hausa'      => 'hau',
        'hawaiian'   => 'haw',
        'hindi'      => 'hin',
        'hungarian'  => 'hun',
        'icelandic'  => 'isl',
        'indonesian' => 'ind',
        'italian'    => 'ita',
        'kazakh'     => 'kaz',
        'kyrgyz'     => 'kir',
        'latin'      => 'lat',
        'latvian'    => 'lav',
        'lithuanian' => 'lit',
        'macedonian' => 'mkd',
        'mongolian'  => 'mon',
        'nepali'     => 'nep',
        'norwegian'  => 'nor',
        'pashto'     => 'pus',
        'pidgin'     => 'crp',
        'polish'     => 'pol',
        'portuguese' => 'por',
        'romanian'   => 'ron',
        'russian'    => 'rus',
        'serbian'    => 'srp',
        'slovak'     => 'slk',
        'slovene'    => 'slv',
        'somali'     => 'som',
        'spanish'    => 'spa',
        'swahili'    => 'swa',
        'swedish'    => 'swe',
        'tagalog'    => 'tgl',
        'turkish'    => 'tur',
        'ukrainian'  => 'ukr',
        'urdu'       => 'urd',
        'uzbek'      => 'uzb',
        'vietnamese' => 'vie',
        'welsh'      => 'cym',
    );

    /**
     * Maps ISO 639-1 2-letter language codes to the language names
     * in the language database
     *
     * Not all languages have a 2 letter code, so some are missing
     *
     * @var array
     */
    public static $code2ToName = array(
        'ar' => 'arabic',
        'az' => 'azeri',
        'bg' => 'bulgarian',
        'bn' => 'bengali',
        'cs' => 'czech',
        'cy' => 'welsh',
        'da' => 'danish',
        'de' => 'german',
        'en' => 'english',
        'es' => 'spanish',
        'et' => 'estonian',
        'fa' => 'farsi',
        'fi' => 'finnish',
        'fr' => 'french',
        'ha' => 'hausa',
        'hi' => 'hindi',
        'hr' => 'croatian',
        'hu' => 'hungarian',
        'id' => 'indonesian',
        'is' => 'icelandic',
        'it' => 'italian',
        'kk' => 'kazakh',
        'ky' => 'kyrgyz',
        'la' => 'latin',
        'lt' => 'lithuanian',
        'lv' => 'latvian',
        'mk' => 'macedonian',
        'mn' => 'mongolian',
        'ne' => 'nepali',
        'nl' => 'dutch',
        'no' => 'norwegian',
        'pl' => 'polish',
        'ps' => 'pashto',
        'pt' => 'portuguese',
        'ro' => 'romanian',
        'ru' => 'russian',
        'sk' => 'slovak',
        'sl' => 'slovene',
        'so' => 'somali',
        'sq' => 'albanian',
        'sr' => 'serbian',
        'sv' => 'swedish',
        'sw' => 'swahili',
        'tl' => 'tagalog',
        'tr' => 'turkish',
        'uk' => 'ukrainian',
        'ur' => 'urdu',
        'uz' => 'uzbek',
        'vi' => 'vietnamese',
    );

    /**
     * Maps ISO 639-2 3-letter language codes to the language names
     * in the language database.
     *
     * Every code listed here is the value of the same language in
     * $nameToCode3, so a name -> code -> name round trip is lossless.
     *
     * @var array
     */
    public static $code3ToName = array(
        'ara' => 'arabic',
        'aze' => 'azeri',
        'ben' => 'bengali',
        'bul' => 'bulgarian',
        'ceb' => 'cebuano',
        'ces' => 'czech',
        'crp' => 'pidgin',
        'cym' => 'welsh',
        'dan' => 'danish',
        'deu' => 'german',
        'eng' => 'english',
        'est' => 'estonian',
        'fas' => 'farsi',
        'fin' => 'finnish',
        'fra' => 'french',
        'hau' => 'hausa',
        'haw' => 'hawaiian',
        'hin' => 'hindi',
        'hrv' => 'croatian',
        'hun' => 'hungarian',
        'ind' => 'indonesian',
        'isl' => 'icelandic',
        'ita' => 'italian',
        'kaz' => 'kazakh',
        'kir' => 'kyrgyz',
        'lat' => 'latin',
        'lav' => 'latvian',
        'lit' => 'lithuanian',
        'mkd' => 'macedonian',
        'mon' => 'mongolian',
        'nep' => 'nepali',
        'nld' => 'dutch',
        'nor' => 'norwegian',
        'pol' => 'polish',
        'por' => 'portuguese',
        'pus' => 'pashto',
        // "ron" is the code $nameToCode3 uses for Romanian; the former
        // key "rom" is the ISO 639-2 code for Romany and broke the
        // round trip with nameToCode3().
        'ron' => 'romanian',
        'rus' => 'russian',
        'slk' => 'slovak',
        'slv' => 'slovene',
        'som' => 'somali',
        'spa' => 'spanish',
        'sqi' => 'albanian',
        'srp' => 'serbian',
        'swa' => 'swahili',
        'swe' => 'swedish',
        'tgl' => 'tagalog',
        'tur' => 'turkish',
        'ukr' => 'ukrainian',
        'urd' => 'urdu',
        'uzb' => 'uzbek',
        'vie' => 'vietnamese',
    );

    /**
     * Returns the 2-letter ISO 639-1 code for the given language name.
     *
     * The name is lower-cased before the lookup, so it is matched
     * case-insensitively.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string|null Two-letter language code (e.g. "sv"), or NULL if
     *                     the name is unknown or has no 2-letter code
     */
    public static function nameToCode2($lang)
    {
        $lang = strtolower($lang);

        // isset() is false for the NULL entries as well, which yields the
        // documented NULL for languages without a 2-letter code.
        return isset(self::$nameToCode2[$lang])
            ? self::$nameToCode2[$lang]
            : null;
    }

    /**
     * Returns the 3-letter ISO 639-2 code for the given language name.
     *
     * The name is lower-cased before the lookup, so it is matched
     * case-insensitively.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string|null Three-letter language code (e.g. "swe") or NULL
     *                     if not found
     */
    public static function nameToCode3($lang)
    {
        $lang = strtolower($lang);

        return isset(self::$nameToCode3[$lang])
            ? self::$nameToCode3[$lang]
            : null;
    }

    /**
     * Returns the language name for the given 2-letter ISO 639-1 code.
     *
     * The code is lower-cased before the lookup, so "SV" and "sv" give
     * the same result.
     *
     * @param string $code Two-letter language code (e.g. "sv")
     *
     * @return string|null English language name like "swedish" or NULL
     *                     if the code is not in the table
     */
    public static function code2ToName($code)
    {
        // Look up the normalised value; the lower-cased string used to be
        // stored in an unused variable while the raw $code was looked up.
        $code = strtolower($code);

        return isset(self::$code2ToName[$code])
            ? self::$code2ToName[$code]
            : null;
    }

    /**
     * Returns the language name for the given 3-letter ISO 639-2 code.
     *
     * The code is lower-cased before the lookup, so "SWE" and "swe" give
     * the same result.
     *
     * @param string $code Three-letter language code (e.g. "swe")
     *
     * @return string|null English language name like "swedish" or NULL
     *                     if the code is not in the table
     */
    public static function code3ToName($code)
    {
        // Same normalisation as code2ToName().
        $code = strtolower($code);

        return isset(self::$code3ToName[$code])
            ? self::$code3ToName[$code]
            : null;
    }
}
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
new file mode 100644
index 00000000..fb0e1e20
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
@@ -0,0 +1,347 @@
1<?php
2
3/**
4 * This class represents a text sample to be parsed.
5 *
6 * @category Text
7 * @package Text_LanguageDetect
8 * @author Nicholas Pisarro
9 * @copyright 2006
10 * @license BSD
11 * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
12 * @link http://pear.php.net/package/Text_LanguageDetect/
13 * @link http://langdetect.blogspot.com/
14 */
15
16/**
17 * This class represents a text sample to be parsed.
18 *
19 * This separates the analysis of a text sample from the primary LanguageDetect
20 * class. After a new profile has been built, the data can be retrieved using
21 * the accessor functions.
22 *
23 * This class is intended to be used by the Text_LanguageDetect class, not
24 * end-users.
25 *
26 * @category Text
27 * @package Text_LanguageDetect
28 * @author Nicholas Pisarro
29 * @copyright 2006
30 * @license BSD
31 * @version release: 0.3.0
32 */
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * the piece of text being parsed
     *
     * @access private
     * @var string
     */
    public $_string;

    /**
     * stores the trigram frequencies of the sample (trigram => count)
     *
     * This is the property analyze() fills and getTrigramFreqs() returns.
     * It is declared here so that it is an empty array, not an undefined
     * property, before analyze() has run.
     *
     * @access private
     * @var array
     */
    public $_trigram = array();

    /**
     * Not read or written by any method of this class; the trigram counts
     * live in $_trigram. Kept only so that outside code that may reference
     * the old declaration keeps working.
     *
     * @access private
     * @var array
     */
    public $_trigrams = array();

    /**
     * stores the trigram ranks of the sample
     *
     * @access private
     * @var array
     */
    public $_trigram_ranks = array();

    /**
     * stores the unicode blocks of the sample (block name => char count)
     *
     * @access private
     * @var array
     */
    public $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
     * @access private
     * @var bool
     */
    public $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
     * @access private
     * @var bool
     */
    public $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
     * @access private
     * @var bool
     */
    public $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
     * @access private
     * @var bool
     */
    public $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
     * Only stores the sample; nothing is parsed until analyze() is called.
     * The parent constructor is deliberately not invoked, matching the
     * behaviour of the former PHP 4-style constructor.
     *
     * @access private
     * @param string $string string to be parsed
     */
    public function __construct($string)
    {
        $this->_string = $string;
    }

    /**
     * Legacy PHP 4-style constructor name.
     *
     * A method named after its class is no longer treated as a constructor
     * by PHP 8, so the real work lives in __construct(). This method is
     * kept so explicit calls by the old name still work. It is declared
     * after __construct() on purpose.
     *
     * @access private
     * @param string $string string to be parsed
     */
    public function Text_LanguageDetect_Parser($string)
    {
        self::__construct($string);
    }

    /**
     * Returns true if a string is suitable for parsing
     *
     * A string qualifies when it is longer than 3 bytes and contains at
     * least one non-whitespace character.
     *
     * @param string $str input string to test
     * @return bool true if acceptable, false if not
     */
    public static function validateString($str)
    {
        if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
            return true;
        } else {
            return false;
        }
    }

    /**
     * turn on/off trigram counting
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * turn on/off unicode block counting
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * turn on/off padding the beginning of the sample string
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample
     *
     * @access public
     * @return array trigram ranks in the text sample (by reference)
     */
    public function &getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Return the trigram frequency table
     *
     * only used in testing to make sure the parser is working
     *
     * @access public
     * @return array trigram frequencies in the text sample (by reference)
     */
    public function &getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * returns the array of unicode blocks
     *
     * @access public
     * @return array unicode blocks in the text sample (by reference)
     */
    public function &getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information.
     *
     * The helpers _next_char(), _read_unicode_block_db(),
     * _unicode_block_name(), _utf8char2unicode() and _arr_rank() are not
     * defined in this class; presumably they are inherited from
     * Text_LanguageDetect (verify against the parent class).
     *
     * @access public
     */
    public function analyze()
    {
        $len = strlen($this->_string);
        $byte_counter = 0;


        // unicode startup
        if ($this->_compile_unicode) {
            $blocks = $this->_read_unicode_block_db();
            $block_count = count($blocks);

            $skipped_count = 0;
            $unicode_chars = array();
        }

        // trigram startup
        if ($this->_compile_trigram) {
            // initialize them as blank so the parser will skip the first two
            // (since it skips trigrams with more than 2 contiguous spaces)
            $a = ' ';
            $b = ' ';

            // kludge
            // if it finds a valid trigram to start and the start pad option is
            // off, then set a variable that will be used to reduce this
            // trigram after parsing has finished
            if (!$this->_trigram_pad_start) {
                $a = $this->_next_char($this->_string, $byte_counter, true);

                if ($a != ' ') {
                    $b = $this->_next_char($this->_string, $byte_counter, true);
                    $dropone = " $a$b";
                }

                // rewind: the look-ahead above must not consume input
                $byte_counter = 0;
                $a = ' ';
                $b = ' ';
            }
        }

        while ($byte_counter < $len) {
            // $byte_counter is advanced by _next_char(); the meaning of the
            // third argument is defined by that helper (not visible here)
            $char = $this->_next_char($this->_string, $byte_counter, true);


            // language trigram detection
            if ($this->_compile_trigram) {
                // skip windows that contain two adjacent blanks
                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                    if (!isset($this->_trigram[$a . $b . $char])) {
                        $this->_trigram[$a . $b . $char] = 1;
                    } else {
                        $this->_trigram[$a . $b . $char]++;
                    }
                }

                // slide the window by one character
                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                if ($this->_unicode_skip_symbols
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
                    && $char != "'"
                ) { // does not skip the apostrophe
                    // since it's included in the language
                    // models

                    $skipped_count++;
                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup
        if ($this->_compile_unicode) {
            foreach ($unicode_chars as $utf8_char => $count) {
                $search_result = $this->_unicode_block_name(
                    $this->_utf8char2unicode($utf8_char), $blocks, $block_count
                );

                // -1 is treated as "no block found"; otherwise index 2 of
                // the result is used as the block name
                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }


        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b != ' ') {
                if (!isset($this->_trigram["$a$b "])) {
                    $this->_trigram["$a$b "] = 1;
                } else {
                    $this->_trigram["$a$b "]++;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge
            // The isset() on the table entry guards against samples where
            // the look-ahead trigram was never counted in the main loop;
            // without it an undefined index would be read and a NULL entry
            // written into the table.
            if (isset($dropone) && isset($this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}
346
347/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file