diff options
author | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-05-29 18:54:06 +0200 |
---|---|---|
committer | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-05-29 18:54:06 +0200 |
commit | a9f5e572dde4f986a498d2fbe92a38a1b22f9595 (patch) | |
tree | 80b5bfc9836ae92cc4929a4d72ae0b2730e568bc /inc/3rdparty/libraries/language-detect/LanguageDetect | |
parent | 96834a47b09985e1c82b82857fc108f20e8b8f2b (diff) | |
parent | 8038b38802769031e050c753fc0a388a2276629e (diff) | |
download | wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.gz wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.zst wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.zip |
Merge pull request #712 from wallabag/dev1.7.0
1.7, call me "Premium version"
Diffstat (limited to 'inc/3rdparty/libraries/language-detect/LanguageDetect')
3 files changed, 743 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php new file mode 100644 index 00000000..196d994f --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php | |||
@@ -0,0 +1,57 @@ | |||
1 | <?php | ||
/**
 * Exception type carrying the numeric error codes used by Text_LanguageDetect.
 *
 * The class adds no behaviour to the built-in Exception; it only defines
 * the code constants below, grouped by decade:
 *
 *   1x - language database / environment problems
 *   2x - invalid parameters
 *   3x - unknown language
 *   4x - unicode block detection
 *   5x - language clustering
 */
class Text_LanguageDetect_Exception extends Exception
{
    // ----- 1x: language database / environment ---------------------------

    /** The database file could not be found. */
    const DB_NOT_FOUND = 10;

    /** The database file was found, but is not readable. */
    const DB_NOT_READABLE = 11;

    /** The database file is empty. */
    const DB_EMPTY = 12;

    /** The database contents are not a PHP array. */
    const DB_NOT_ARRAY = 13;

    /** Magic quotes are activated. */
    const MAGIC_QUOTES = 14;

    // ----- 2x: invalid parameters ----------------------------------------

    /** A parameter of an invalid type was passed to a method. */
    const PARAM_TYPE = 20;

    /** A character in a parameter is invalid. */
    const INVALID_CHAR = 21;

    // ----- 3x: unknown language ------------------------------------------

    /** The language is not in the database. */
    const UNKNOWN_LANGUAGE = 30;

    // ----- 4x: unicode block detection -----------------------------------

    /** An error occurred during block detection. */
    const BLOCK_DETECTION = 40;

    // ----- 5x: language clustering ---------------------------------------

    /** An error occurred while clustering languages. */
    const NO_HIGHEST_KEY = 50;
}
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php new file mode 100644 index 00000000..05b0590d --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php | |||
@@ -0,0 +1,339 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Part of Text_LanguageDetect | ||
4 | * | ||
5 | * PHP version 5 | ||
6 | * | ||
7 | * @category Text | ||
8 | * @package Text_LanguageDetect | ||
9 | * @author Christian Weiske <cweiske@php.net> | ||
10 | * @copyright 2011 Christian Weiske <cweiske@php.net> | ||
11 | * @license http://www.debian.org/misc/bsd.license BSD | ||
12 | * @version SVN: $Id$ | ||
13 | * @link http://pear.php.net/package/Text_LanguageDetect/ | ||
14 | */ | ||
15 | |||
16 | /** | ||
17 | * Provides a mapping between the languages from lang.dat and the | ||
18 | * ISO 639-1 and ISO-639-2 codes. | ||
19 | * | ||
20 | * Note that this class contains only languages that exist in lang.dat. | ||
21 | * | ||
22 | * @category Text | ||
23 | * @package Text_LanguageDetect | ||
24 | * @author Christian Weiske <cweiske@php.net> | ||
25 | * @copyright 2011 Christian Weiske <cweiske@php.net> | ||
26 | * @license http://www.debian.org/misc/bsd.license BSD | ||
27 | * @link http://www.loc.gov/standards/iso639-2/php/code_list.php | ||
28 | */ | ||
/**
 * Static lookup tables and helpers that translate between the lowercase
 * English language names used as keys in this class and their
 * ISO 639-1 (2-letter) / ISO 639-2 (3-letter) codes.
 *
 * All four lookup helpers are case-insensitive: the argument is lowercased
 * before the table lookup, and NULL is returned when no entry exists.
 */
class Text_LanguageDetect_ISO639
{
    /**
     * Maps all language names from the language database to the
     * ISO 639-1 2-letter language code.
     *
     * NULL indicates that there is no 2-letter code.
     *
     * @var array
     */
    public static $nameToCode2 = array(
        'albanian'   => 'sq',
        'arabic'     => 'ar',
        'azeri'      => 'az',
        'bengali'    => 'bn',
        'bulgarian'  => 'bg',
        'cebuano'    => null,
        'croatian'   => 'hr',
        'czech'      => 'cs',
        'danish'     => 'da',
        'dutch'      => 'nl',
        'english'    => 'en',
        'estonian'   => 'et',
        'farsi'      => 'fa',
        'finnish'    => 'fi',
        'french'     => 'fr',
        'german'     => 'de',
        'hausa'      => 'ha',
        'hawaiian'   => null,
        'hindi'      => 'hi',
        'hungarian'  => 'hu',
        'icelandic'  => 'is',
        'indonesian' => 'id',
        'italian'    => 'it',
        'kazakh'     => 'kk',
        'kyrgyz'     => 'ky',
        'latin'      => 'la',
        'latvian'    => 'lv',
        'lithuanian' => 'lt',
        'macedonian' => 'mk',
        'mongolian'  => 'mn',
        'nepali'     => 'ne',
        'norwegian'  => 'no',
        'pashto'     => 'ps',
        'pidgin'     => null,
        'polish'     => 'pl',
        'portuguese' => 'pt',
        'romanian'   => 'ro',
        'russian'    => 'ru',
        'serbian'    => 'sr',
        'slovak'     => 'sk',
        'slovene'    => 'sl',
        'somali'     => 'so',
        'spanish'    => 'es',
        'swahili'    => 'sw',
        'swedish'    => 'sv',
        'tagalog'    => 'tl',
        'turkish'    => 'tr',
        'ukrainian'  => 'uk',
        'urdu'       => 'ur',
        'uzbek'      => 'uz',
        'vietnamese' => 'vi',
        'welsh'      => 'cy',
    );

    /**
     * Maps all language names from the language database to the
     * ISO 639-2 3-letter language code.
     *
     * @var array
     */
    public static $nameToCode3 = array(
        'albanian'   => 'sqi',
        'arabic'     => 'ara',
        'azeri'      => 'aze',
        'bengali'    => 'ben',
        'bulgarian'  => 'bul',
        'cebuano'    => 'ceb',
        'croatian'   => 'hrv',
        'czech'      => 'ces',
        'danish'     => 'dan',
        'dutch'      => 'nld',
        'english'    => 'eng',
        'estonian'   => 'est',
        'farsi'      => 'fas',
        'finnish'    => 'fin',
        'french'     => 'fra',
        'german'     => 'deu',
        'hausa'      => 'hau',
        'hawaiian'   => 'haw',
        'hindi'      => 'hin',
        'hungarian'  => 'hun',
        'icelandic'  => 'isl',
        'indonesian' => 'ind',
        'italian'    => 'ita',
        'kazakh'     => 'kaz',
        'kyrgyz'     => 'kir',
        'latin'      => 'lat',
        'latvian'    => 'lav',
        'lithuanian' => 'lit',
        'macedonian' => 'mkd',
        'mongolian'  => 'mon',
        'nepali'     => 'nep',
        'norwegian'  => 'nor',
        'pashto'     => 'pus',
        'pidgin'     => 'crp',
        'polish'     => 'pol',
        'portuguese' => 'por',
        'romanian'   => 'ron',
        'russian'    => 'rus',
        'serbian'    => 'srp',
        'slovak'     => 'slk',
        'slovene'    => 'slv',
        'somali'     => 'som',
        'spanish'    => 'spa',
        'swahili'    => 'swa',
        'swedish'    => 'swe',
        'tagalog'    => 'tgl',
        'turkish'    => 'tur',
        'ukrainian'  => 'ukr',
        'urdu'       => 'urd',
        'uzbek'      => 'uzb',
        'vietnamese' => 'vie',
        'welsh'      => 'cym',
    );

    /**
     * Maps ISO 639-1 2-letter language codes to the language names
     * in the language database
     *
     * Not all languages have a 2 letter code, so some are missing
     *
     * @var array
     */
    public static $code2ToName = array(
        'ar' => 'arabic',
        'az' => 'azeri',
        'bg' => 'bulgarian',
        'bn' => 'bengali',
        'cs' => 'czech',
        'cy' => 'welsh',
        'da' => 'danish',
        'de' => 'german',
        'en' => 'english',
        'es' => 'spanish',
        'et' => 'estonian',
        'fa' => 'farsi',
        'fi' => 'finnish',
        'fr' => 'french',
        'ha' => 'hausa',
        'hi' => 'hindi',
        'hr' => 'croatian',
        'hu' => 'hungarian',
        'id' => 'indonesian',
        'is' => 'icelandic',
        'it' => 'italian',
        'kk' => 'kazakh',
        'ky' => 'kyrgyz',
        'la' => 'latin',
        'lt' => 'lithuanian',
        'lv' => 'latvian',
        'mk' => 'macedonian',
        'mn' => 'mongolian',
        'ne' => 'nepali',
        'nl' => 'dutch',
        'no' => 'norwegian',
        'pl' => 'polish',
        'ps' => 'pashto',
        'pt' => 'portuguese',
        'ro' => 'romanian',
        'ru' => 'russian',
        'sk' => 'slovak',
        'sl' => 'slovene',
        'so' => 'somali',
        'sq' => 'albanian',
        'sr' => 'serbian',
        'sv' => 'swedish',
        'sw' => 'swahili',
        'tl' => 'tagalog',
        'tr' => 'turkish',
        'uk' => 'ukrainian',
        'ur' => 'urdu',
        'uz' => 'uzbek',
        'vi' => 'vietnamese',
    );

    /**
     * Maps ISO 639-2 3-letter language codes to the language names
     * in the language database.
     *
     * Keys are the same codes that $nameToCode3 produces, so a
     * name -> code -> name round trip returns the original name.
     *
     * @var array
     */
    public static $code3ToName = array(
        'ara' => 'arabic',
        'aze' => 'azeri',
        'ben' => 'bengali',
        'bul' => 'bulgarian',
        'ceb' => 'cebuano',
        'ces' => 'czech',
        'crp' => 'pidgin',
        'cym' => 'welsh',
        'dan' => 'danish',
        'deu' => 'german',
        'eng' => 'english',
        'est' => 'estonian',
        'fas' => 'farsi',
        'fin' => 'finnish',
        'fra' => 'french',
        'hau' => 'hausa',
        'haw' => 'hawaiian',
        'hin' => 'hindi',
        'hrv' => 'croatian',
        'hun' => 'hungarian',
        'ind' => 'indonesian',
        'isl' => 'icelandic',
        'ita' => 'italian',
        'kaz' => 'kazakh',
        'kir' => 'kyrgyz',
        'lat' => 'latin',
        'lav' => 'latvian',
        'lit' => 'lithuanian',
        'mkd' => 'macedonian',
        'mon' => 'mongolian',
        'nep' => 'nepali',
        'nld' => 'dutch',
        'nor' => 'norwegian',
        'pol' => 'polish',
        'por' => 'portuguese',
        'pus' => 'pashto',
        // "ron" is the ISO 639-2 code for Romanian and the value used in
        // $nameToCode3; the former key "rom" is the code for Romany.
        'ron' => 'romanian',
        'rus' => 'russian',
        'slk' => 'slovak',
        'slv' => 'slovene',
        'som' => 'somali',
        'spa' => 'spanish',
        'sqi' => 'albanian',
        'srp' => 'serbian',
        'swa' => 'swahili',
        'swe' => 'swedish',
        'tgl' => 'tagalog',
        'tur' => 'turkish',
        'ukr' => 'ukrainian',
        'urd' => 'urdu',
        'uzb' => 'uzbek',
        'vie' => 'vietnamese',
    );

    /**
     * Returns the 2-letter ISO 639-1 code for the given language name.
     *
     * @param string $lang English language name like "swedish"
     *                     (matched case-insensitively)
     *
     * @return string Two-letter language code (e.g. "sv") or NULL if the
     *                name is unknown or the language has no 2-letter code
     */
    public static function nameToCode2($lang)
    {
        $lang = strtolower($lang);
        // isset() is also false for entries whose value is NULL, so
        // languages without a 2-letter code fall through to NULL here.
        if (!isset(self::$nameToCode2[$lang])) {
            return null;
        }
        return self::$nameToCode2[$lang];
    }

    /**
     * Returns the 3-letter ISO 639-2 code for the given language name.
     *
     * @param string $lang English language name like "swedish"
     *                     (matched case-insensitively)
     *
     * @return string Three-letter language code (e.g. "swe") or NULL if not found
     */
    public static function nameToCode3($lang)
    {
        $lang = strtolower($lang);
        if (!isset(self::$nameToCode3[$lang])) {
            return null;
        }
        return self::$nameToCode3[$lang];
    }

    /**
     * Returns the language name for the given 2-letter ISO 639-1 code.
     *
     * @param string $code Two-letter language code (e.g. "sv"),
     *                     matched case-insensitively
     *
     * @return string English language name like "swedish" or NULL if not found
     */
    public static function code2ToName($code)
    {
        // Look up with the lowercased code; table keys are all lowercase.
        $code = strtolower($code);
        if (!isset(self::$code2ToName[$code])) {
            return null;
        }
        return self::$code2ToName[$code];
    }

    /**
     * Returns the language name for the given 3-letter ISO 639-2 code.
     *
     * @param string $code Three-letter language code (e.g. "swe"),
     *                     matched case-insensitively
     *
     * @return string English language name like "swedish" or NULL if not found
     */
    public static function code3ToName($code)
    {
        // Look up with the lowercased code; table keys are all lowercase.
        $code = strtolower($code);
        if (!isset(self::$code3ToName[$code])) {
            return null;
        }
        return self::$code3ToName[$code];
    }
}
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php new file mode 100644 index 00000000..fb0e1e20 --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php | |||
@@ -0,0 +1,347 @@ | |||
1 | <?php | ||
2 | |||
3 | /** | ||
4 | * This class represents a text sample to be parsed. | ||
5 | * | ||
6 | * @category Text | ||
7 | * @package Text_LanguageDetect | ||
8 | * @author Nicholas Pisarro | ||
9 | * @copyright 2006 | ||
10 | * @license BSD | ||
11 | * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $ | ||
12 | * @link http://pear.php.net/package/Text_LanguageDetect/ | ||
13 | * @link http://langdetect.blogspot.com/ | ||
14 | */ | ||
15 | |||
16 | /** | ||
17 | * This class represents a text sample to be parsed. | ||
18 | * | ||
19 | * This separates the analysis of a text sample from the primary LanguageDetect | ||
20 | * class. After a new profile has been built, the data can be retrieved using | ||
21 | * the accessor functions. | ||
22 | * | ||
23 | * This class is intended to be used by the Text_LanguageDetect class, not | ||
24 | * end-users. | ||
25 | * | ||
26 | * @category Text | ||
27 | * @package Text_LanguageDetect | ||
28 | * @author Nicholas Pisarro | ||
29 | * @copyright 2006 | ||
30 | * @license BSD | ||
31 | * @version release: 0.3.0 | ||
32 | */ | ||
/**
 * Represents one text sample and compiles statistics about it.
 *
 * Depending on the prepare*() switches, analyze() counts the character
 * trigrams of the sample and/or the unicode blocks its characters fall
 * into. The results are read back through the get*() accessors.
 *
 * The low-level helpers used by analyze() (_next_char(),
 * _read_unicode_block_db(), _unicode_block_name(), _utf8char2unicode(),
 * _arr_rank()) are not defined in this class; they are expected to be
 * inherited from Text_LanguageDetect.
 */
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * the piece of text being parsed
     *
     * Intended for internal use only.
     *
     * @var string
     */
    public $_string;

    /**
     * stores the trigram frequencies of the sample
     * (trigram string => number of occurrences)
     *
     * Intended for internal use only.
     *
     * @var array
     */
    public $_trigram = array();

    /**
     * stores the trigram ranks of the sample
     *
     * Intended for internal use only.
     *
     * @var array
     */
    public $_trigram_ranks = array();

    /**
     * stores the unicode blocks of the sample
     * (block name => number of characters counted in that block)
     *
     * Intended for internal use only.
     *
     * @var array
     */
    public $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
     * Intended for internal use only.
     *
     * @var bool
     */
    public $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
     * Intended for internal use only.
     *
     * @var bool
     */
    public $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
     * Intended for internal use only.
     *
     * @var bool
     */
    public $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
     * Intended for internal use only.
     *
     * @var bool
     */
    public $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
     * Only stores the sample; no parsing happens until analyze() is called.
     * The parent constructor is deliberately not invoked, matching the
     * behaviour of the former PHP 4-style constructor.
     *
     * @param string $string string to be parsed
     */
    public function __construct($string)
    {
        $this->_string = $string;
    }

    /**
     * Returns true if a string is suitable for parsing
     *
     * A string is accepted when it is longer than 3 bytes and contains at
     * least one non-whitespace character.
     *
     * @param string $str input string to test
     *
     * @return bool true if acceptable, false if not
     */
    public static function validateString($str)
    {
        return !empty($str) && strlen($str) > 3 && preg_match('/\S/', $str);
    }

    /**
     * turn on/off trigram counting
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * turn on/off unicode block counting
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * turn on/off padding the beginning of the sample string
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample
     *
     * @return array trigram ranks in the text sample (returned by reference)
     */
    public function &getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Return the trigram frequency table
     *
     * only used in testing to make sure the parser is working
     *
     * @return array trigram frequencies in the text sample
     *               (returned by reference)
     */
    public function &getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * returns the array of unicode blocks
     *
     * @return array unicode blocks in the text sample (returned by reference)
     */
    public function &getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information.
     *
     * Counts are added to whatever is already stored in the trigram and
     * unicode block tables; the tables are not reset at the start.
     *
     * @return void
     */
    public function analyze()
    {
        $len = strlen($this->_string);
        $byte_counter = 0;

        // unicode startup
        if ($this->_compile_unicode) {
            $blocks = $this->_read_unicode_block_db();
            $block_count = count($blocks);

            $unicode_chars = array();
        }

        // trigram startup
        if ($this->_compile_trigram) {
            // initialize them as blank so the parser will skip the first two
            // (since it skips trigrams with more than 2 contiguous spaces)
            $a = ' ';
            $b = ' ';

            // kludge
            // if it finds a valid trigram to start and the start pad option is
            // off, then set a variable that will be used to reduce this
            // trigram after parsing has finished
            if (!$this->_trigram_pad_start) {
                $a = $this->_next_char($this->_string, $byte_counter, true);

                if ($a != ' ') {
                    $b = $this->_next_char($this->_string, $byte_counter, true);
                    $dropone = " $a$b";
                }

                // rewind: the peek above must not consume any input
                $byte_counter = 0;
                $a = ' ';
                $b = ' ';
            }
        }

        while ($byte_counter < $len) {
            // NOTE(review): _next_char() is assumed to advance $byte_counter
            // (passed by reference) past the character it returns - confirm
            // against Text_LanguageDetect.
            $char = $this->_next_char($this->_string, $byte_counter, true);

            // language trigram detection
            if ($this->_compile_trigram) {
                // skip windows that contain two adjacent spaces
                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                    $trigram = $a . $b . $char;
                    if (!isset($this->_trigram[$trigram])) {
                        $this->_trigram[$trigram] = 1;
                    } else {
                        $this->_trigram[$trigram]++;
                    }
                }

                // slide the window forward by one character
                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                if ($this->_unicode_skip_symbols
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
                    && $char != "'"
                ) {
                    // single-byte, non-alphabetical character; the
                    // apostrophe is not skipped since it's included in
                    // the language models
                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup
        if ($this->_compile_unicode) {
            foreach ($unicode_chars as $utf8_char => $count) {
                // PHP turns decimal-digit string keys such as "1" into
                // integer keys, so cast back to a string before handing
                // the character to the code point helper.
                $search_result = $this->_unicode_block_name(
                    $this->_utf8char2unicode((string) $utf8_char),
                    $blocks,
                    $block_count
                );

                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }

        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b != ' ') {
                if (!isset($this->_trigram["$a$b "])) {
                    $this->_trigram["$a$b "] = 1;
                } else {
                    $this->_trigram["$a$b "]++;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge
            // Only touch the entry when it was actually counted; otherwise
            // the decrement would insert a bogus NULL entry into the table.
            if (isset($dropone) && isset($this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}
346 | |||
347 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file | ||