diff options
author | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-05-29 18:54:06 +0200 |
---|---|---|
committer | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-05-29 18:54:06 +0200 |
commit | a9f5e572dde4f986a498d2fbe92a38a1b22f9595 (patch) | |
tree | 80b5bfc9836ae92cc4929a4d72ae0b2730e568bc /inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php | |
parent | 96834a47b09985e1c82b82857fc108f20e8b8f2b (diff) | |
parent | 8038b38802769031e050c753fc0a388a2276629e (diff) | |
download | wallabag-1.7.0.tar.gz wallabag-1.7.0.tar.zst wallabag-1.7.0.zip |
Merge pull request #712 from wallabag/dev1.7.0
1.7, call me "Premium version"
Diffstat (limited to 'inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php')
-rw-r--r-- | inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php | 347 |
1 files changed, 347 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php new file mode 100644 index 00000000..fb0e1e20 --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php | |||
@@ -0,0 +1,347 @@ | |||
1 | <?php | ||
2 | |||
3 | /** | ||
4 | * This class represents a text sample to be parsed. | ||
5 | * | ||
6 | * @category Text | ||
7 | * @package Text_LanguageDetect | ||
8 | * @author Nicholas Pisarro | ||
9 | * @copyright 2006 | ||
10 | * @license BSD | ||
11 | * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $ | ||
12 | * @link http://pear.php.net/package/Text_LanguageDetect/ | ||
13 | * @link http://langdetect.blogspot.com/ | ||
14 | */ | ||
15 | |||
16 | /** | ||
17 | * This class represents a text sample to be parsed. | ||
18 | * | ||
19 | * This separates the analysis of a text sample from the primary LanguageDetect | ||
20 | * class. After a new profile has been built, the data can be retrieved using | ||
21 | * the accessor functions. | ||
22 | * | ||
23 | * This class is intended to be used by the Text_LanguageDetect class, not | ||
24 | * end-users. | ||
25 | * | ||
26 | * @category Text | ||
27 | * @package Text_LanguageDetect | ||
28 | * @author Nicholas Pisarro | ||
29 | * @copyright 2006 | ||
30 | * @license BSD | ||
31 | * @version release: 0.3.0 | ||
32 | */ | ||
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * The piece of text being parsed
     *
     * @access private
     * @var    string
     */
    public $_string;

    /**
     * Legacy, never-populated property.
     *
     * The original declaration used this name while all code read and wrote
     * $_trigram. It is kept (always an empty array) only so that outside code
     * touching it keeps working.
     *
     * @access     private
     * @var        array
     * @deprecated use $_trigram / getTrigramFreqs()
     */
    public $_trigrams = array();

    /**
     * Stores the trigram frequencies of the sample (trigram => count)
     *
     * Declared explicitly so it is always an array and is not created as a
     * dynamic property on first use.
     *
     * @access private
     * @var    array
     */
    public $_trigram = array();

    /**
     * Stores the trigram ranks of the sample
     *
     * @access private
     * @var    array
     */
    public $_trigram_ranks = array();

    /**
     * Stores the unicode blocks of the sample (block name => char count)
     *
     * @access private
     * @var    array
     */
    public $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
     * @access private
     * @var    bool
     */
    public $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
     * @access private
     * @var    bool
     */
    public $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
     * @access private
     * @var    bool
     */
    public $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
     * @access private
     * @var    bool
     */
    public $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
     * Only stores the sample. The parent constructor is intentionally not
     * invoked, exactly as the original PHP 4-style constructor behaved.
     *
     * @param string $string string to be parsed
     */
    public function __construct($string)
    {
        $this->_string = $string;
    }

    /**
     * Legacy PHP 4-style constructor
     *
     * PHP 8 no longer treats a method named after the class as a constructor,
     * so __construct() is the real entry point. This method remains so that
     * any code calling it by name keeps working.
     *
     * @param string $string string to be parsed
     *
     * @deprecated use __construct()
     */
    public function Text_LanguageDetect_Parser($string)
    {
        $this->_string = $string;
    }

    /**
     * Returns true if a string is suitable for parsing
     *
     * A string qualifies when it is non-empty, longer than 3 bytes and
     * contains at least one non-whitespace character.
     *
     * @param string $str input string to test
     *
     * @return bool true if acceptable, false if not
     */
    public static function validateString($str)
    {
        if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
            return true;
        }

        return false;
    }

    /**
     * Turn on/off trigram counting
     *
     * @param bool $bool true for on, false for off
     *
     * @access public
     * @return void
     */
    public function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * Turn on/off unicode block counting
     *
     * @param bool $bool true for on, false for off
     *
     * @access public
     * @return void
     */
    public function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * Turn on/off padding the beginning of the sample string
     *
     * @param bool $bool true for on, false for off
     *
     * @access public
     * @return void
     */
    public function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @param bool $bool true for on, false for off
     *
     * @access public
     * @return void
     */
    public function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample
     *
     * @access public
     * @return array trigram ranks in the text sample (by reference)
     */
    public function &getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Returns the trigram frequency table
     *
     * Only used in testing to make sure the parser is working.
     *
     * @access public
     * @return array trigram frequencies in the text sample (by reference)
     */
    public function &getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * Returns the array of unicode blocks
     *
     * @access public
     * @return array unicode blocks in the text sample (by reference)
     */
    public function &getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute.
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information. Results are added to whatever the instance already holds;
     * nothing is reset at the start of a run.
     *
     * @access public
     * @return void
     */
    public function analyze()
    {
        $len          = strlen($this->_string);
        $byte_counter = 0;

        // unicode startup
        if ($this->_compile_unicode) {
            $blocks        = $this->_read_unicode_block_db();
            $block_count   = count($blocks);
            $unicode_chars = array();
        }

        // trigram startup
        if ($this->_compile_trigram) {
            // start blank so the parser skips the first two positions
            // (trigrams with two or more contiguous spaces are not counted)
            $a = ' ';
            $b = ' ';

            // kludge: when start padding is off, remember the leading
            // " xy" trigram so one occurrence can be removed after parsing.
            // The length checks keep the look-ahead from reading past the
            // end of an empty or single-character sample.
            if (!$this->_trigram_pad_start && $len > 0) {
                $first = $this->_next_char($this->_string, $byte_counter, true);

                if ($first !== ' ' && $byte_counter < $len) {
                    $second  = $this->_next_char($this->_string, $byte_counter, true);
                    $dropone = " $first$second";
                }

                // rewind: the look-ahead must not consume any input
                $byte_counter = 0;
            }
        }

        while ($byte_counter < $len) {
            // inherited helper; advances $byte_counter past the char it returns
            $char = $this->_next_char($this->_string, $byte_counter, true);

            // language trigram detection
            if ($this->_compile_trigram) {
                if (!($b === ' ' && ($a === ' ' || $char === ' '))) {
                    $trigram = $a . $b . $char;

                    if (isset($this->_trigram[$trigram])) {
                        $this->_trigram[$trigram]++;
                    } else {
                        $this->_trigram[$trigram] = 1;
                    }
                }

                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                // single-byte chars outside A-Z / a-z are skipped, except the
                // apostrophe, which is included in the language models
                if ($this->_unicode_skip_symbols
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
                    && $char !== "'"
                ) {
                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup: fold the per-character counts into per-block counts
        if ($this->_compile_unicode) {
            foreach ($unicode_chars as $utf8_char => $count) {
                $search_result = $this->_unicode_block_name(
                    $this->_utf8char2unicode($utf8_char), $blocks, $block_count
                );

                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }

        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b !== ' ') {
                $tail = "$a$b ";

                if (isset($this->_trigram[$tail])) {
                    $this->_trigram[$tail]++;
                } else {
                    $this->_trigram[$tail] = 1;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge: only touch the entry if it was actually counted, so a
            // missing key is neither read nor created
            if (isset($dropone) && isset($this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}
346 | |||
347 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file | ||