diff options
author | Maryana Rozhankivska <mariroz@mr.lviv.ua> | 2014-05-23 19:25:48 +0300 |
---|---|---|
committer | Maryana Rozhankivska <mariroz@mr.lviv.ua> | 2014-05-23 19:25:48 +0300 |
commit | d18ff7d9565f982bc15c5930123992d44614e1e2 (patch) | |
tree | c72ba88c10f9b31d513d1e82b8fa9eeb34cb4b74 /inc/3rdparty/libraries/language-detect | |
parent | 3ec62cf95ab4436923d4c665fad7aef226cbb822 (diff) | |
download | wallabag-d18ff7d9565f982bc15c5930123992d44614e1e2.tar.gz wallabag-d18ff7d9565f982bc15c5930123992d44614e1e2.tar.zst wallabag-d18ff7d9565f982bc15c5930123992d44614e1e2.zip |
two small unimportant forgotten changes to 3.2 version of full-text-rss, issue #694
Diffstat (limited to 'inc/3rdparty/libraries/language-detect')
-rw-r--r-- | inc/3rdparty/libraries/language-detect/Parser.php | 354 |
1 files changed, 0 insertions, 354 deletions
diff --git a/inc/3rdparty/libraries/language-detect/Parser.php b/inc/3rdparty/libraries/language-detect/Parser.php deleted file mode 100644 index 7f15fa98..00000000 --- a/inc/3rdparty/libraries/language-detect/Parser.php +++ /dev/null | |||
@@ -1,354 +0,0 @@ | |||
1 | <?php | ||
2 | |||
3 | /** | ||
4 | * This class represents a text sample to be parsed. | ||
5 | * | ||
6 | * @category Text | ||
7 | * @package Text_LanguageDetect | ||
8 | * @author Nicholas Pisarro | ||
9 | * @copyright 2006 | ||
10 | * @license BSD | ||
11 | * @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $ | ||
12 | * @link http://pear.php.net/package/Text_LanguageDetect/ | ||
13 | * @link http://langdetect.blogspot.com/ | ||
14 | */ | ||
15 | |||
16 | /** | ||
17 | * This class represents a text sample to be parsed. | ||
18 | * | ||
19 | * This separates the analysis of a text sample from the primary LanguageDetect | ||
20 | * class. After a new profile has been built, the data can be retrieved using | ||
21 | * the accessor functions. | ||
22 | * | ||
23 | * This class is intended to be used by the Text_LanguageDetect class, not | ||
24 | * end-users. | ||
25 | * | ||
26 | * @category Text | ||
27 | * @package Text_LanguageDetect | ||
28 | * @author Nicholas Pisarro | ||
29 | * @copyright 2006 | ||
30 | * @license BSD | ||
31 | * @version release: 0.2.3 | ||
32 | */ | ||
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * The piece of text being parsed.
     *
     * @access private
     * @var string
     */
    public $_string;

    /**
     * Unused legacy declaration, kept only for backward compatibility.
     *
     * No method of this class reads or writes it; the trigram frequency
     * table actually lives in $_trigram.
     *
     * @access private
     * @var array
     * @deprecated use $_trigram / getTrigramFreqs()
     */
    public $_trigrams = array();

    /**
     * Trigram frequencies of the sample (trigram => occurrence count).
     *
     * Declared explicitly so that getTrigramFreqs() returns an array even
     * before analyze() has run, and so that analyze() does not create the
     * property dynamically.
     *
     * @access private
     * @var array
     */
    public $_trigram = array();

    /**
     * Trigram ranks of the sample, as produced by _arr_rank().
     *
     * @access private
     * @var array
     */
    public $_trigram_ranks = array();

    /**
     * Unicode blocks of the sample (block name => character count).
     *
     * @access private
     * @var array
     */
    public $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges.
     *
     * @access private
     * @var bool
     */
    public $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams.
     *
     * @access private
     * @var bool
     */
    public $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string.
     *
     * @access private
     * @var bool
     */
    public $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars.
     *
     * @access private
     * @var bool
     */
    public $_unicode_skip_symbols = true;

    /**
     * Constructor.
     *
     * Stores the sample and, when given, overrides the database filenames
     * ($_db_filename / $_unicode_db_filename). The parent constructor is
     * deliberately not invoked, exactly as in the legacy constructor.
     *
     * @access private
     * @param string      $string     string to be parsed
     * @param string|null $db         optional value for $_db_filename
     * @param string|null $unicode_db optional value for $_unicode_db_filename
     */
    public function __construct($string, $db = null, $unicode_db = null)
    {
        if (isset($db)) {
            $this->_db_filename = $db;
        }
        if (isset($unicode_db)) {
            $this->_unicode_db_filename = $unicode_db;
        }
        $this->_string = $string;
    }

    /**
     * Legacy PHP 4 style constructor name.
     *
     * PHP 8 no longer treats a method named after the class as a
     * constructor, so the real work lives in __construct(). This method is
     * kept as a plain forwarder for code that still calls it by name.
     *
     * @access private
     * @param string      $string     string to be parsed
     * @param string|null $db         optional value for $_db_filename
     * @param string|null $unicode_db optional value for $_unicode_db_filename
     */
    public function Text_LanguageDetect_Parser($string, $db = null, $unicode_db = null)
    {
        $this->__construct($string, $db, $unicode_db);
    }

    /**
     * Returns true if a string is suitable for parsing.
     *
     * A string is accepted when it is longer than 3 bytes and contains at
     * least one non-whitespace character.
     *
     * Declared static to match its documented usage; it can still be called
     * on an instance.
     *
     * @static
     * @access public
     * @param string $str input string to test
     * @return bool true if acceptable, false if not
     */
    public static function validateString($str)
    {
        return !empty($str)
            && strlen($str) > 3
            && preg_match('/\S/', $str) === 1;
    }

    /**
     * Turn on/off trigram counting.
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * Turn on/off unicode block counting.
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * Turn on/off padding the beginning of the sample string.
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample.
     *
     * @access public
     * @return array trigram ranks in the text sample (by reference)
     */
    public function &getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Returns the trigram frequency table.
     *
     * Only used in testing to make sure the parser is working.
     *
     * @access public
     * @return array trigram frequencies in the text sample (by reference)
     */
    public function &getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * Returns the array of unicode blocks.
     *
     * @access public
     * @return array unicode blocks in the text sample (by reference)
     */
    public function &getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation.
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute.
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information. Counts are added to whatever the tables already hold, so
     * calling analyze() twice on the same instance accumulates.
     *
     * @access public
     */
    public function analyze()
    {
        // Byte length on purpose: $byte_counter is compared against it and is
        // presumably advanced by reference inside _next_char() (nothing in
        // this method increments it).
        $len          = strlen($this->_string);
        $byte_counter = 0;

        // unicode startup
        if ($this->_compile_unicode) {
            $blocks =& $this->_read_unicode_block_db();

            $block_count = count($blocks);

            $skipped_count = 0;
            $unicode_chars = array();
        }

        // trigram startup
        if ($this->_compile_trigram) {
            // initialize them as blank so the parser will skip the first two
            // (since it skips trigrams with more than 2 contiguous spaces)
            $a = ' ';
            $b = ' ';

            // kludge
            // if it finds a valid trigram to start and the start pad option is
            // off, then set a variable that will be used to reduce this
            // trigram after parsing has finished
            if (!$this->_trigram_pad_start) {
                $a = $this->_next_char($this->_string, $byte_counter, true);

                if ($a != ' ') {
                    // NOTE(review): for a one-character sample this reads past
                    // the end of the string; what _next_char() returns then is
                    // not visible from this class.
                    $b       = $this->_next_char($this->_string, $byte_counter, true);
                    $dropone = " $a$b";
                }

                // rewind: the probe above must not consume any input
                $byte_counter = 0;
                $a = ' ';
                $b = ' ';
            }
        }

        while ($byte_counter < $len) {
            $char = $this->_next_char($this->_string, $byte_counter, true);

            // language trigram detection
            if ($this->_compile_trigram) {
                // skip trigrams that are nothing but padding around a space
                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                    $trigram = $a . $b . $char;

                    if (isset($this->_trigram[$trigram])) {
                        $this->_trigram[$trigram]++;
                    } else {
                        $this->_trigram[$trigram] = 1;
                    }
                }

                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                if ($this->_unicode_skip_symbols
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
                    && $char != "'"
                ) {
                    // does not skip the apostrophe
                    // since it's included in the language
                    // models
                    $skipped_count++;
                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup
        if ($this->_compile_unicode) {
            foreach ($unicode_chars as $utf8_char => $count) {
                $search_result = $this->_unicode_block_name(
                    $this->_utf8char2unicode($utf8_char),
                    $blocks,
                    $block_count
                );

                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }

        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b != ' ') {
                $tail = "$a$b ";

                if (isset($this->_trigram[$tail])) {
                    $this->_trigram[$tail]++;
                } else {
                    $this->_trigram[$tail] = 1;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge
            // The isset() guard matters for a one-character sample: the main
            // loop then runs only once and never counts the probed trigram, so
            // decrementing it blindly would insert a bogus null entry.
            if (isset($dropone) && isset($this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}
351 | |||
352 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ | ||
353 | |||
354 | ?> | ||