]> git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/3rdparty/libraries/language-detect/Parser.php
Updated polish translation
[github/wallabag/wallabag.git] / inc / 3rdparty / libraries / language-detect / Parser.php
1 <?php
2
3 /**
4 * This class represents a text sample to be parsed.
5 *
6 * @category Text
7 * @package Text_LanguageDetect
8 * @author Nicholas Pisarro
9 * @copyright 2006
10 * @license BSD
11 * @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $
12 * @link http://pear.php.net/package/Text_LanguageDetect/
13 * @link http://langdetect.blogspot.com/
14 */
15
16 /**
17 * This class represents a text sample to be parsed.
18 *
19 * This separates the analysis of a text sample from the primary LanguageDetect
20 * class. After a new profile has been built, the data can be retrieved using
21 * the accessor functions.
22 *
23 * This class is intended to be used by the Text_LanguageDetect class, not
24 * end-users.
25 *
26 * @category Text
27 * @package Text_LanguageDetect
28 * @author Nicholas Pisarro
29 * @copyright 2006
30 * @license BSD
31 * @version release: 0.2.3
32 */
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * the piece of text being parsed
     *
     * @access private
     * @var string
     */
    public $_string;

    /**
     * stores the trigram frequencies of the sample
     *
     * Keys are the trigram strings, values are occurrence counts. This is
     * the property every method of this class actually reads and writes.
     *
     * @access private
     * @var array
     */
    public $_trigram = array();

    /**
     * legacy, unused property
     *
     * Earlier revisions declared the frequency table under this name while
     * the code used $_trigram. It is kept only so that external code that
     * touches it keeps working; nothing in this class reads or writes it.
     *
     * @access private
     * @var array
     * @deprecated use $_trigram (via getTrigramFreqs()) instead
     */
    public $_trigrams = array();

    /**
     * stores the trigram ranks of the sample
     *
     * @access private
     * @var array
     */
    public $_trigram_ranks = array();

    /**
     * stores the unicode blocks of the sample
     *
     * Keys are block names, values are the number of characters of the
     * sample that fell into that block.
     *
     * @access private
     * @var array
     */
    public $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
     * @access private
     * @var bool
     */
    public $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
     * @access private
     * @var bool
     */
    public $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
     * @access private
     * @var bool
     */
    public $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
     * @access private
     * @var bool
     */
    public $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
     * Note that the parent constructor is intentionally not invoked, which
     * matches the behaviour of the original PHP4-style constructor.
     *
     * @access private
     * @param string      $string     string to be parsed
     * @param string|null $db         optional language database filename;
     *                                stored in $_db_filename when given
     * @param string|null $unicode_db optional unicode block database
     *                                filename; stored in
     *                                $_unicode_db_filename when given
     */
    public function __construct($string, $db = null, $unicode_db = null)
    {
        if (isset($db)) {
            $this->_db_filename = $db;
        }

        if (isset($unicode_db)) {
            $this->_unicode_db_filename = $unicode_db;
        }

        $this->_string = $string;
    }

    /**
     * PHP4-style constructor, kept for backward compatibility
     *
     * PHP 8 no longer treats a method named after the class as the
     * constructor, so the real initialisation lives in __construct() and
     * this method merely forwards to it.
     *
     * @access private
     * @param string      $string     string to be parsed
     * @param string|null $db         optional language database filename
     * @param string|null $unicode_db optional unicode block database filename
     */
    public function Text_LanguageDetect_Parser($string, $db = null, $unicode_db = null)
    {
        $this->__construct($string, $db, $unicode_db);
    }

    /**
     * Returns true if a string is suitable for parsing
     *
     * A string is accepted when it is non-empty, longer than 3 bytes and
     * contains at least one non-whitespace character.
     *
     * Declared static (as the documentation always stated) so that static
     * calls keep working on PHP 8; calling it on an instance still works.
     *
     * @static
     * @access public
     * @param string $str input string to test
     * @return bool true if acceptable, false if not
     */
    public static function validateString($str)
    {
        if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
            return true;
        }

        return false;
    }

    /**
     * turn on/off trigram counting
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * turn on/off unicode block counting
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * turn on/off padding the beginning of the sample string
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample
     *
     * @access public
     * @return array trigram ranks in the text sample (returned by reference)
     */
    public function &getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Return the trigram frequency table
     *
     * only used in testing to make sure the parser is working
     *
     * @access public
     * @return array trigram frequencies in the text sample (returned by
     *               reference)
     */
    public function &getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * returns the array of unicode blocks
     *
     * @access public
     * @return array unicode blocks in the text sample (returned by reference)
     */
    public function &getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information.
     *
     * The sample is walked once, character by character, using the
     * inherited _next_char() helper. Depending on the prepare*() flags the
     * loop feeds the trigram frequency table, the per-character counts used
     * for unicode block detection, or both.
     *
     * @access public
     */
    public function analyze()
    {
        $len = strlen($this->_string);
        $byte_counter = 0;

        // unicode startup
        if ($this->_compile_unicode) {
            $blocks =& $this->_read_unicode_block_db();

            $block_count = count($blocks);

            // character => number of occurrences in the sample
            $unicode_chars = array();
        }

        // trigram startup
        if ($this->_compile_trigram) {
            // initialize them as blank so the parser will skip the first two
            // (since it skips trigrams with more than 2 contiguous spaces)
            $a = ' ';
            $b = ' ';

            // kludge
            // if it finds a valid trigram to start and the start pad option is
            // off, then set a variable that will be used to reduce this
            // trigram after parsing has finished
            if (!$this->_trigram_pad_start) {
                // NOTE(review): the third argument (true) is passed through
                // to the inherited helper unchanged; its exact meaning is
                // defined in the parent class.
                $a = $this->_next_char($this->_string, $byte_counter, true);

                if ($a != ' ') {
                    $b = $this->_next_char($this->_string, $byte_counter, true);
                    $dropone = " $a$b";
                }

                // the peek above consumed input; rewind so the main loop
                // starts from the first byte again
                $byte_counter = 0;
                $a = ' ';
                $b = ' ';
            }
        }

        while ($byte_counter < $len) {
            $char = $this->_next_char($this->_string, $byte_counter, true);

            // language trigram detection
            if ($this->_compile_trigram) {
                // skip windows whose middle char is a space next to
                // another space (i.e. two contiguous spaces)
                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                    $trigram = $a . $b . $char;

                    if (!isset($this->_trigram[$trigram])) {
                        $this->_trigram[$trigram] = 1;
                    } else {
                        $this->_trigram[$trigram]++;
                    }
                }

                // slide the window
                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                if ($this->_unicode_skip_symbols
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
                    && $char != "'"
                ) {
                    // single-byte char that is not an ascii letter: skip it.
                    // does not skip the apostrophe since it's included in
                    // the language models
                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup
        if ($this->_compile_unicode) {
            // resolve each distinct character to its block once and add the
            // character's occurrence count to that block's total
            foreach ($unicode_chars as $utf8_char => $count) {
                $search_result = $this->_unicode_block_name(
                    $this->_utf8char2unicode($utf8_char),
                    $blocks,
                    $block_count
                );

                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }

        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b != ' ') {
                if (!isset($this->_trigram["$a$b "])) {
                    $this->_trigram["$a$b "] = 1;
                } else {
                    $this->_trigram["$a$b "]++;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge
            // The isset() on the table entry guards against very short
            // samples where the leading trigram was never actually counted.
            if (isset($dropone) && isset($this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}
351
352 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
353
354 ?>