]>
Commit | Line | Data |
---|---|---|
42c80841 NL |
1 | <?php |
2 | ||
3 | /** | |
4 | * This class represents a text sample to be parsed. | |
5 | * | |
6 | * @category Text | |
7 | * @package Text_LanguageDetect | |
8 | * @author Nicholas Pisarro | |
9 | * @copyright 2006 | |
10 | * @license BSD | |
11 | * @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $ | |
12 | * @link http://pear.php.net/package/Text_LanguageDetect/ | |
13 | * @link http://langdetect.blogspot.com/ | |
14 | */ | |
15 | ||
16 | /** | |
17 | * This class represents a text sample to be parsed. | |
18 | * | |
19 | * This separates the analysis of a text sample from the primary LanguageDetect | |
20 | * class. After a new profile has been built, the data can be retrieved using | |
21 | * the accessor functions. | |
22 | * | |
23 | * This class is intended to be used by the Text_LanguageDetect class, not | |
24 | * end-users. | |
25 | * | |
26 | * @category Text | |
27 | * @package Text_LanguageDetect | |
28 | * @author Nicholas Pisarro | |
29 | * @copyright 2006 | |
30 | * @license BSD | |
31 | * @version release: 0.2.3 | |
32 | */ | |
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * The piece of text being parsed
     *
     * @access private
     * @var    string
     */
    var $_string;

    /**
     * Trigram frequencies of the sample (trigram => number of occurrences)
     *
     * Declared under the name that analyze() and getTrigramFreqs() actually
     * use, so the accessor yields an empty array rather than an undeclared
     * property when no trigram has been counted.
     *
     * @access private
     * @var    array
     */
    var $_trigram = array();

    /**
     * Trigram ranks of the sample, as produced by _arr_rank()
     *
     * @access private
     * @var    array
     */
    var $_trigram_ranks = array();

    /**
     * Unicode blocks of the sample (block name => number of characters)
     *
     * @access private
     * @var    array
     */
    var $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
     * @access private
     * @var    bool
     */
    var $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
     * @access private
     * @var    bool
     */
    var $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
     * @access private
     * @var    bool
     */
    var $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
     * @access private
     * @var    bool
     */
    var $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
     * Stores the sample text and, when supplied, overrides the two database
     * filename properties.
     *
     * @access private
     * @param  string $string     string to be parsed
     * @param  string $db         optional value for $this->_db_filename
     * @param  string $unicode_db optional value for
     *                            $this->_unicode_db_filename
     */
    function __construct($string, $db = null, $unicode_db = null)
    {
        if (isset($db)) {
            $this->_db_filename = $db;
        }

        if (isset($unicode_db)) {
            $this->_unicode_db_filename = $unicode_db;
        }

        $this->_string = $string;
    }

    /**
     * Legacy class-named constructor
     *
     * Kept so that PHP 4 (which only knows class-named constructors) and any
     * code invoking this method explicitly keep working. PHP 8 no longer
     * treats a class-named method as a constructor, which is why the real
     * work lives in __construct().
     *
     * @access private
     * @param  string $string     string to be parsed
     * @param  string $db         optional value for $this->_db_filename
     * @param  string $unicode_db optional value for
     *                            $this->_unicode_db_filename
     */
    function Text_LanguageDetect_Parser($string, $db = null, $unicode_db = null)
    {
        $this->__construct($string, $db, $unicode_db);
    }

    /**
     * Returns true if a string is suitable for parsing
     *
     * A string is accepted when it is non-empty, longer than 3 bytes and
     * contains at least one non-whitespace character.
     *
     * NOTE(review): documented as static but declared without the `static`
     * keyword (PHP 4 style); PHP 8 rejects static calls to such methods.
     *
     * @static
     * @access public
     * @param  string $str input string to test
     * @return bool   true if acceptable, false if not
     */
    function validateString($str)
    {
        return !empty($str) && strlen($str) > 3 && preg_match('/\S/', $str);
    }

    /**
     * Turn on/off trigram counting
     *
     * @access public
     * @param  bool $bool true for on, false for off
     */
    function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * Turn on/off unicode block counting
     *
     * @access public
     * @param  bool $bool true for on, false for off
     */
    function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * Turn on/off padding the beginning of the sample string
     *
     * @access public
     * @param  bool $bool true for on, false for off
     */
    function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @access public
     * @param  bool $bool true for on, false for off
     */
    function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample
     *
     * @access public
     * @return array trigram ranks in the text sample
     */
    function &getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Returns the trigram frequency table
     *
     * Only used in testing to make sure the parser is working.
     *
     * @access public
     * @return array trigram frequencies in the text sample
     */
    function &getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * Returns the array of unicode blocks
     *
     * @access public
     * @return array unicode blocks in the text sample
     */
    function &getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute.
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information.
     *
     * Counts are added to the existing tables; nothing is reset here.
     *
     * @access public
     */
    function analyze()
    {
        $len          = strlen($this->_string);
        $byte_counter = 0;

        // unicode startup
        if ($this->_compile_unicode) {
            $blocks =& $this->_read_unicode_block_db();

            $block_count   = count($blocks);
            $unicode_chars = array();
        }

        // trigram startup
        if ($this->_compile_trigram) {
            // initialize them as blank so the parser will skip the first two
            // (since it skips trigrams with more than 2 contiguous spaces)
            $a = ' ';
            $b = ' ';

            // kludge
            // if it finds a valid trigram to start and the start pad option
            // is off, remember that trigram so it can be reduced by one
            // after parsing has finished
            if (!$this->_trigram_pad_start) {
                // peek at the first two characters of the sample
                // (presumably _next_char() advances $byte_counter by
                // reference, hence the reset below)
                $a = $this->_next_char($this->_string, $byte_counter, true);

                if ($a != ' ') {
                    $b = $this->_next_char($this->_string, $byte_counter, true);
                    $dropone = " $a$b";
                }

                // rewind so the main loop starts from the beginning
                $byte_counter = 0;
                $a = ' ';
                $b = ' ';
            }
        }

        while ($byte_counter < $len) {
            $char = $this->_next_char($this->_string, $byte_counter, true);

            // language trigram detection
            if ($this->_compile_trigram) {
                // skip windows whose middle char is a space next to another
                // space
                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                    $trigram = $a . $b . $char;

                    if (isset($this->_trigram[$trigram])) {
                        $this->_trigram[$trigram]++;
                    } else {
                        $this->_trigram[$trigram] = 1;
                    }
                }

                // slide the window
                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                // single-byte chars outside A-Z / a-z are skipped, except
                // the apostrophe, which is included in the language models
                if ($this->_unicode_skip_symbols
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
                    && $char != "'"
                ) {
                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup
        if ($this->_compile_unicode) {
            foreach ($unicode_chars as $utf8_char => $count) {
                $search_result = $this->_unicode_block_name(
                    $this->_utf8char2unicode($utf8_char),
                    $blocks,
                    $block_count
                );

                // -1 is treated as "no block found"
                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }

        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b != ' ') {
                $trailing = "$a$b ";

                if (isset($this->_trigram[$trailing])) {
                    $this->_trigram[$trailing]++;
                } else {
                    $this->_trigram[$trailing] = 1;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge
            // only touch the entry if it was really counted: a sample too
            // short to yield the leading trigram must not trigger an
            // undefined-index access or leave a bogus entry in the table
            if (isset($dropone) && isset($this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}
351 | ||
352 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ | |
353 | ||
354 | ?> |