]>
Commit | Line | Data |
---|---|---|
a50583fb MR |
1 | <?php |
2 | ||
3 | /** | |
4 | * This class represents a text sample to be parsed. | |
5 | * | |
6 | * @category Text | |
7 | * @package Text_LanguageDetect | |
8 | * @author Nicholas Pisarro | |
9 | * @copyright 2006 | |
10 | * @license BSD | |
11 | * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $ | |
12 | * @link http://pear.php.net/package/Text_LanguageDetect/ | |
13 | * @link http://langdetect.blogspot.com/ | |
14 | */ | |
15 | ||
16 | /** | |
17 | * This class represents a text sample to be parsed. | |
18 | * | |
19 | * This separates the analysis of a text sample from the primary LanguageDetect | |
20 | * class. After a new profile has been built, the data can be retrieved using | |
21 | * the accessor functions. | |
22 | * | |
23 | * This class is intended to be used by the Text_LanguageDetect class, not | |
24 | * end-users. | |
25 | * | |
26 | * @category Text | |
27 | * @package Text_LanguageDetect | |
28 | * @author Nicholas Pisarro | |
29 | * @copyright 2006 | |
30 | * @license BSD | |
31 | * @version release: 0.3.0 | |
32 | */ | |
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * The piece of text being parsed
     *
     * @access private
     * @var    string
     */
    var $_string;

    /**
     * Stores the trigram frequencies of the sample (trigram => count)
     *
     * The name matches what analyze() and getTrigramFreqs() actually use, so
     * the table is a declared property that starts out as an empty array.
     *
     * @access private
     * @var    array
     */
    var $_trigram = array();

    /**
     * Stores the trigram ranks of the sample
     *
     * @access private
     * @var    array
     */
    var $_trigram_ranks = array();

    /**
     * Stores the unicode blocks of the sample (block name => char count)
     *
     * @access private
     * @var    array
     */
    var $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
     * @access private
     * @var    bool
     */
    var $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
     * @access private
     * @var    bool
     */
    var $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
     * @access private
     * @var    bool
     */
    var $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
     * @access private
     * @var    bool
     */
    var $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
     * Declared as __construct() because a method that merely shares the
     * class name is not invoked by "new" on PHP 8 and later.
     *
     * @param string $string string to be parsed
     */
    function __construct($string)
    {
        $this->_string = $string;
    }

    /**
     * Legacy PHP 4 style constructor name, kept for explicit callers
     *
     * Must stay declared after __construct() so that PHP 5 does not raise
     * a "redefining constructor" strict-standards notice.
     *
     * @param string $string string to be parsed
     *
     * @deprecated use __construct() (i.e. the "new" operator) instead
     * @return void
     */
    function Text_LanguageDetect_Parser($string)
    {
        $this->__construct($string);
    }

    /**
     * Returns true if a string is suitable for parsing
     *
     * A sample qualifies when it is non-empty, longer than three bytes and
     * contains at least one non-whitespace character.
     *
     * @param string $str input string to test
     *
     * @return bool true if acceptable, false if not
     */
    public static function validateString($str)
    {
        return !empty($str)
            && strlen($str) > 3
            && (bool) preg_match('/\S/', $str);
    }

    /**
     * Turn on/off trigram counting
     *
     * @param bool $bool true for on, false for off
     *
     * @access public
     * @return void
     */
    function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * Turn on/off unicode block counting
     *
     * @param bool $bool true for on, false for off
     *
     * @access public
     * @return void
     */
    function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * Turn on/off padding the beginning of the sample string
     *
     * @param bool $bool true for on, false for off
     *
     * @access public
     * @return void
     */
    function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @param bool $bool true for on, false for off
     *
     * @access public
     * @return void
     */
    function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample
     *
     * @access public
     * @return array trigram ranks in the text sample
     */
    function &getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Return the trigram frequency table
     *
     * Only used in testing to make sure the parser is working.
     *
     * @access public
     * @return array trigram frequencies in the text sample
     */
    function &getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * Returns the array of unicode blocks
     *
     * @access public
     * @return array unicode blocks in the text sample
     */
    function &getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute.
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information.
     *
     * @access public
     * @return void
     */
    function analyze()
    {
        $len          = strlen($this->_string);
        $byte_counter = 0;

        // unicode startup
        if ($this->_compile_unicode) {
            $blocks      = $this->_read_unicode_block_db();
            $block_count = count($blocks);

            $unicode_chars = array();
        }

        // trigram startup
        if ($this->_compile_trigram) {
            // kludge
            // if the sample starts with a valid trigram and the start pad
            // option is off, remember that trigram so its count can be
            // reduced by one after parsing has finished
            if (!$this->_trigram_pad_start) {
                // never read past the end of the sample; a character that
                // is not there is treated as padding (a space)
                $first = ' ';
                if ($len > 0) {
                    $first = $this->_next_char(
                        $this->_string, $byte_counter, true
                    );
                }

                if ($first != ' ') {
                    $second = ' ';
                    if ($byte_counter < $len) {
                        $second = $this->_next_char(
                            $this->_string, $byte_counter, true
                        );
                    }
                    $dropone = " $first$second";
                }

                // rewind: the look-ahead must not consume any input
                $byte_counter = 0;
            }

            // start out blank so the parser skips the first two positions
            // (it skips trigrams with two or more contiguous spaces)
            $a = ' ';
            $b = ' ';
        }

        while ($byte_counter < $len) {
            $char = $this->_next_char($this->_string, $byte_counter, true);

            // language trigram detection
            if ($this->_compile_trigram) {
                // ignore windows where the middle char and one of its
                // neighbours are both spaces
                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                    $trigram = $a . $b . $char;

                    if (isset($this->_trigram[$trigram])) {
                        $this->_trigram[$trigram]++;
                    } else {
                        $this->_trigram[$trigram] = 1;
                    }
                }

                // slide the window forward by one character
                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                // optionally skip single-byte characters outside A-Z / a-z;
                // the apostrophe is never skipped since it is included in
                // the language models
                if ($this->_unicode_skip_symbols
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
                    && $char != "'"
                ) {
                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup: fold the per-character counts into block counts
        if ($this->_compile_unicode) {
            foreach ($unicode_chars as $utf8_char => $count) {
                $search_result = $this->_unicode_block_name(
                    $this->_utf8char2unicode($utf8_char), $blocks, $block_count
                );

                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }

        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b != ' ') {
                $trailing = "$a$b ";

                if (isset($this->_trigram[$trailing])) {
                    $this->_trigram[$trailing]++;
                } else {
                    $this->_trigram[$trailing] = 1;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge: take back the one count contributed by the start pad.
            // The isset() on the table entry keeps a trigram that was never
            // counted from being created as a bogus entry.
            if (isset($dropone) && isset($this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}
346 | ||
347 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ |