[github/wallabag/wallabag.git] / inc / 3rdparty / libraries / language-detect / LanguageDetect / Parser.php

<?php

/**
 * This class represents a text sample to be parsed.
 *
 * @category    Text
 * @package     Text_LanguageDetect
 * @author      Nicholas Pisarro
 * @copyright   2006
 * @license     BSD
 * @version     CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
 * @link        http://pear.php.net/package/Text_LanguageDetect/
 * @link        http://langdetect.blogspot.com/
 */

/**
 * This class represents a text sample to be parsed.
 *
 * This separates the analysis of a text sample from the primary LanguageDetect
 * class. After a new profile has been built, the data can be retrieved using
 * the accessor functions.
 *
 * This class is intended to be used by the Text_LanguageDetect class, not 
 * end-users.
 *
 * @category    Text
 * @package     Text_LanguageDetect
 * @author      Nicholas Pisarro
 * @copyright   2006
 * @license     BSD
 * @version     release: 0.3.0
 */
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * the piece of text being parsed
     *
     * @access  private
     * @var     string
     */
    var $_string;

    /**
     * stores the trigram frequencies of the sample
     *
     * Keys are trigrams, values are occurrence counts. This is the property
     * that analyze() fills and getTrigramFreqs() returns.
     *
     * @access  private
     * @var     array
     */
    var $_trigram = array();

    /**
     * Unused by this class; analyze() and getTrigramFreqs() work on
     * $_trigram instead. Kept only so that external code referring to the
     * old property name keeps working.
     *
     * @access  private
     * @var     array
     */
    var $_trigrams = array();

    /**
     * stores the trigram ranks of the sample
     *
     * @access  private
     * @var     array
     */
    var $_trigram_ranks = array();

    /**
     * stores the unicode blocks of the sample
     *
     * @access  private
     * @var     array
     */
    var $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
     * @access  private
     * @var     bool
     */
    var $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
     * @access  private
     * @var     bool
     */
    var $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
     * @access  private
     * @var     bool
     */
    var $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
     * @access  private
     * @var     bool
     */
    var $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
     * The parent constructor is deliberately not invoked, exactly as with the
     * original PHP 4-style constructor.
     *
     * @access  private
     * @param   string  $string     string to be parsed
     */
    function __construct($string)
    {
        $this->_string = $string;
    }

    /**
     * PHP 4-style constructor, kept for backward compatibility
     *
     * PHP 8 no longer treats a method named after its class as a
     * constructor, so the real work lives in __construct(). This method
     * remains for code that calls it explicitly.
     *
     * @access  private
     * @param   string  $string     string to be parsed
     */
    function Text_LanguageDetect_Parser($string)
    {
        $this->__construct($string);
    }

    /**
     * Returns true if a string is suitable for parsing
     *
     * A string is accepted when it is non-empty, longer than 3 bytes and
     * contains at least one non-whitespace character.
     *
     * @param   string  $str    input string to test
     * @return  bool            true if acceptable, false if not
     */
    public static function validateString($str)
    {
        return !empty($str) && strlen($str) > 3 && preg_match('/\S/', $str);
    }

    /**
     * turn on/off trigram counting
     *
     * @access  public
     * @param   bool    $bool true for on, false for off
     */
    function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * turn on/off unicode block counting
     *
     * @access  public
     * @param   bool    $bool true for on, false for off
     */
    function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * turn on/off padding the beginning of the sample string
     *
     * @access  public
     * @param   bool    $bool true for on, false for off
     */
    function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @access  public
     * @param   bool    $bool true for on, false for off
     */
    function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample
     *
     * @access  public
     * @return  array    trigram ranks in the text sample
     */
    function &getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Return the trigram frequency table
     *
     * only used in testing to make sure the parser is working
     *
     * @access  public
     * @return  array    trigram frequencies in the text sample
     */
    function &getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * returns the array of unicode blocks
     *
     * @access  public
     * @return  array   unicode blocks in the text sample
     */
    function &getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information.
     *
     * The helpers _next_char(), _read_unicode_block_db(),
     * _unicode_block_name(), _utf8char2unicode() and _arr_rank() are not
     * defined in this class; presumably they are inherited from
     * Text_LanguageDetect.
     *
     * @access public
     */
    function analyze()
    {
        $len = strlen($this->_string);
        $byte_counter = 0;

        // unicode startup
        if ($this->_compile_unicode) {
            $blocks = $this->_read_unicode_block_db();
            $block_count = count($blocks);

            $unicode_chars = array();
        }

        // trigram startup
        if ($this->_compile_trigram) {
            // initialize them as blank so the parser will skip the first two
            // (since it skips trigrams with more than 2 contiguous spaces)
            $a = ' ';
            $b = ' ';

            // kludge
            // if it finds a valid trigram to start and the start pad option is
            // off, then set a variable that will be used to reduce this
            // trigram after parsing has finished
            if (!$this->_trigram_pad_start) {
                // peek at the first two characters; _next_char() presumably
                // advances $byte_counter, which is why it is reset below
                $a = $this->_next_char($this->_string, $byte_counter, true);

                if ($a != ' ') {
                    $b = $this->_next_char($this->_string, $byte_counter, true);
                    $dropone = " $a$b";
                }

                $byte_counter = 0;
                $a = ' ';
                $b = ' ';
            }
        }

        while ($byte_counter < $len) {
            $char = $this->_next_char($this->_string, $byte_counter, true);

            // language trigram detection
            if ($this->_compile_trigram) {
                // skip any trigram that would contain two adjacent spaces
                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                    $trigram = $a . $b . $char;

                    if (isset($this->_trigram[$trigram])) {
                        $this->_trigram[$trigram]++;
                    } else {
                        $this->_trigram[$trigram] = 1;
                    }
                }

                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                if ($this->_unicode_skip_symbols
                        && strlen($char) == 1
                        && ($char < 'A' || $char > 'z'
                        || ($char > 'Z' && $char < 'a'))
                        && $char != "'") {  // does not skip the apostrophe
                                            // since it's included in the language
                                            // models

                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup
        if ($this->_compile_unicode) {
            // resolve each distinct character to its block once and add the
            // character's total count to that block
            foreach ($unicode_chars as $utf8_char => $count) {
                $search_result = $this->_unicode_block_name(
                        $this->_utf8char2unicode($utf8_char), $blocks, $block_count);

                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }

        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b != ' ') {
                if (!isset($this->_trigram["$a$b "])) {
                    $this->_trigram["$a$b "] = 1;
                } else {
                    $this->_trigram["$a$b "]++;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge
            // The isset() guard on the table entry matters for very short
            // samples, where the start trigram may never have been counted;
            // without it an undefined index is read and a bogus null entry
            // is left behind in the table.
            if (isset($dropone) && isset($this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}

/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
Commit	Line	Data
a50583fb MR	1	<?php
	2
	3	/**
	4	* This class represents a text sample to be parsed.
	5	*
	6	* @category Text
	7	* @package Text_LanguageDetect
	8	* @author Nicholas Pisarro
	9	* @copyright 2006
	10	* @license BSD
	11	* @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
	12	* @link http://pear.php.net/package/Text_LanguageDetect/
	13	* @link http://langdetect.blogspot.com/
	14	*/
	15
	16	/**
	17	* This class represents a text sample to be parsed.
	18	*
	19	* This separates the analysis of a text sample from the primary LanguageDetect
	20	* class. After a new profile has been built, the data can be retrieved using
	21	* the accessor functions.
	22	*
	23	* This class is intended to be used by the Text_LanguageDetect class, not
	24	* end-users.
	25	*
	26	* @category Text
	27	* @package Text_LanguageDetect
	28	* @author Nicholas Pisarro
	29	* @copyright 2006
	30	* @license BSD
	31	* @version release: 0.3.0
	32	*/
	33	class Text_LanguageDetect_Parser extends Text_LanguageDetect
	34	{
	35	/**
	36	* the piece of text being parsed
	37	*
	38	* @access private
	39	* @var string
	40	*/
	41	var $_string;
	42
	43	/**
	44	* stores the trigram frequencies of the sample
	45	*
	46	* @access private
	47	* @var string
	48	*/
	49	var $_trigrams = array();
	50
	51	/**
	52	* stores the trigram ranks of the sample
	53	*
	54	* @access private
	55	* @var array
	56	*/
	57	var $_trigram_ranks = array();
	58
	59	/**
	60	* stores the unicode blocks of the sample
	61	*
	62	* @access private
	63	* @var array
	64	*/
65	var $_unicode_blocks = array();
66
67	/**
68	* Whether the parser should compile the unicode ranges
69	*
70	* @access private
71	* @var bool
72	*/
73	var $_compile_unicode = false;
74
75	/**
76	* Whether the parser should compile trigrams
77	*
78	* @access private
79	* @var bool
80	*/
81	var $_compile_trigram = false;
82
83	/**
84	* Whether the trigram parser should pad the beginning of the string
85	*
86	* @access private
87	* @var bool
88	*/
89	var $_trigram_pad_start = false;
90
91	/**
92	* Whether the unicode parser should skip non-alphabetical ascii chars
93	*
94	* @access private
95	* @var bool
96	*/
97	var $_unicode_skip_symbols = true;
98
99	/**
100	* Constructor
101	*
102	* @access private
103	* @param string $string string to be parsed
104	*/
105	function Text_LanguageDetect_Parser($string) {
106	$this->_string = $string;
107	}
108
109	/**
110	* Returns true if a string is suitable for parsing
111	*
112	* @param string $str input string to test
113	* @return bool true if acceptable, false if not
114	*/
115	public static function validateString($str) {
116	if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
117	return true;
118	} else {
119	return false;
120	}
121	}
122
123	/**
124	* turn on/off trigram counting
125	*
126	* @access public
127	* @param bool $bool true for on, false for off
128	*/
129	function prepareTrigram($bool = true)
130	{
131	$this->_compile_trigram = $bool;
132	}
133
134	/**
135	* turn on/off unicode block counting
136	*
137	* @access public
138	* @param bool $bool true for on, false for off
139	*/
140	function prepareUnicode($bool = true)
141	{
142	$this->_compile_unicode = $bool;
143	}
144
145	/**
146	* turn on/off padding the beginning of the sample string
147	*
148	* @access public
149	* @param bool $bool true for on, false for off
150	*/
151	function setPadStart($bool = true)
152	{
153	$this->_trigram_pad_start = $bool;
154	}
155
156	/**
157	* Should the unicode block counter skip non-alphabetical ascii chars?
158	*
159	* @access public
160	* @param bool $bool true for on, false for off
161	*/
162	function setUnicodeSkipSymbols($bool = true)
163	{
164	$this->_unicode_skip_symbols = $bool;
165	}
166
167	/**
168	* Returns the trigram ranks for the text sample
169	*
170	* @access public
171	* @return array trigram ranks in the text sample
172	*/
173	function &getTrigramRanks()
174	{
175	return $this->_trigram_ranks;
176	}
177
178	/**
179	* Return the trigram freqency table
180	*
181	* only used in testing to make sure the parser is working
182	*
183	* @access public
184	* @return array trigram freqencies in the text sample
185	*/
186	function &getTrigramFreqs()
187	{
188	return $this->_trigram;
189	}
190
191	/**
192	* returns the array of unicode blocks
193	*
194	* @access public
195	* @return array unicode blocks in the text sample
196	*/
197	function &getUnicodeBlocks()
198	{
199	return $this->_unicode_blocks;
200	}
201
202	/**
203	* Executes the parsing operation
204	*
205	* Be sure to call the set*() functions to set options and the
206	* prepare*() functions first to tell it what kind of data to compute
207	*
208	* Afterwards the get*() functions can be used to access the compiled
209	* information.
210	*
211	* @access public
212	*/
213	function analyze()
214	{
215	$len = strlen($this->_string);
216	$byte_counter = 0;
217
218
219	// unicode startup
220	if ($this->_compile_unicode) {
221	$blocks = $this->_read_unicode_block_db();
222	$block_count = count($blocks);
223
224	$skipped_count = 0;
225	$unicode_chars = array();
226	}
227
228	// trigram startup
229	if ($this->_compile_trigram) {
230	// initialize them as blank so the parser will skip the first two
231	// (since it skips trigrams with more than 2 contiguous spaces)
232	$a = ' ';
233	$b = ' ';
234
235	// kludge
236	// if it finds a valid trigram to start and the start pad option is
237	// off, then set a variable that will be used to reduce this
238	// trigram after parsing has finished
239	if (!$this->_trigram_pad_start) {
240	$a = $this->_next_char($this->_string, $byte_counter, true);
241
242	if ($a != ' ') {
243	$b = $this->_next_char($this->_string, $byte_counter, true);
244	$dropone = " $a$b";
245	}
246
247	$byte_counter = 0;
248	$a = ' ';
249	$b = ' ';
250	}
251	}
252
253	while ($byte_counter < $len) {
254	$char = $this->_next_char($this->_string, $byte_counter, true);
255
256
257	// language trigram detection
258	if ($this->_compile_trigram) {
259	if (!($b == ' ' && ($a == ' ' \|\| $char == ' '))) {
260	if (!isset($this->_trigram[$a . $b . $char])) {
261	$this->_trigram[$a . $b . $char] = 1;
262	} else {
263	$this->_trigram[$a . $b . $char]++;
264	}
265	}
266
267	$a = $b;
268	$b = $char;
269	}
270
271	// unicode block detection
272	if ($this->_compile_unicode) {
273	if ($this->_unicode_skip_symbols
274	&& strlen($char) == 1
275	&& ($char < 'A' \|\| $char > 'z'
276	\|\| ($char > 'Z' && $char < 'a'))
277	&& $char != "'") { // does not skip the apostrophe
278	// since it's included in the language
279	// models
280
281	$skipped_count++;
282	continue;
283	}
284
285	// build an array of all the characters
286	if (isset($unicode_chars[$char])) {
287	$unicode_chars[$char]++;
288	} else {
289	$unicode_chars[$char] = 1;
290	}
291	}
292
293	// todo: add byte detection here
294	}
295
296	// unicode cleanup
297	if ($this->_compile_unicode) {
298	foreach ($unicode_chars as $utf8_char => $count) {
299	$search_result = $this->_unicode_block_name(
300	$this->_utf8char2unicode($utf8_char), $blocks, $block_count);
301
302	if ($search_result != -1) {
303	$block_name = $search_result[2];
304	} else {
305	$block_name = '[Malformatted]';
306	}
307
308	if (isset($this->_unicode_blocks[$block_name])) {
309	$this->_unicode_blocks[$block_name] += $count;
310	} else {
311	$this->_unicode_blocks[$block_name] = $count;
312	}
313	}
314	}
315
316
317	// trigram cleanup
318	if ($this->_compile_trigram) {
319	// pad the end
320	if ($b != ' ') {
321	if (!isset($this->_trigram["$a$b "])) {
322	$this->_trigram["$a$b "] = 1;
323	} else {
324	$this->_trigram["$a$b "]++;
325	}
326	}
327
328	// perl compatibility; Language::Guess does not pad the beginning
329	// kludge
330	if (isset($dropone)) {
331	if ($this->_trigram[$dropone] == 1) {
332	unset($this->_trigram[$dropone]);
333	} else {
334	$this->_trigram[$dropone]--;
335	}
336	}
337
338	if (!empty($this->_trigram)) {
339	$this->_trigram_ranks = $this->_arr_rank($this->_trigram);
340	} else {
341	$this->_trigram_ranks = array();
342	}
343	}
344	}
345	}
346
347	/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */