two small unimportant forgotten changes to 3.2 version of full-text-rss, issue #694

author: Maryana Rozhankivska <mariroz@mr.lviv.ua> 2014-05-23 19:25:48 +0300
committer: Maryana Rozhankivska <mariroz@mr.lviv.ua> 2014-05-23 19:25:48 +0300
commit: d18ff7d9565f982bc15c5930123992d44614e1e2 (patch)
tree: c72ba88c10f9b31d513d1e82b8fa9eeb34cb4b74 /inc/3rdparty/libraries/language-detect
parent: 3ec62cf95ab4436923d4c665fad7aef226cbb822 (diff)
download: wallabag-d18ff7d9565f982bc15c5930123992d44614e1e2.tar.gz
wallabag-d18ff7d9565f982bc15c5930123992d44614e1e2.tar.zst
wallabag-d18ff7d9565f982bc15c5930123992d44614e1e2.zip
1 files changed, 0 insertions, 354 deletions
diff --git a/inc/3rdparty/libraries/language-detect/Parser.php b/inc/3rdparty/libraries/language-detect/Parser.php
deleted file mode 100644
index 7f15fa98..00000000
--- a/inc/3rdparty/libraries/language-detect/Parser.php
+++ /dev/null
@@ -1,354 +0,0 @@
-<?php
-/**
- * This class represents a text sample to be parsed.
- *
- * @category    Text
- * @package     Text_LanguageDetect
- * @author      Nicholas Pisarro
- * @copyright   2006
- * @license     BSD
- * @version     CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $
- * @link        http://pear.php.net/package/Text_LanguageDetect/
- * @link        http://langdetect.blogspot.com/
- */
-/**
- * This class represents a text sample to be parsed.
- *
- * This separates the analysis of a text sample from the primary LanguageDetect
- * class. After a new profile has been built, the data can be retrieved using
- * the accessor functions.
- *
- * This class is intended to be used by the Text_LanguageDetect class, not 
- * end-users.
- *
- * @category    Text
- * @package     Text_LanguageDetect
- * @author      Nicholas Pisarro
- * @copyright   2006
- * @license     BSD
- * @version     release: 0.2.3
- */
-class Text_LanguageDetect_Parser extends Text_LanguageDetect
-{
-    /**
-     * the piece of text being parsed
-     *
-     * @access  private
-     * @var     string
-     */
-    var $_string;
-    /**
-     * stores the trigram frequencies of the sample
-     *
-     * @access  private
-     * @var     string
-     */
-    var $_trigrams = array();
-    /**
-     * stores the trigram ranks of the sample
-     *
-     * @access  private
-     * @var     array
-     */
-    var $_trigram_ranks = array();
-    /**
-     * stores the unicode blocks of the sample
-     *
-     * @access  private
-     * @var     array
-     */
-    var $_unicode_blocks = array();
-    
-    /**
-     * Whether the parser should compile the unicode ranges
-     * 
-     * @access  private
-     * @var     bool
-     */
-    var $_compile_unicode = false;
-    /**
-     * Whether the parser should compile trigrams
-     *
-     * @access  private
-     * @var     bool
-     */
-    var $_compile_trigram = false;
-    /**
-     * Whether the trigram parser should pad the beginning of the string
-     *
-     * @access  private
-     * @var     bool
-     */
-    var $_trigram_pad_start = false;
-    /**
-     * Whether the unicode parser should skip non-alphabetical ascii chars
-     *
-     * @access  private
-     * @var     bool
-     */
-    var $_unicode_skip_symbols = true;
-    /**
-     * Constructor
-     *
-     * @access  private
-     * @param   string  $string     string to be parsed
-     */
-    function Text_LanguageDetect_Parser($string, $db=null, $unicode_db=null) {
-                if (isset($db)) $this->_db_filename = $db;
-                if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;      
-        $this->_string = $string;
-    }
-    /**
-     * Returns true if a string is suitable for parsing
-     *
-     * @static
-     * @access  public
-     * @param   string  $str    input string to test
-     * @return  bool            true if acceptable, false if not
-     */
-    function validateString($str) {
-        if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
-            return true;
-        } else {
-            return false;
-        }
-    }
-    /**
-     * turn on/off trigram counting
-     *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
-     */
-    function prepareTrigram($bool = true)
-    {
-        $this->_compile_trigram = $bool;
-    }
-    /**
-     * turn on/off unicode block counting
-     *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
-     */
-    function prepareUnicode($bool = true)
-    {
-        $this->_compile_unicode = $bool;
-    }
-    /**
-     * turn on/off padding the beginning of the sample string
-     *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
-     */
-    function setPadStart($bool = true)
-    {
-        $this->_trigram_pad_start = $bool;
-    }
-    /**
-     * Should the unicode block counter skip non-alphabetical ascii chars?
-     *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
-     */
-    function setUnicodeSkipSymbols($bool = true)
-    {
-        $this->_unicode_skip_symbols = $bool;
-    }
-    /**
-     * Returns the trigram ranks for the text sample
-     *
-     * @access  public
-     * @return  array    trigram ranks in the text sample
-     */
-    function &getTrigramRanks()
-    {
-        return $this->_trigram_ranks;
-    }
-    /**
-     * Return the trigram freqency table
-     *
-     * only used in testing to make sure the parser is working
-     *
-     * @access  public
-     * @return  array    trigram freqencies in the text sample
-     */
-    function &getTrigramFreqs()
-    {
-        return $this->_trigram;
-    }
-    /**
-     * returns the array of unicode blocks
-     *
-     * @access  public
-     * @return  array   unicode blocks in the text sample
-     */
-    function &getUnicodeBlocks()
-    {
-        return $this->_unicode_blocks;
-    }
-    /**
-     * Executes the parsing operation
-     * 
-     * Be sure to call the set*() functions to set options and the 
-     * prepare*() functions first to tell it what kind of data to compute
-     *
-     * Afterwards the get*() functions can be used to access the compiled
-     * information.
-     *
-     * @access public
-     */
-    function analyze()
-    {
-        $len = strlen($this->_string);
-        $byte_counter = 0;
-        // unicode startup
-        if ($this->_compile_unicode) {
-            $blocks =& $this->_read_unicode_block_db();
-            $block_count = count($blocks);
-            $skipped_count = 0;
-            $unicode_chars = array();
-        }
-        // trigram startup
-        if ($this->_compile_trigram) {
-            // initialize them as blank so the parser will skip the first two
-            // (since it skips trigrams with more than  2 contiguous spaces)
-            $a = ' ';
-            $b = ' ';
-            // kludge
-            // if it finds a valid trigram to start and the start pad option is
-            // off, then set a variable that will be used to reduce this
-            // trigram after parsing has finished
-            if (!$this->_trigram_pad_start) {
-                $a = $this->_next_char($this->_string, $byte_counter, true);
-                if ($a != ' ') {
-                    $b = $this->_next_char($this->_string, $byte_counter, true);
-                    $dropone = " $a$b";
-                }
-                $byte_counter = 0;
-                $a = ' ';
-                $b = ' ';
-            }
-        }
-        while ($byte_counter < $len) {
-            $char = $this->_next_char($this->_string, $byte_counter, true);
-            // language trigram detection
-            if ($this->_compile_trigram) {
-                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
-                    if (!isset($this->_trigram[$a . $b . $char])) {
-                       $this->_trigram[$a . $b . $char] = 1;
-                    } else {
-                       $this->_trigram[$a . $b . $char]++;
-                    }
-                }
-                $a = $b;
-                $b = $char;
-            }
-            // unicode block detection
-            if ($this->_compile_unicode) {
-                if ($this->_unicode_skip_symbols
-                        && strlen($char) == 1
-                        && ($char < 'A' || $char > 'z'
-                        || ($char > 'Z' && $char < 'a'))
-                        && $char != "'") {  // does not skip the apostrophe
-                                            // since it's included in the language
-                                            // models
-                    $skipped_count++;
-                    continue;
-                }
-                // build an array of all the characters
-                if (isset($unicode_chars[$char])) {
-                    $unicode_chars[$char]++;
-                } else {
-                    $unicode_chars[$char] = 1;
-                }
-            }
-            // todo: add byte detection here
-        }
-        // unicode cleanup
-        if ($this->_compile_unicode) {
-            foreach ($unicode_chars as $utf8_char => $count) {
-                $search_result = $this->_unicode_block_name(
-                        $this->_utf8char2unicode($utf8_char), $blocks, $block_count);
-                if ($search_result != -1) {
-                    $block_name = $search_result[2];
-                } else {
-                    $block_name = '[Malformatted]';
-                }
-                if (isset($this->_unicode_blocks[$block_name])) {
-                    $this->_unicode_blocks[$block_name] += $count;
-                } else {
-                    $this->_unicode_blocks[$block_name] = $count;
-                }
-            }
-        }
-        // trigram cleanup
-        if ($this->_compile_trigram) {
-            // pad the end
-            if ($b != ' ') {
-                if (!isset($this->_trigram["$a$b "])) {
-                    $this->_trigram["$a$b "] = 1;
-                } else {
-                    $this->_trigram["$a$b "]++;
-                }
-            }
-            // perl compatibility; Language::Guess does not pad the beginning
-            // kludge
-            if (isset($dropone)) {
-                if ($this->_trigram[$dropone] == 1) {
-                    unset($this->_trigram[$dropone]);
-                } else {
-                    $this->_trigram[$dropone]--;
-                }
-            }
-            if (!empty($this->_trigram)) {
-                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
-            } else {
-                $this->_trigram_ranks = array();
-            }
-        }
-    }
-}
-/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
-?>
author	Maryana Rozhankivska <mariroz@mr.lviv.ua>	2014-05-23 19:25:48 +0300
committer	Maryana Rozhankivska <mariroz@mr.lviv.ua>	2014-05-23 19:25:48 +0300
commit	d18ff7d9565f982bc15c5930123992d44614e1e2 (patch)
tree	c72ba88c10f9b31d513d1e82b8fa9eeb34cb4b74 /inc/3rdparty/libraries/language-detect
parent	3ec62cf95ab4436923d4c665fad7aef226cbb822 (diff)
download	wallabag-d18ff7d9565f982bc15c5930123992d44614e1e2.tar.gz wallabag-d18ff7d9565f982bc15c5930123992d44614e1e2.tar.zst wallabag-d18ff7d9565f982bc15c5930123992d44614e1e2.zip

diff --git a/inc/3rdparty/libraries/language-detect/Parser.php b/inc/3rdparty/libraries/language-detect/Parser.php
deleted file mode 100644
index 7f15fa98..00000000
--- a/inc/3rdparty/libraries/language-detect/Parser.php
+++ /dev/null
@@ -1,354 +0,0 @@
1	<?php
2
3	/**
4	* This class represents a text sample to be parsed.
5	*
6	* @category Text
7	* @package Text_LanguageDetect
8	* @author Nicholas Pisarro
9	* @copyright 2006
10	* @license BSD
11	* @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $
12	* @link http://pear.php.net/package/Text_LanguageDetect/
13	* @link http://langdetect.blogspot.com/
14	*/
15
16	/**
17	* This class represents a text sample to be parsed.
18	*
19	* This separates the analysis of a text sample from the primary LanguageDetect
20	* class. After a new profile has been built, the data can be retrieved using
21	* the accessor functions.
22	*
23	* This class is intended to be used by the Text_LanguageDetect class, not
24	* end-users.
25	*
26	* @category Text
27	* @package Text_LanguageDetect
28	* @author Nicholas Pisarro
29	* @copyright 2006
30	* @license BSD
31	* @version release: 0.2.3
32	*/
33	class Text_LanguageDetect_Parser extends Text_LanguageDetect
34	{
35	/**
36	* the piece of text being parsed
37	*
38	* @access private
39	* @var string
40	*/
41	var $_string;
42
43	/**
44	* stores the trigram frequencies of the sample
45	*
46	* @access private
47	* @var string
48	*/
49	var $_trigrams = array();
50
51	/**
52	* stores the trigram ranks of the sample
53	*
54	* @access private
55	* @var array
56	*/
57	var $_trigram_ranks = array();
58
59	/**
60	* stores the unicode blocks of the sample
61	*
62	* @access private
63	* @var array
64	*/
65	var $_unicode_blocks = array();
66
67	/**
68	* Whether the parser should compile the unicode ranges
69	*
70	* @access private
71	* @var bool
72	*/
73	var $_compile_unicode = false;
74
75	/**
76	* Whether the parser should compile trigrams
77	*
78	* @access private
79	* @var bool
80	*/
81	var $_compile_trigram = false;
82
83	/**
84	* Whether the trigram parser should pad the beginning of the string
85	*
86	* @access private
87	* @var bool
88	*/
89	var $_trigram_pad_start = false;
90
91	/**
92	* Whether the unicode parser should skip non-alphabetical ascii chars
93	*
94	* @access private
95	* @var bool
96	*/
97	var $_unicode_skip_symbols = true;
98
99	/**
100	* Constructor
101	*
102	* @access private
103	* @param string $string string to be parsed
104	*/
105	function Text_LanguageDetect_Parser($string, $db=null, $unicode_db=null) {
106	if (isset($db)) $this->_db_filename = $db;
107	if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
108	$this->_string = $string;
109	}
110
111	/**
112	* Returns true if a string is suitable for parsing
113	*
114	* @static
115	* @access public
116	* @param string $str input string to test
117	* @return bool true if acceptable, false if not
118	*/
119	function validateString($str) {
120	if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
121	return true;
122	} else {
123	return false;
124	}
125	}
126
127	/**
128	* turn on/off trigram counting
129	*
130	* @access public
131	* @param bool $bool true for on, false for off
132	*/
133	function prepareTrigram($bool = true)
134	{
135	$this->_compile_trigram = $bool;
136	}
137
138	/**
139	* turn on/off unicode block counting
140	*
141	* @access public
142	* @param bool $bool true for on, false for off
143	*/
144	function prepareUnicode($bool = true)
145	{
146	$this->_compile_unicode = $bool;
147	}
148
149	/**
150	* turn on/off padding the beginning of the sample string
151	*
152	* @access public
153	* @param bool $bool true for on, false for off
154	*/
155	function setPadStart($bool = true)
156	{
157	$this->_trigram_pad_start = $bool;
158	}
159
160	/**
161	* Should the unicode block counter skip non-alphabetical ascii chars?
162	*
163	* @access public
164	* @param bool $bool true for on, false for off
165	*/
166	function setUnicodeSkipSymbols($bool = true)
167	{
168	$this->_unicode_skip_symbols = $bool;
169	}
170
171	/**
172	* Returns the trigram ranks for the text sample
173	*
174	* @access public
175	* @return array trigram ranks in the text sample
176	*/
177	function &getTrigramRanks()
178	{
179	return $this->_trigram_ranks;
180	}
181
182	/**
183	* Return the trigram freqency table
184	*
185	* only used in testing to make sure the parser is working
186	*
187	* @access public
188	* @return array trigram freqencies in the text sample
189	*/
190	function &getTrigramFreqs()
191	{
192	return $this->_trigram;
193	}
194
195	/**
196	* returns the array of unicode blocks
197	*
198	* @access public
199	* @return array unicode blocks in the text sample
200	*/
201	function &getUnicodeBlocks()
202	{
203	return $this->_unicode_blocks;
204	}
205
206	/**
207	* Executes the parsing operation
208	*
209	* Be sure to call the set*() functions to set options and the
210	* prepare*() functions first to tell it what kind of data to compute
211	*
212	* Afterwards the get*() functions can be used to access the compiled
213	* information.
214	*
215	* @access public
216	*/
217	function analyze()
218	{
219	$len = strlen($this->_string);
220	$byte_counter = 0;
221
222
223	// unicode startup
224	if ($this->_compile_unicode) {
225	$blocks =& $this->_read_unicode_block_db();
226
227	$block_count = count($blocks);
228
229	$skipped_count = 0;
230	$unicode_chars = array();
231	}
232
233	// trigram startup
234	if ($this->_compile_trigram) {
235	// initialize them as blank so the parser will skip the first two
236	// (since it skips trigrams with more than 2 contiguous spaces)
237	$a = ' ';
238	$b = ' ';
239
240	// kludge
241	// if it finds a valid trigram to start and the start pad option is
242	// off, then set a variable that will be used to reduce this
243	// trigram after parsing has finished
244	if (!$this->_trigram_pad_start) {
245	$a = $this->_next_char($this->_string, $byte_counter, true);
246
247	if ($a != ' ') {
248	$b = $this->_next_char($this->_string, $byte_counter, true);
249	$dropone = " $a$b";
250	}
251
252	$byte_counter = 0;
253	$a = ' ';
254	$b = ' ';
255	}
256	}
257
258	while ($byte_counter < $len) {
259	$char = $this->_next_char($this->_string, $byte_counter, true);
260
261
262	// language trigram detection
263	if ($this->_compile_trigram) {
264	if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
265	if (!isset($this->_trigram[$a . $b . $char])) {
266	$this->_trigram[$a . $b . $char] = 1;
267	} else {
268	$this->_trigram[$a . $b . $char]++;
269	}
270	}
271
272	$a = $b;
273	$b = $char;
274	}
275
276	// unicode block detection
277	if ($this->_compile_unicode) {
278	if ($this->_unicode_skip_symbols
279	&& strlen($char) == 1
280	&& ($char < 'A' || $char > 'z'
281	|| ($char > 'Z' && $char < 'a'))
282	&& $char != "'") { // does not skip the apostrophe
283	// since it's included in the language
284	// models
285
286	$skipped_count++;
287	continue;
288	}
289
290	// build an array of all the characters
291	if (isset($unicode_chars[$char])) {
292	$unicode_chars[$char]++;
293	} else {
294	$unicode_chars[$char] = 1;
295	}
296	}
297
298	// todo: add byte detection here
299	}
300
301	// unicode cleanup
302	if ($this->_compile_unicode) {
303	foreach ($unicode_chars as $utf8_char => $count) {
304	$search_result = $this->_unicode_block_name(
305	$this->_utf8char2unicode($utf8_char), $blocks, $block_count);
306
307	if ($search_result != -1) {
308	$block_name = $search_result[2];
309	} else {
310	$block_name = '[Malformatted]';
311	}
312
313	if (isset($this->_unicode_blocks[$block_name])) {
314	$this->_unicode_blocks[$block_name] += $count;
315	} else {
316	$this->_unicode_blocks[$block_name] = $count;
317	}
318	}
319	}
320
321
322	// trigram cleanup
323	if ($this->_compile_trigram) {
324	// pad the end
325	if ($b != ' ') {
326	if (!isset($this->_trigram["$a$b "])) {
327	$this->_trigram["$a$b "] = 1;
328	} else {
329	$this->_trigram["$a$b "]++;
330	}
331	}
332
333	// perl compatibility; Language::Guess does not pad the beginning
334	// kludge
335	if (isset($dropone)) {
336	if ($this->_trigram[$dropone] == 1) {
337	unset($this->_trigram[$dropone]);
338	} else {
339	$this->_trigram[$dropone]--;
340	}
341	}
342
343	if (!empty($this->_trigram)) {
344	$this->_trigram_ranks = $this->_arr_rank($this->_trigram);
345	} else {
346	$this->_trigram_ranks = array();
347	}
348	}
349	}
350	}
351
352	/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
353
354	?>