Merge pull request #712 from wallabag/dev1.7.0

1.7, call me "Premium version"
author: Nicolas Lœuillet <nicolas@loeuillet.org> 2014-05-29 18:54:06 +0200
committer: Nicolas Lœuillet <nicolas@loeuillet.org> 2014-05-29 18:54:06 +0200
commit: a9f5e572dde4f986a498d2fbe92a38a1b22f9595 (patch)
tree: 80b5bfc9836ae92cc4929a4d72ae0b2730e568bc /inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
parent: 96834a47b09985e1c82b82857fc108f20e8b8f2b (diff)
parent: 8038b38802769031e050c753fc0a388a2276629e (diff)
download: wallabag-1.7.0.tar.gz
wallabag-1.7.0.tar.zst
wallabag-1.7.0.zip
1 files changed, 347 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
new file mode 100644
index 00000000..fb0e1e20
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
@@ -0,0 +1,347 @@
+<?php
+/**
+ * This class represents a text sample to be parsed.
+ *
+ * @category    Text
+ * @package     Text_LanguageDetect
+ * @author      Nicholas Pisarro
+ * @copyright   2006
+ * @license     BSD
+ * @version     CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
+ * @link        http://pear.php.net/package/Text_LanguageDetect/
+ * @link        http://langdetect.blogspot.com/
+ */
+/**
+ * This class represents a text sample to be parsed.
+ *
+ * This separates the analysis of a text sample from the primary LanguageDetect
+ * class. After a new profile has been built, the data can be retrieved using
+ * the accessor functions.
+ *
+ * This class is intended to be used by the Text_LanguageDetect class, not 
+ * end-users.
+ *
+ * @category    Text
+ * @package     Text_LanguageDetect
+ * @author      Nicholas Pisarro
+ * @copyright   2006
+ * @license     BSD
+ * @version     release: 0.3.0
+ */
+class Text_LanguageDetect_Parser extends Text_LanguageDetect
+{
+    /**
+     * The piece of text being parsed
+     *
+     * @access  private
+     * @var     string
+     */
+    var $_string;
+    /**
+     * Legacy property name. Nothing in this class reads or writes it; the
+     * frequency table is kept in $_trigram. Retained only so that existing
+     * references to this name keep resolving.
+     *
+     * @access  private
+     * @var     array
+     */
+    var $_trigrams = array();
+    /**
+     * Stores the trigram frequencies of the sample (trigram => count)
+     *
+     * Declared explicitly because analyze() and getTrigramFreqs() use this
+     * name; without the declaration it is an undeclared dynamic property
+     * that is null until analyze() has run.
+     *
+     * @access  private
+     * @var     array
+     */
+    var $_trigram = array();
+    /**
+     * Stores the trigram ranks of the sample
+     *
+     * @access  private
+     * @var     array
+     */
+    var $_trigram_ranks = array();
+    /**
+     * Stores the unicode blocks of the sample (block name => char count)
+     *
+     * @access  private
+     * @var     array
+     */
+    var $_unicode_blocks = array();
+    /**
+     * Whether the parser should compile the unicode ranges
+     *
+     * @access  private
+     * @var     bool
+     */
+    var $_compile_unicode = false;
+    /**
+     * Whether the parser should compile trigrams
+     *
+     * @access  private
+     * @var     bool
+     */
+    var $_compile_trigram = false;
+    /**
+     * Whether the trigram parser should pad the beginning of the string
+     *
+     * @access  private
+     * @var     bool
+     */
+    var $_trigram_pad_start = false;
+    /**
+     * Whether the unicode parser should skip non-alphabetical ascii chars
+     *
+     * @access  private
+     * @var     bool
+     */
+    var $_unicode_skip_symbols = true;
+    /**
+     * Constructor
+     *
+     * Only stores the sample. The parent constructor is deliberately not
+     * called, which matches what the class-named constructor did when it
+     * was still recognised as the constructor.
+     *
+     * @access  private
+     * @param   string  $string     string to be parsed
+     */
+    function __construct($string)
+    {
+        $this->_string = $string;
+    }
+    /**
+     * PHP 4 style constructor
+     *
+     * PHP 8 no longer treats a class-named method as the constructor, so
+     * the real work lives in __construct(). This method is kept for code
+     * that invokes it by name. It is declared after __construct() on
+     * purpose, so that __construct() is the one picked as constructor.
+     *
+     * @access  private
+     * @param   string  $string     string to be parsed
+     */
+    function Text_LanguageDetect_Parser($string)
+    {
+        $this->__construct($string);
+    }
+    /**
+     * Returns true if a string is suitable for parsing
+     *
+     * A string qualifies when it is longer than three bytes and contains
+     * at least one non-whitespace character.
+     *
+     * @param   string  $str    input string to test
+     * @return  bool            true if acceptable, false if not
+     */
+    public static function validateString($str)
+    {
+        return !empty($str) && strlen($str) > 3 && preg_match('/\S/', $str) === 1;
+    }
+    /**
+     * Turn on/off trigram counting
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    function prepareTrigram($bool = true)
+    {
+        $this->_compile_trigram = $bool;
+    }
+    /**
+     * Turn on/off unicode block counting
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    function prepareUnicode($bool = true)
+    {
+        $this->_compile_unicode = $bool;
+    }
+    /**
+     * Turn on/off padding the beginning of the sample string
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    function setPadStart($bool = true)
+    {
+        $this->_trigram_pad_start = $bool;
+    }
+    /**
+     * Should the unicode block counter skip non-alphabetical ascii chars?
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    function setUnicodeSkipSymbols($bool = true)
+    {
+        $this->_unicode_skip_symbols = $bool;
+    }
+    /**
+     * Returns the trigram ranks for the text sample
+     *
+     * @access  public
+     * @return  array    trigram ranks in the text sample
+     */
+    function &getTrigramRanks()
+    {
+        return $this->_trigram_ranks;
+    }
+    /**
+     * Returns the trigram frequency table
+     *
+     * Only used in testing to make sure the parser is working. Returns an
+     * empty array when analyze() has not counted any trigram.
+     *
+     * @access  public
+     * @return  array    trigram frequencies in the text sample
+     */
+    function &getTrigramFreqs()
+    {
+        return $this->_trigram;
+    }
+    /**
+     * Returns the array of unicode blocks
+     *
+     * @access  public
+     * @return  array   unicode blocks in the text sample
+     */
+    function &getUnicodeBlocks()
+    {
+        return $this->_unicode_blocks;
+    }
+    /**
+     * Executes the parsing operation
+     *
+     * Be sure to call the set*() functions to set options and the
+     * prepare*() functions first to tell it what kind of data to compute.
+     *
+     * Afterwards the get*() functions can be used to access the compiled
+     * information. Counts are added to whatever the tables already hold;
+     * nothing is reset at the start of a run.
+     *
+     * @access public
+     */
+    function analyze()
+    {
+        $len = strlen($this->_string);
+        $byte_counter = 0;
+        // unicode startup
+        if ($this->_compile_unicode) {
+            $blocks = $this->_read_unicode_block_db();
+            $block_count = count($blocks);
+            $unicode_chars = array();
+        }
+        // trigram startup
+        if ($this->_compile_trigram) {
+            // initialize them as blank so the parser will skip the first two
+            // (since it skips trigrams with more than 2 contiguous spaces)
+            $a = ' ';
+            $b = ' ';
+            // kludge
+            // if it finds a valid trigram to start and the start pad option is
+            // off, then set a variable that will be used to reduce this
+            // trigram after parsing has finished
+            if (!$this->_trigram_pad_start) {
+                // peek at the first two characters; _next_char() presumably
+                // advances $byte_counter by reference (TODO confirm), hence
+                // the reset to 0 below before the real pass starts
+                $a = $this->_next_char($this->_string, $byte_counter, true);
+                if ($a != ' ') {
+                    $b = $this->_next_char($this->_string, $byte_counter, true);
+                    $dropone = " $a$b";
+                }
+                $byte_counter = 0;
+                $a = ' ';
+                $b = ' ';
+            }
+        }
+        while ($byte_counter < $len) {
+            $char = $this->_next_char($this->_string, $byte_counter, true);
+            // language trigram detection
+            if ($this->_compile_trigram) {
+                // skip windows whose middle char is a space next to another space
+                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
+                    $trigram = $a . $b . $char;
+                    if (!isset($this->_trigram[$trigram])) {
+                        $this->_trigram[$trigram] = 1;
+                    } else {
+                        $this->_trigram[$trigram]++;
+                    }
+                }
+                // slide the window forward by one character
+                $a = $b;
+                $b = $char;
+            }
+            // unicode block detection
+            if ($this->_compile_unicode) {
+                // single-byte chars outside A-Z / a-z are not counted; the
+                // apostrophe is exempt since it's included in the language
+                // models
+                if ($this->_unicode_skip_symbols
+                        && strlen($char) == 1
+                        && ($char < 'A' || $char > 'z'
+                        || ($char > 'Z' && $char < 'a'))
+                        && $char != "'") {
+                    continue;
+                }
+                // build an array of all the characters
+                if (isset($unicode_chars[$char])) {
+                    $unicode_chars[$char]++;
+                } else {
+                    $unicode_chars[$char] = 1;
+                }
+            }
+            // todo: add byte detection here
+        }
+        // unicode cleanup: fold the per-character counts into per-block counts
+        if ($this->_compile_unicode) {
+            foreach ($unicode_chars as $utf8_char => $count) {
+                $search_result = $this->_unicode_block_name(
+                        $this->_utf8char2unicode($utf8_char), $blocks, $block_count);
+                if ($search_result != -1) {
+                    $block_name = $search_result[2];
+                } else {
+                    $block_name = '[Malformatted]';
+                }
+                if (isset($this->_unicode_blocks[$block_name])) {
+                    $this->_unicode_blocks[$block_name] += $count;
+                } else {
+                    $this->_unicode_blocks[$block_name] = $count;
+                }
+            }
+        }
+        // trigram cleanup
+        if ($this->_compile_trigram) {
+            // pad the end
+            if ($b != ' ') {
+                if (!isset($this->_trigram["$a$b "])) {
+                    $this->_trigram["$a$b "] = 1;
+                } else {
+                    $this->_trigram["$a$b "]++;
+                }
+            }
+            // perl compatibility; Language::Guess does not pad the beginning
+            // kludge
+            // only touch the start trigram if the pass above really counted
+            // it; otherwise (e.g. an empty sample) the lookup would raise an
+            // undefined-index notice and leave a null entry in the table
+            if (isset($dropone, $this->_trigram[$dropone])) {
+                if ($this->_trigram[$dropone] == 1) {
+                    unset($this->_trigram[$dropone]);
+                } else {
+                    $this->_trigram[$dropone]--;
+                }
+            }
+            if (!empty($this->_trigram)) {
+                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
+            } else {
+                $this->_trigram_ranks = array();
+            }
+        }
+    }
+}
+/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
+\ No newline at end of file
author	Nicolas Lœuillet <nicolas@loeuillet.org>	2014-05-29 18:54:06 +0200
committer	Nicolas Lœuillet <nicolas@loeuillet.org>	2014-05-29 18:54:06 +0200
commit	a9f5e572dde4f986a498d2fbe92a38a1b22f9595 (patch)
tree	80b5bfc9836ae92cc4929a4d72ae0b2730e568bc /inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
parent	96834a47b09985e1c82b82857fc108f20e8b8f2b (diff)
parent	8038b38802769031e050c753fc0a388a2276629e (diff)
download	wallabag-1.7.0.tar.gz wallabag-1.7.0.tar.zst wallabag-1.7.0.zip

diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php new file mode 100644 index 00000000..fb0e1e20 --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
@@ -0,0 +1,347 @@
	1	<?php
	2
	3	/**
	4	* This class represents a text sample to be parsed.
	5	*
	6	* @category Text
	7	* @package Text_LanguageDetect
	8	* @author Nicholas Pisarro
	9	* @copyright 2006
	10	* @license BSD
	11	* @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
	12	* @link http://pear.php.net/package/Text_LanguageDetect/
	13	* @link http://langdetect.blogspot.com/
	14	*/
	15
	16	/**
	17	* This class represents a text sample to be parsed.
	18	*
	19	* This separates the analysis of a text sample from the primary LanguageDetect
	20	* class. After a new profile has been built, the data can be retrieved using
	21	* the accessor functions.
	22	*
	23	* This class is intended to be used by the Text_LanguageDetect class, not
	24	* end-users.
	25	*
	26	* @category Text
	27	* @package Text_LanguageDetect
	28	* @author Nicholas Pisarro
	29	* @copyright 2006
	30	* @license BSD
	31	* @version release: 0.3.0
	32	*/
	33	class Text_LanguageDetect_Parser extends Text_LanguageDetect
	34	{
	35	/**
	36	* the piece of text being parsed
	37	*
	38	* @access private
	39	* @var string
	40	*/
	41	var $_string;
	42
	43	/**
	44	* stores the trigram frequencies of the sample
	45	*
	46	* @access private
	47	* @var array
	48	*/
	49	var $_trigrams = array();
	50
	51	/**
	52	* stores the trigram ranks of the sample
	53	*
	54	* @access private
	55	* @var array
	56	*/
	57	var $_trigram_ranks = array();
	58
	59	/**
	60	* stores the unicode blocks of the sample
	61	*
	62	* @access private
	63	* @var array
	64	*/
	65	var $_unicode_blocks = array();
	66
	67	/**
	68	* Whether the parser should compile the unicode ranges
	69	*
	70	* @access private
	71	* @var bool
	72	*/
	73	var $_compile_unicode = false;
	74
	75	/**
	76	* Whether the parser should compile trigrams
	77	*
	78	* @access private
	79	* @var bool
	80	*/
	81	var $_compile_trigram = false;
	82
	83	/**
	84	* Whether the trigram parser should pad the beginning of the string
	85	*
	86	* @access private
	87	* @var bool
	88	*/
	89	var $_trigram_pad_start = false;
	90
	91	/**
	92	* Whether the unicode parser should skip non-alphabetical ascii chars
	93	*
	94	* @access private
	95	* @var bool
	96	*/
	97	var $_unicode_skip_symbols = true;
	98
	99	/**
	100	* Constructor
	101	*
	102	* @access private
	103	* @param string $string string to be parsed
	104	*/
	105	function Text_LanguageDetect_Parser($string) {
	106	$this->_string = $string;
	107	}
	108
	109	/**
	110	* Returns true if a string is suitable for parsing
	111	*
	112	* @param string $str input string to test
	113	* @return bool true if acceptable, false if not
	114	*/
	115	public static function validateString($str) {
	116	if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
	117	return true;
	118	} else {
	119	return false;
	120	}
	121	}
	122
	123	/**
	124	* turn on/off trigram counting
	125	*
	126	* @access public
	127	* @param bool $bool true for on, false for off
	128	*/
	129	function prepareTrigram($bool = true)
	130	{
	131	$this->_compile_trigram = $bool;
	132	}
	133
	134	/**
	135	* turn on/off unicode block counting
	136	*
	137	* @access public
	138	* @param bool $bool true for on, false for off
	139	*/
	140	function prepareUnicode($bool = true)
	141	{
	142	$this->_compile_unicode = $bool;
	143	}
	144
	145	/**
	146	* turn on/off padding the beginning of the sample string
	147	*
	148	* @access public
	149	* @param bool $bool true for on, false for off
	150	*/
	151	function setPadStart($bool = true)
	152	{
	153	$this->_trigram_pad_start = $bool;
	154	}
	155
	156	/**
	157	* Should the unicode block counter skip non-alphabetical ascii chars?
	158	*
	159	* @access public
	160	* @param bool $bool true for on, false for off
	161	*/
	162	function setUnicodeSkipSymbols($bool = true)
	163	{
	164	$this->_unicode_skip_symbols = $bool;
	165	}
	166
	167	/**
	168	* Returns the trigram ranks for the text sample
	169	*
	170	* @access public
	171	* @return array trigram ranks in the text sample
	172	*/
	173	function &getTrigramRanks()
	174	{
	175	return $this->_trigram_ranks;
	176	}
	177
	178	/**
	179	* Return the trigram frequency table
	180	*
	181	* only used in testing to make sure the parser is working
	182	*
	183	* @access public
	184	* @return array trigram frequencies in the text sample
	185	*/
	186	function &getTrigramFreqs()
	187	{
	188	return $this->_trigram;
	189	}
	190
	191	/**
	192	* returns the array of unicode blocks
	193	*
	194	* @access public
	195	* @return array unicode blocks in the text sample
	196	*/
	197	function &getUnicodeBlocks()
	198	{
	199	return $this->_unicode_blocks;
	200	}
	201
	202	/**
	203	* Executes the parsing operation
	204	*
	205	* Be sure to call the set*() functions to set options and the
	206	* prepare*() functions first to tell it what kind of data to compute
	207	*
	208	* Afterwards the get*() functions can be used to access the compiled
	209	* information.
	210	*
	211	* @access public
	212	*/
	213	function analyze()
	214	{
	215	$len = strlen($this->_string);
	216	$byte_counter = 0;
	217
	218
	219	// unicode startup
	220	if ($this->_compile_unicode) {
	221	$blocks = $this->_read_unicode_block_db();
	222	$block_count = count($blocks);
	223
	224	$skipped_count = 0;
	225	$unicode_chars = array();
	226	}
	227
	228	// trigram startup
	229	if ($this->_compile_trigram) {
	230	// initialize them as blank so the parser will skip the first two
	231	// (since it skips trigrams with more than 2 contiguous spaces)
	232	$a = ' ';
	233	$b = ' ';
	234
	235	// kludge
	236	// if it finds a valid trigram to start and the start pad option is
	237	// off, then set a variable that will be used to reduce this
	238	// trigram after parsing has finished
	239	if (!$this->_trigram_pad_start) {
	240	$a = $this->_next_char($this->_string, $byte_counter, true);
	241
	242	if ($a != ' ') {
	243	$b = $this->_next_char($this->_string, $byte_counter, true);
	244	$dropone = " $a$b";
	245	}
	246
	247	$byte_counter = 0;
	248	$a = ' ';
	249	$b = ' ';
	250	}
	251	}
	252
	253	while ($byte_counter < $len) {
	254	$char = $this->_next_char($this->_string, $byte_counter, true);
	255
	256
	257	// language trigram detection
	258	if ($this->_compile_trigram) {
	259	if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
	260	if (!isset($this->_trigram[$a . $b . $char])) {
	261	$this->_trigram[$a . $b . $char] = 1;
	262	} else {
	263	$this->_trigram[$a . $b . $char]++;
	264	}
	265	}
	266
	267	$a = $b;
	268	$b = $char;
	269	}
	270
	271	// unicode block detection
	272	if ($this->_compile_unicode) {
	273	if ($this->_unicode_skip_symbols
	274	&& strlen($char) == 1
	275	&& ($char < 'A' || $char > 'z'
	276	|| ($char > 'Z' && $char < 'a'))
	277	&& $char != "'") { // does not skip the apostrophe
	278	// since it's included in the language
	279	// models
	280
	281	$skipped_count++;
	282	continue;
	283	}
	284
	285	// build an array of all the characters
	286	if (isset($unicode_chars[$char])) {
	287	$unicode_chars[$char]++;
	288	} else {
	289	$unicode_chars[$char] = 1;
	290	}
	291	}
	292
	293	// todo: add byte detection here
	294	}
	295
	296	// unicode cleanup
	297	if ($this->_compile_unicode) {
	298	foreach ($unicode_chars as $utf8_char => $count) {
	299	$search_result = $this->_unicode_block_name(
	300	$this->_utf8char2unicode($utf8_char), $blocks, $block_count);
	301
	302	if ($search_result != -1) {
	303	$block_name = $search_result[2];
	304	} else {
	305	$block_name = '[Malformatted]';
	306	}
	307
	308	if (isset($this->_unicode_blocks[$block_name])) {
	309	$this->_unicode_blocks[$block_name] += $count;
	310	} else {
	311	$this->_unicode_blocks[$block_name] = $count;
	312	}
	313	}
	314	}
	315
	316
	317	// trigram cleanup
	318	if ($this->_compile_trigram) {
	319	// pad the end
	320	if ($b != ' ') {
	321	if (!isset($this->_trigram["$a$b "])) {
	322	$this->_trigram["$a$b "] = 1;
	323	} else {
	324	$this->_trigram["$a$b "]++;
	325	}
	326	}
	327
	328	// perl compatibility; Language::Guess does not pad the beginning
	329	// kludge
	330	if (isset($dropone)) {
	331	if ($this->_trigram[$dropone] == 1) {
	332	unset($this->_trigram[$dropone]);
	333	} else {
	334	$this->_trigram[$dropone]--;
	335	}
	336	}
	337
	338	if (!empty($this->_trigram)) {
	339	$this->_trigram_ranks = $this->_arr_rank($this->_trigram);
	340	} else {
	341	$this->_trigram_ranks = array();
	342	}
	343	}
	344	}
	345	}
	346
	347	/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file