last 3 important changes to 3.2 version of full-text-rss, issue #694

author: Maryana Rozhankivska <mariroz@mr.lviv.ua> 2014-05-23 19:27:17 +0300
committer: Maryana Rozhankivska <mariroz@mr.lviv.ua> 2014-05-23 19:27:17 +0300
commit: a50583fb97615f4c26cc84ee95d62f867a84b4e6 (patch)
tree: 09f43e6a4d2d93e90daa86bbcc69ea1afcf524e6 /inc
parent: d18ff7d9565f982bc15c5930123992d44614e1e2 (diff)
download: wallabag-a50583fb97615f4c26cc84ee95d62f867a84b4e6.tar.gz
wallabag-a50583fb97615f4c26cc84ee95d62f867a84b4e6.tar.zst
wallabag-a50583fb97615f4c26cc84ee95d62f867a84b4e6.zip
3 files changed, 743 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
new file mode 100644
index 00000000..196d994f
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
@@ -0,0 +1,57 @@
+<?php
+class Text_LanguageDetect_Exception extends Exception
+{
+    /**
+     * Database file could not be found
+     */
+    const DB_NOT_FOUND = 10;
+    /**
+     * Database file found, but not readable
+     */
+    const DB_NOT_READABLE = 11;
+    /**
+     * Database file is empty
+     */
+    const DB_EMPTY = 12;
+    /**
+     * Database contents is not a PHP array
+     */
+    const DB_NOT_ARRAY = 13;
+    /**
+     * Magic quotes are activated
+     */
+    const MAGIC_QUOTES = 14;
+    /**
+     * Parameter of invalid type passed to method
+     */
+    const PARAM_TYPE = 20;
+    /**
+     * Character in parameter is invalid
+     */
+    const INVALID_CHAR = 21;
+    /**
+     * Language is not in the database
+     */
+    const UNKNOWN_LANGUAGE = 30;
+    /**
+     * Error during block detection
+     */
+    const BLOCK_DETECTION = 40;
+    /**
+     * Error while clustering languages
+     */
+    const NO_HIGHEST_KEY = 50;
+}
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
new file mode 100644
index 00000000..05b0590d
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
@@ -0,0 +1,339 @@
+<?php
+/**
+ * Part of Text_LanguageDetect
+ *
+ * PHP version 5
+ *
+ * @category  Text
+ * @package   Text_LanguageDetect
+ * @author    Christian Weiske <cweiske@php.net>
+ * @copyright 2011 Christian Weiske <cweiske@php.net>
+ * @license   http://www.debian.org/misc/bsd.license BSD
+ * @version   SVN: $Id$
+ * @link      http://pear.php.net/package/Text_LanguageDetect/
+ */
+/**
+ * Provides a mapping between the languages from lang.dat and the
+ * ISO 639-1 and ISO-639-2 codes.
+ *
+ * Note that this class contains only languages that exist in lang.dat.
+ *
+ * @category  Text
+ * @package   Text_LanguageDetect
+ * @author    Christian Weiske <cweiske@php.net>
+ * @copyright 2011 Christian Weiske <cweiske@php.net>
+ * @license   http://www.debian.org/misc/bsd.license BSD
+ * @link      http://www.loc.gov/standards/iso639-2/php/code_list.php
+ */
+class Text_LanguageDetect_ISO639
+{
+    /**
+     * Maps all language names from the language database to the
+     * ISO 639-1 2-letter language code.
+     *
+     * NULL indicates that there is no 2-letter code.
+     *
+     * @var array
+     */
+    public static $nameToCode2 = array(
+        'albanian'   => 'sq',
+        'arabic'     => 'ar',
+        'azeri'      => 'az',
+        'bengali'    => 'bn',
+        'bulgarian'  => 'bg',
+        'cebuano'    => null,
+        'croatian'   => 'hr',
+        'czech'      => 'cs',
+        'danish'     => 'da',
+        'dutch'      => 'nl',
+        'english'    => 'en',
+        'estonian'   => 'et',
+        'farsi'      => 'fa',
+        'finnish'    => 'fi',
+        'french'     => 'fr',
+        'german'     => 'de',
+        'hausa'      => 'ha',
+        'hawaiian'   => null,
+        'hindi'      => 'hi',
+        'hungarian'  => 'hu',
+        'icelandic'  => 'is',
+        'indonesian' => 'id',
+        'italian'    => 'it',
+        'kazakh'     => 'kk',
+        'kyrgyz'     => 'ky',
+        'latin'      => 'la',
+        'latvian'    => 'lv',
+        'lithuanian' => 'lt',
+        'macedonian' => 'mk',
+        'mongolian'  => 'mn',
+        'nepali'     => 'ne',
+        'norwegian'  => 'no',
+        'pashto'     => 'ps',
+        'pidgin'     => null,
+        'polish'     => 'pl',
+        'portuguese' => 'pt',
+        'romanian'   => 'ro',
+        'russian'    => 'ru',
+        'serbian'    => 'sr',
+        'slovak'     => 'sk',
+        'slovene'    => 'sl',
+        'somali'     => 'so',
+        'spanish'    => 'es',
+        'swahili'    => 'sw',
+        'swedish'    => 'sv',
+        'tagalog'    => 'tl',
+        'turkish'    => 'tr',
+        'ukrainian'  => 'uk',
+        'urdu'       => 'ur',
+        'uzbek'      => 'uz',
+        'vietnamese' => 'vi',
+        'welsh'      => 'cy',
+    );
+    /**
+     * Maps all language names from the language database to the
+     * ISO 639-2 3-letter language code.
+     *
+     * @var array
+     */
+    public static $nameToCode3 = array(
+        'albanian'   => 'sqi',
+        'arabic'     => 'ara',
+        'azeri'      => 'aze',
+        'bengali'    => 'ben',
+        'bulgarian'  => 'bul',
+        'cebuano'    => 'ceb',
+        'croatian'   => 'hrv',
+        'czech'      => 'ces',
+        'danish'     => 'dan',
+        'dutch'      => 'nld',
+        'english'    => 'eng',
+        'estonian'   => 'est',
+        'farsi'      => 'fas',
+        'finnish'    => 'fin',
+        'french'     => 'fra',
+        'german'     => 'deu',
+        'hausa'      => 'hau',
+        'hawaiian'   => 'haw',
+        'hindi'      => 'hin',
+        'hungarian'  => 'hun',
+        'icelandic'  => 'isl',
+        'indonesian' => 'ind',
+        'italian'    => 'ita',
+        'kazakh'     => 'kaz',
+        'kyrgyz'     => 'kir',
+        'latin'      => 'lat',
+        'latvian'    => 'lav',
+        'lithuanian' => 'lit',
+        'macedonian' => 'mkd',
+        'mongolian'  => 'mon',
+        'nepali'     => 'nep',
+        'norwegian'  => 'nor',
+        'pashto'     => 'pus',
+        'pidgin'     => 'crp',
+        'polish'     => 'pol',
+        'portuguese' => 'por',
+        'romanian'   => 'ron',
+        'russian'    => 'rus',
+        'serbian'    => 'srp',
+        'slovak'     => 'slk',
+        'slovene'    => 'slv',
+        'somali'     => 'som',
+        'spanish'    => 'spa',
+        'swahili'    => 'swa',
+        'swedish'    => 'swe',
+        'tagalog'    => 'tgl',
+        'turkish'    => 'tur',
+        'ukrainian'  => 'ukr',
+        'urdu'       => 'urd',
+        'uzbek'      => 'uzb',
+        'vietnamese' => 'vie',
+        'welsh'      => 'cym',
+    );
+    /**
+     * Maps ISO 639-1 2-letter language codes to the language names
+     * in the language database
+     *
+     * Not all languages have a 2 letter code, so some are missing
+     *
+     * @var array
+     */
+    public static $code2ToName = array(
+        'ar' => 'arabic',
+        'az' => 'azeri',
+        'bg' => 'bulgarian',
+        'bn' => 'bengali',
+        'cs' => 'czech',
+        'cy' => 'welsh',
+        'da' => 'danish',
+        'de' => 'german',
+        'en' => 'english',
+        'es' => 'spanish',
+        'et' => 'estonian',
+        'fa' => 'farsi',
+        'fi' => 'finnish',
+        'fr' => 'french',
+        'ha' => 'hausa',
+        'hi' => 'hindi',
+        'hr' => 'croatian',
+        'hu' => 'hungarian',
+        'id' => 'indonesian',
+        'is' => 'icelandic',
+        'it' => 'italian',
+        'kk' => 'kazakh',
+        'ky' => 'kyrgyz',
+        'la' => 'latin',
+        'lt' => 'lithuanian',
+        'lv' => 'latvian',
+        'mk' => 'macedonian',
+        'mn' => 'mongolian',
+        'ne' => 'nepali',
+        'nl' => 'dutch',
+        'no' => 'norwegian',
+        'pl' => 'polish',
+        'ps' => 'pashto',
+        'pt' => 'portuguese',
+        'ro' => 'romanian',
+        'ru' => 'russian',
+        'sk' => 'slovak',
+        'sl' => 'slovene',
+        'so' => 'somali',
+        'sq' => 'albanian',
+        'sr' => 'serbian',
+        'sv' => 'swedish',
+        'sw' => 'swahili',
+        'tl' => 'tagalog',
+        'tr' => 'turkish',
+        'uk' => 'ukrainian',
+        'ur' => 'urdu',
+        'uz' => 'uzbek',
+        'vi' => 'vietnamese',
+    );
+    /**
+     * Maps ISO 639-2/T 3-letter language codes to the language names
+     * in the language database (the inverse of $nameToCode3).
+     *
+     * @var array
+     */
+    public static $code3ToName = array(
+        'ara' => 'arabic',
+        'aze' => 'azeri',
+        'ben' => 'bengali',
+        'bul' => 'bulgarian',
+        'ceb' => 'cebuano',
+        'ces' => 'czech',
+        'crp' => 'pidgin',
+        'cym' => 'welsh',
+        'dan' => 'danish',
+        'deu' => 'german',
+        'eng' => 'english',
+        'est' => 'estonian',
+        'fas' => 'farsi',
+        'fin' => 'finnish',
+        'fra' => 'french',
+        'hau' => 'hausa',
+        'haw' => 'hawaiian',
+        'hin' => 'hindi',
+        'hrv' => 'croatian',
+        'hun' => 'hungarian',
+        'ind' => 'indonesian',
+        'isl' => 'icelandic',
+        'ita' => 'italian',
+        'kaz' => 'kazakh',
+        'kir' => 'kyrgyz',
+        'lat' => 'latin',
+        'lav' => 'latvian',
+        'lit' => 'lithuanian',
+        'mkd' => 'macedonian',
+        'mon' => 'mongolian',
+        'nep' => 'nepali',
+        'nld' => 'dutch',
+        'nor' => 'norwegian',
+        'pol' => 'polish',
+        'por' => 'portuguese',
+        'pus' => 'pashto',
+        'ron' => 'romanian',
+        'rus' => 'russian',
+        'slk' => 'slovak',
+        'slv' => 'slovene',
+        'som' => 'somali',
+        'spa' => 'spanish',
+        'sqi' => 'albanian',
+        'srp' => 'serbian',
+        'swa' => 'swahili',
+        'swe' => 'swedish',
+        'tgl' => 'tagalog',
+        'tur' => 'turkish',
+        'ukr' => 'ukrainian',
+        'urd' => 'urdu',
+        'uzb' => 'uzbek',
+        'vie' => 'vietnamese',
+    );
+    /**
+     * Returns the 2-letter ISO 639-1 code for the given language name.
+     *
+     * @param string $lang English language name like "swedish"
+     *
+     * @return string Two-letter language code (e.g. "sv") or NULL if not found
+     */
+    public static function nameToCode2($lang)
+    {
+        $lang = strtolower($lang);
+        if (!isset(self::$nameToCode2[$lang])) {
+            return null;
+        }
+        return self::$nameToCode2[$lang];
+    }
+    /**
+     * Returns the 3-letter ISO 639-2 code for the given language name.
+     *
+     * @param string $lang English language name like "swedish"
+     *
+     * @return string Three-letter language code (e.g. "swe") or NULL if not found
+     */
+    public static function nameToCode3($lang)
+    {
+        $lang = strtolower($lang);
+        if (!isset(self::$nameToCode3[$lang])) {
+            return null;
+        }
+        return self::$nameToCode3[$lang];
+    }
+    /**
+     * Returns the language name for the given 2-letter ISO 639-1 code.
+     *
+     * @param string $code Two-letter language code (e.g. "sv"), any letter case
+     *
+     * @return string English language name like "swedish" or NULL if not found
+     */
+    public static function code2ToName($code)
+    {
+        $code = strtolower($code); // table keys are lowercase
+        if (!isset(self::$code2ToName[$code])) {
+            return null;
+        }
+        return self::$code2ToName[$code];
+    }
+    /**
+     * Returns the language name for the given 3-letter ISO 639-2 code.
+     *
+     * @param string $code Three-letter language code (e.g. "swe"), any letter case
+     *
+     * @return string English language name like "swedish" or NULL if not found
+     */
+    public static function code3ToName($code)
+    {
+        $code = strtolower($code); // table keys are lowercase
+        if (!isset(self::$code3ToName[$code])) {
+            return null;
+        }
+        return self::$code3ToName[$code];
+    }
+}
+\ No newline at end of file
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
new file mode 100644
index 00000000..fb0e1e20
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
@@ -0,0 +1,347 @@
+<?php
+/**
+ * This class represents a text sample to be parsed.
+ *
+ * @category    Text
+ * @package     Text_LanguageDetect
+ * @author      Nicholas Pisarro
+ * @copyright   2006
+ * @license     BSD
+ * @version     CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
+ * @link        http://pear.php.net/package/Text_LanguageDetect/
+ * @link        http://langdetect.blogspot.com/
+ */
+/**
+ * This class represents a text sample to be parsed.
+ *
+ * This separates the analysis of a text sample from the primary LanguageDetect
+ * class. After a new profile has been built, the data can be retrieved using
+ * the accessor functions.
+ *
+ * This class is intended to be used by the Text_LanguageDetect class, not 
+ * end-users.
+ *
+ * @category    Text
+ * @package     Text_LanguageDetect
+ * @author      Nicholas Pisarro
+ * @copyright   2006
+ * @license     BSD
+ * @version     release: 0.3.0
+ */
+class Text_LanguageDetect_Parser extends Text_LanguageDetect
+{
+    /**
+     * the piece of text being parsed
+     *
+     * @access  private
+     * @var     string
+     */
+    var $_string;
+    /**
+     * stores the trigram frequencies of the sample (trigram => count)
+     *
+     * @access  private
+     * @var     array
+     */
+    var $_trigram = array();
+    /**
+     * stores the trigram ranks of the sample
+     *
+     * @access  private
+     * @var     array
+     */
+    var $_trigram_ranks = array();
+    /**
+     * stores the unicode blocks of the sample
+     *
+     * @access  private
+     * @var     array
+     */
+    var $_unicode_blocks = array();
+    
+    /**
+     * Whether the parser should compile the unicode ranges
+     * 
+     * @access  private
+     * @var     bool
+     */
+    var $_compile_unicode = false;
+    /**
+     * Whether the parser should compile trigrams
+     *
+     * @access  private
+     * @var     bool
+     */
+    var $_compile_trigram = false;
+    /**
+     * Whether the trigram parser should pad the beginning of the string
+     *
+     * @access  private
+     * @var     bool
+     */
+    var $_trigram_pad_start = false;
+    /**
+     * Whether the unicode parser should skip non-alphabetical ascii chars
+     *
+     * @access  private
+     * @var     bool
+     */
+    var $_unicode_skip_symbols = true;
+    /**
+     * Constructor; stores the text sample that analyze() will parse
+     *
+     * @access  private
+     * @param   string  $string     string to be parsed
+     */
+    function __construct($string) {
+        $this->_string = $string;
+    }
+    /**
+     * Returns true if a string is suitable for parsing
+     *
+     * @param   string  $str    input string to test
+     * @return  bool            true if acceptable, false if not
+     */
+    public static function validateString($str) {
+        if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
+            return true;
+        } else {
+            return false;
+        }
+    }
+    /**
+     * turn on/off trigram counting
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    function prepareTrigram($bool = true)
+    {
+        $this->_compile_trigram = $bool;
+    }
+    /**
+     * turn on/off unicode block counting
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    function prepareUnicode($bool = true)
+    {
+        $this->_compile_unicode = $bool;
+    }
+    /**
+     * turn on/off padding the beginning of the sample string
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    function setPadStart($bool = true)
+    {
+        $this->_trigram_pad_start = $bool;
+    }
+    /**
+     * Should the unicode block counter skip non-alphabetical ascii chars?
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    function setUnicodeSkipSymbols($bool = true)
+    {
+        $this->_unicode_skip_symbols = $bool;
+    }
+    /**
+     * Returns the trigram ranks for the text sample
+     *
+     * @access  public
+     * @return  array    trigram ranks in the text sample
+     */
+    function &getTrigramRanks()
+    {
+        return $this->_trigram_ranks;
+    }
+    /**
+     * Return the trigram frequency table
+     *
+     * only used in testing to make sure the parser is working
+     *
+     * @access  public
+     * @return  array    trigram frequencies in the text sample
+     */
+    function &getTrigramFreqs()
+    {
+        return $this->_trigram;
+    }
+    /**
+     * returns the array of unicode blocks
+     *
+     * @access  public
+     * @return  array   unicode blocks in the text sample
+     */
+    function &getUnicodeBlocks()
+    {
+        return $this->_unicode_blocks;
+    }
+    /**
+     * Executes the parsing operation
+     *
+     * Be sure to call the set*() functions to set options and the
+     * prepare*() functions first to tell it what kind of data to compute
+     *
+     * Afterwards the get*() functions can be used to access the compiled
+     * information.
+     *
+     * @access public
+     */
+    function analyze()
+    {
+        $len = strlen($this->_string);
+        $byte_counter = 0;
+        // unicode startup
+        if ($this->_compile_unicode) {
+            $blocks = $this->_read_unicode_block_db();
+            $block_count = count($blocks);
+            $skipped_count = 0;
+            $unicode_chars = array();
+        }
+        // trigram startup
+        if ($this->_compile_trigram) {
+            // initialize them as blank so the parser will skip the first two
+            // (since it skips trigrams with more than  2 contiguous spaces)
+            $a = ' ';
+            $b = ' ';
+            // kludge
+            // if it finds a valid trigram to start and the start pad option is
+            // off, then set a variable that will be used to reduce this
+            // trigram after parsing has finished
+            if (!$this->_trigram_pad_start) {
+                $a = $this->_next_char($this->_string, $byte_counter, true);
+                if ($a != ' ') {
+                    $b = $this->_next_char($this->_string, $byte_counter, true);
+                    $dropone = " $a$b";
+                }
+                // rewind: the peek above must not consume any input
+                $byte_counter = 0;
+                $a = ' ';
+                $b = ' ';
+            }
+        }
+        while ($byte_counter < $len) {
+            $char = $this->_next_char($this->_string, $byte_counter, true);
+            // language trigram detection
+            if ($this->_compile_trigram) {
+                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
+                    if (!isset($this->_trigram[$a . $b . $char])) {
+                        $this->_trigram[$a . $b . $char] = 1;
+                    } else {
+                        $this->_trigram[$a . $b . $char]++;
+                    }
+                }
+                $a = $b;
+                $b = $char;
+            }
+            // unicode block detection
+            if ($this->_compile_unicode) {
+                if ($this->_unicode_skip_symbols
+                        && strlen($char) == 1
+                        && ($char < 'A' || $char > 'z'
+                        || ($char > 'Z' && $char < 'a'))
+                        && $char != "'") {  // does not skip the apostrophe
+                                            // since it's included in the language
+                                            // models
+                    $skipped_count++;
+                    continue;
+                }
+                // build an array of all the characters
+                if (isset($unicode_chars[$char])) {
+                    $unicode_chars[$char]++;
+                } else {
+                    $unicode_chars[$char] = 1;
+                }
+            }
+            // todo: add byte detection here
+        }
+        // unicode cleanup
+        if ($this->_compile_unicode) {
+            foreach ($unicode_chars as $utf8_char => $count) {
+                $search_result = $this->_unicode_block_name(
+                        $this->_utf8char2unicode($utf8_char), $blocks, $block_count);
+                if ($search_result != -1) {
+                    $block_name = $search_result[2];
+                } else {
+                    $block_name = '[Malformatted]';
+                }
+                if (isset($this->_unicode_blocks[$block_name])) {
+                    $this->_unicode_blocks[$block_name] += $count;
+                } else {
+                    $this->_unicode_blocks[$block_name] = $count;
+                }
+            }
+        }
+        // trigram cleanup
+        if ($this->_compile_trigram) {
+            // pad the end
+            if ($b != ' ') {
+                if (!isset($this->_trigram["$a$b "])) {
+                    $this->_trigram["$a$b "] = 1;
+                } else {
+                    $this->_trigram["$a$b "]++;
+                }
+            }
+            // perl compatibility; Language::Guess does not pad the beginning
+            // kludge: only undo the start-padded trigram if it was really counted
+            if (isset($dropone) && isset($this->_trigram[$dropone])) {
+                if ($this->_trigram[$dropone] == 1) {
+                    unset($this->_trigram[$dropone]);
+                } else {
+                    $this->_trigram[$dropone]--;
+                }
+            }
+            if (!empty($this->_trigram)) {
+                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
+            } else {
+                $this->_trigram_ranks = array();
+            }
+        }
+    }
+}
+/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
+\ No newline at end of file
author	Maryana Rozhankivska <mariroz@mr.lviv.ua>	2014-05-23 19:27:17 +0300
committer	Maryana Rozhankivska <mariroz@mr.lviv.ua>	2014-05-23 19:27:17 +0300
commit	a50583fb97615f4c26cc84ee95d62f867a84b4e6 (patch)
tree	09f43e6a4d2d93e90daa86bbcc69ea1afcf524e6 /inc
parent	d18ff7d9565f982bc15c5930123992d44614e1e2 (diff)
download	wallabag-a50583fb97615f4c26cc84ee95d62f867a84b4e6.tar.gz wallabag-a50583fb97615f4c26cc84ee95d62f867a84b4e6.tar.zst wallabag-a50583fb97615f4c26cc84ee95d62f867a84b4e6.zip

diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php new file mode 100644 index 00000000..196d994f --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
@@ -0,0 +1,57 @@
	1	<?php
	2	class Text_LanguageDetect_Exception extends Exception
	3	{
	4	/**
	5	* Database file could not be found
	6	*/
	7	const DB_NOT_FOUND = 10;
	8
	9	/**
	10	* Database file found, but not readable
	11	*/
	12	const DB_NOT_READABLE = 11;
	13
	14	/**
	15	* Database file is empty
	16	*/
	17	const DB_EMPTY = 12;
	18
	19	/**
	20	* Database contents is not a PHP array
	21	*/
	22	const DB_NOT_ARRAY = 13;
	23
	24	/**
	25	* Magic quotes are activated
	26	*/
	27	const MAGIC_QUOTES = 14;
	28
	29
	30	/**
	31	* Parameter of invalid type passed to method
	32	*/
	33	const PARAM_TYPE = 20;
	34
	35	/**
	36	* Character in parameter is invalid
	37	*/
	38	const INVALID_CHAR = 21;
	39
	40
	41	/**
	42	* Language is not in the database
	43	*/
	44	const UNKNOWN_LANGUAGE = 30;
	45
	46
	47	/**
	48	* Error during block detection
	49	*/
	50	const BLOCK_DETECTION = 40;
	51
	52
	53	/**
	54	* Error while clustering languages
	55	*/
	56	const NO_HIGHEST_KEY = 50;
	57	}


diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php new file mode 100644 index 00000000..05b0590d --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
@@ -0,0 +1,339 @@
	1	<?php
	2	/**
	3	* Part of Text_LanguageDetect
	4	*
	5	* PHP version 5
	6	*
	7	* @category Text
	8	* @package Text_LanguageDetect
	9	* @author Christian Weiske <cweiske@php.net>
	10	* @copyright 2011 Christian Weiske <cweiske@php.net>
	11	* @license http://www.debian.org/misc/bsd.license BSD
	12	* @version SVN: $Id$
	13	* @link http://pear.php.net/package/Text_LanguageDetect/
	14	*/
	15
	16	/**
	17	* Provides a mapping between the languages from lang.dat and the
	18	* ISO 639-1 and ISO-639-2 codes.
	19	*
	20	* Note that this class contains only languages that exist in lang.dat.
	21	*
	22	* @category Text
	23	* @package Text_LanguageDetect
	24	* @author Christian Weiske <cweiske@php.net>
	25	* @copyright 2011 Christian Weiske <cweiske@php.net>
	26	* @license http://www.debian.org/misc/bsd.license BSD
	27	* @link http://www.loc.gov/standards/iso639-2/php/code_list.php
	28	*/
	29	class Text_LanguageDetect_ISO639
	30	{
	31	/**
	32	* Maps all language names from the language database to the
	33	* ISO 639-1 2-letter language code.
	34	*
	35	* NULL indicates that there is no 2-letter code.
	36	*
	37	* @var array
	38	*/
	39	public static $nameToCode2 = array(
	40	'albanian' => 'sq',
	41	'arabic' => 'ar',
	42	'azeri' => 'az',
	43	'bengali' => 'bn',
	44	'bulgarian' => 'bg',
	45	'cebuano' => null,
	46	'croatian' => 'hr',
	47	'czech' => 'cs',
	48	'danish' => 'da',
	49	'dutch' => 'nl',
	50	'english' => 'en',
	51	'estonian' => 'et',
	52	'farsi' => 'fa',
	53	'finnish' => 'fi',
	54	'french' => 'fr',
	55	'german' => 'de',
	56	'hausa' => 'ha',
	57	'hawaiian' => null,
	58	'hindi' => 'hi',
	59	'hungarian' => 'hu',
	60	'icelandic' => 'is',
	61	'indonesian' => 'id',
	62	'italian' => 'it',
	63	'kazakh' => 'kk',
	64	'kyrgyz' => 'ky',
	65	'latin' => 'la',
	66	'latvian' => 'lv',
	67	'lithuanian' => 'lt',
	68	'macedonian' => 'mk',
	69	'mongolian' => 'mn',
	70	'nepali' => 'ne',
	71	'norwegian' => 'no',
	72	'pashto' => 'ps',
	73	'pidgin' => null,
	74	'polish' => 'pl',
	75	'portuguese' => 'pt',
	76	'romanian' => 'ro',
	77	'russian' => 'ru',
	78	'serbian' => 'sr',
	79	'slovak' => 'sk',
	80	'slovene' => 'sl',
	81	'somali' => 'so',
	82	'spanish' => 'es',
	83	'swahili' => 'sw',
	84	'swedish' => 'sv',
	85	'tagalog' => 'tl',
	86	'turkish' => 'tr',
	87	'ukrainian' => 'uk',
	88	'urdu' => 'ur',
	89	'uzbek' => 'uz',
	90	'vietnamese' => 'vi',
	91	'welsh' => 'cy',
	92	);
	93
	94	/**
	95	* Maps all language names from the language database to the
	96	* ISO 639-2 3-letter language code.
	97	*
	98	* @var array
	99	*/
	100	public static $nameToCode3 = array(
	101	'albanian' => 'sqi',
	102	'arabic' => 'ara',
	103	'azeri' => 'aze',
	104	'bengali' => 'ben',
	105	'bulgarian' => 'bul',
	106	'cebuano' => 'ceb',
	107	'croatian' => 'hrv',
	108	'czech' => 'ces',
	109	'danish' => 'dan',
	110	'dutch' => 'nld',
	111	'english' => 'eng',
	112	'estonian' => 'est',
	113	'farsi' => 'fas',
	114	'finnish' => 'fin',
	115	'french' => 'fra',
	116	'german' => 'deu',
	117	'hausa' => 'hau',
	118	'hawaiian' => 'haw',
	119	'hindi' => 'hin',
	120	'hungarian' => 'hun',
	121	'icelandic' => 'isl',
	122	'indonesian' => 'ind',
	123	'italian' => 'ita',
	124	'kazakh' => 'kaz',
	125	'kyrgyz' => 'kir',
	126	'latin' => 'lat',
	127	'latvian' => 'lav',
	128	'lithuanian' => 'lit',
	129	'macedonian' => 'mkd',
	130	'mongolian' => 'mon',
	131	'nepali' => 'nep',
	132	'norwegian' => 'nor',
	133	'pashto' => 'pus',
	134	'pidgin' => 'crp',
	135	'polish' => 'pol',
	136	'portuguese' => 'por',
	137	'romanian' => 'ron',
	138	'russian' => 'rus',
	139	'serbian' => 'srp',
	140	'slovak' => 'slk',
	141	'slovene' => 'slv',
	142	'somali' => 'som',
	143	'spanish' => 'spa',
	144	'swahili' => 'swa',
	145	'swedish' => 'swe',
	146	'tagalog' => 'tgl',
	147	'turkish' => 'tur',
	148	'ukrainian' => 'ukr',
	149	'urdu' => 'urd',
	150	'uzbek' => 'uzb',
	151	'vietnamese' => 'vie',
	152	'welsh' => 'cym',
	153	);
	154
	155	/**
	156	* Maps ISO 639-1 2-letter language codes to the language names
	157	* in the language database
	158	*
	159	* Not all languages have a 2 letter code, so some are missing
	160	*
	161	* @var array
	162	*/
	163	public static $code2ToName = array(
	164	'ar' => 'arabic',
	165	'az' => 'azeri',
	166	'bg' => 'bulgarian',
	167	'bn' => 'bengali',
	168	'cs' => 'czech',
	169	'cy' => 'welsh',
	170	'da' => 'danish',
	171	'de' => 'german',
	172	'en' => 'english',
	173	'es' => 'spanish',
	174	'et' => 'estonian',
	175	'fa' => 'farsi',
	176	'fi' => 'finnish',
	177	'fr' => 'french',
	178	'ha' => 'hausa',
	179	'hi' => 'hindi',
	180	'hr' => 'croatian',
	181	'hu' => 'hungarian',
	182	'id' => 'indonesian',
	183	'is' => 'icelandic',
	184	'it' => 'italian',
	185	'kk' => 'kazakh',
	186	'ky' => 'kyrgyz',
	187	'la' => 'latin',
	188	'lt' => 'lithuanian',
	189	'lv' => 'latvian',
	190	'mk' => 'macedonian',
	191	'mn' => 'mongolian',
	192	'ne' => 'nepali',
	193	'nl' => 'dutch',
	194	'no' => 'norwegian',
	195	'pl' => 'polish',
	196	'ps' => 'pashto',
	197	'pt' => 'portuguese',
	198	'ro' => 'romanian',
	199	'ru' => 'russian',
	200	'sk' => 'slovak',
	201	'sl' => 'slovene',
	202	'so' => 'somali',
	203	'sq' => 'albanian',
	204	'sr' => 'serbian',
	205	'sv' => 'swedish',
	206	'sw' => 'swahili',
	207	'tl' => 'tagalog',
	208	'tr' => 'turkish',
	209	'uk' => 'ukrainian',
	210	'ur' => 'urdu',
	211	'uz' => 'uzbek',
	212	'vi' => 'vietnamese',
	213	);
	214
	215	/**
	216	* Maps ISO 639-2/T 3-letter language codes to the language names
	217	* in the language database (the inverse of $nameToCode3).
	218	*
	219	* @var array
	220	*/
	221	public static $code3ToName = array(
	222	'ara' => 'arabic',
	223	'aze' => 'azeri',
	224	'ben' => 'bengali',
	225	'bul' => 'bulgarian',
	226	'ceb' => 'cebuano',
	227	'ces' => 'czech',
	228	'crp' => 'pidgin',
	229	'cym' => 'welsh',
	230	'dan' => 'danish',
	231	'deu' => 'german',
	232	'eng' => 'english',
	233	'est' => 'estonian',
	234	'fas' => 'farsi',
	235	'fin' => 'finnish',
	236	'fra' => 'french',
	237	'hau' => 'hausa',
	238	'haw' => 'hawaiian',
	239	'hin' => 'hindi',
	240	'hrv' => 'croatian',
	241	'hun' => 'hungarian',
	242	'ind' => 'indonesian',
	243	'isl' => 'icelandic',
	244	'ita' => 'italian',
	245	'kaz' => 'kazakh',
	246	'kir' => 'kyrgyz',
	247	'lat' => 'latin',
	248	'lav' => 'latvian',
	249	'lit' => 'lithuanian',
	250	'mkd' => 'macedonian',
	251	'mon' => 'mongolian',
	252	'nep' => 'nepali',
	253	'nld' => 'dutch',
	254	'nor' => 'norwegian',
	255	'pol' => 'polish',
	256	'por' => 'portuguese',
	257	'pus' => 'pashto',
	258	'ron' => 'romanian',
	259	'rus' => 'russian',
	260	'slk' => 'slovak',
	261	'slv' => 'slovene',
	262	'som' => 'somali',
	263	'spa' => 'spanish',
	264	'sqi' => 'albanian',
	265	'srp' => 'serbian',
	266	'swa' => 'swahili',
	267	'swe' => 'swedish',
	268	'tgl' => 'tagalog',
	269	'tur' => 'turkish',
	270	'ukr' => 'ukrainian',
	271	'urd' => 'urdu',
	272	'uzb' => 'uzbek',
	273	'vie' => 'vietnamese',
	274	);
	275
	276	/**
	277	* Returns the 2-letter ISO 639-1 code for the given language name.
	278	*
	279	* @param string $lang English language name like "swedish"
	280	*
	281	* @return string Two-letter language code (e.g. "sv") or NULL if not found
	282	*/
	283	public static function nameToCode2($lang)
	284	{
	285	$lang = strtolower($lang);
	286	if (!isset(self::$nameToCode2[$lang])) {
	287	return null;
	288	}
	289	return self::$nameToCode2[$lang];
	290	}
	291
	292	/**
	293	* Returns the 3-letter ISO 639-2 code for the given language name.
	294	*
	295	* @param string $lang English language name like "swedish"
	296	*
	297	* @return string Three-letter language code (e.g. "swe") or NULL if not found
	298	*/
	299	public static function nameToCode3($lang)
	300	{
	301	$lang = strtolower($lang);
	302	if (!isset(self::$nameToCode3[$lang])) {
	303	return null;
	304	}
	305	return self::$nameToCode3[$lang];
	306	}
	307
	308	/**
	309	* Returns the language name for the given 2-letter ISO 639-1 code.
	310	*
	311	* @param string $code Two-letter language code (e.g. "sv"), any letter case
	312	*
	313	* @return string English language name like "swedish" or NULL if not found
	314	*/
	315	public static function code2ToName($code)
	316	{
	317	$code = strtolower($code); // table keys are lowercase
	318	if (!isset(self::$code2ToName[$code])) {
	319	return null;
	320	}
	321	return self::$code2ToName[$code];
	322	}
	323
	324	/**
	325	* Returns the language name for the given 3-letter ISO 639-2 code.
	326	*
	327	* @param string $code Three-letter language code (e.g. "swe"), any letter case
	328	*
	329	* @return string English language name like "swedish" or NULL if not found
	330	*/
	331	public static function code3ToName($code)
	332	{
	333	$code = strtolower($code); // table keys are lowercase
	334	if (!isset(self::$code3ToName[$code])) {
	335	return null;
	336	}
	337	return self::$code3ToName[$code];
	338	}
	339	} \ No newline at end of file


diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php new file mode 100644 index 00000000..fb0e1e20 --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
@@ -0,0 +1,347 @@
	1	<?php
	2
	3	/**
	4	* This class represents a text sample to be parsed.
	5	*
	6	* @category Text
	7	* @package Text_LanguageDetect
	8	* @author Nicholas Pisarro
	9	* @copyright 2006
	10	* @license BSD
	11	* @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
	12	* @link http://pear.php.net/package/Text_LanguageDetect/
	13	* @link http://langdetect.blogspot.com/
	14	*/
	15
	16	/**
	17	* This class represents a text sample to be parsed.
	18	*
	19	* This separates the analysis of a text sample from the primary LanguageDetect
	20	* class. After a new profile has been built, the data can be retrieved using
	21	* the accessor functions.
	22	*
	23	* This class is intended to be used by the Text_LanguageDetect class, not
	24	* end-users.
	25	*
	26	* @category Text
	27	* @package Text_LanguageDetect
	28	* @author Nicholas Pisarro
	29	* @copyright 2006
	30	* @license BSD
	31	* @version release: 0.3.0
	32	*/
	class Text_LanguageDetect_Parser extends Text_LanguageDetect
	{
		/**
		 * The piece of text being parsed
		 *
		 * @access private
		 * @var string
		 */
		public $_string;

		/**
		 * Stores the trigram frequencies of the sample (trigram => count)
		 *
		 * Declared under the name that analyze() and getTrigramFreqs() actually
		 * read and write, so the property exists before analyze() has run.
		 *
		 * @access private
		 * @var array
		 */
		public $_trigram = array();

		/**
		 * Stores the trigram ranks of the sample
		 *
		 * @access private
		 * @var array
		 */
		public $_trigram_ranks = array();

		/**
		 * Stores the unicode blocks of the sample (block name => char count)
		 *
		 * @access private
		 * @var array
		 */
		public $_unicode_blocks = array();

		/**
		 * Whether the parser should compile the unicode ranges
		 *
		 * @access private
		 * @var bool
		 */
		public $_compile_unicode = false;

		/**
		 * Whether the parser should compile trigrams
		 *
		 * @access private
		 * @var bool
		 */
		public $_compile_trigram = false;

		/**
		 * Whether the trigram parser should pad the beginning of the string
		 *
		 * @access private
		 * @var bool
		 */
		public $_trigram_pad_start = false;

		/**
		 * Whether the unicode parser should skip non-alphabetical ascii chars
		 *
		 * @access private
		 * @var bool
		 */
		public $_unicode_skip_symbols = true;

		/**
		 * Constructor
		 *
		 * Only stores the sample; the parent constructor is intentionally not
		 * invoked, exactly as in the original class-named constructor.
		 *
		 * @param string $string string to be parsed
		 */
		public function __construct($string)
		{
			$this->_string = $string;
		}

		/**
		 * Legacy (PHP 4 style) constructor name, kept for backward compatibility
		 *
		 * Methods named after their class are no longer treated as constructors
		 * by current PHP versions, so the real work lives in __construct() and
		 * this method merely delegates for any code that still calls it by name.
		 *
		 * @access private
		 * @param string $string string to be parsed
		 */
		public function Text_LanguageDetect_Parser($string)
		{
			$this->__construct($string);
		}

		/**
		 * Returns true if a string is suitable for parsing
		 *
		 * A string qualifies when it is non-empty, longer than 3 bytes and
		 * contains at least one non-whitespace character.
		 *
		 * @param string $str input string to test
		 * @return bool true if acceptable, false if not
		 */
		public static function validateString($str)
		{
			return !empty($str) && strlen($str) > 3 && preg_match('/\S/', $str);
		}

		/**
		 * Turn on/off trigram counting
		 *
		 * @access public
		 * @param bool $bool true for on, false for off
		 */
		public function prepareTrigram($bool = true)
		{
			$this->_compile_trigram = $bool;
		}

		/**
		 * Turn on/off unicode block counting
		 *
		 * @access public
		 * @param bool $bool true for on, false for off
		 */
		public function prepareUnicode($bool = true)
		{
			$this->_compile_unicode = $bool;
		}

		/**
		 * Turn on/off padding the beginning of the sample string
		 *
		 * @access public
		 * @param bool $bool true for on, false for off
		 */
		public function setPadStart($bool = true)
		{
			$this->_trigram_pad_start = $bool;
		}

		/**
		 * Should the unicode block counter skip non-alphabetical ascii chars?
		 *
		 * @access public
		 * @param bool $bool true for on, false for off
		 */
		public function setUnicodeSkipSymbols($bool = true)
		{
			$this->_unicode_skip_symbols = $bool;
		}

		/**
		 * Returns the trigram ranks for the text sample
		 *
		 * @access public
		 * @return array trigram ranks in the text sample
		 */
		public function &getTrigramRanks()
		{
			return $this->_trigram_ranks;
		}

		/**
		 * Return the trigram frequency table
		 *
		 * Only used in testing to make sure the parser is working
		 *
		 * @access public
		 * @return array trigram frequencies in the text sample
		 */
		public function &getTrigramFreqs()
		{
			return $this->_trigram;
		}

		/**
		 * Returns the array of unicode blocks
		 *
		 * @access public
		 * @return array unicode blocks in the text sample
		 */
		public function &getUnicodeBlocks()
		{
			return $this->_unicode_blocks;
		}

		/**
		 * Executes the parsing operation
		 *
		 * Be sure to call the set*() functions to set options and the
		 * prepare*() functions first to tell it what kind of data to compute
		 *
		 * Afterwards the get*() functions can be used to access the compiled
		 * information.
		 *
		 * @access public
		 */
		public function analyze()
		{
			$len = strlen($this->_string);
			$byte_counter = 0;

			// unicode startup
			if ($this->_compile_unicode) {
				$blocks = $this->_read_unicode_block_db();
				$block_count = count($blocks);

				$unicode_chars = array();
			}

			// trigram startup
			// $dropone stays null unless the "no start padding" kludge below
			// records a leading trigram that has to be reduced after parsing
			$dropone = null;

			if ($this->_compile_trigram) {
				// initialize them as blank so the parser will skip the first two
				// (since it skips trigrams with more than 2 contiguous spaces)
				$a = ' ';
				$b = ' ';

				// kludge
				// if it finds a valid trigram to start and the start pad option is
				// off, then set a variable that will be used to reduce this
				// trigram after parsing has finished
				if (!$this->_trigram_pad_start) {
					$a = $this->_next_char($this->_string, $byte_counter, true);

					if ($a != ' ') {
						$b = $this->_next_char($this->_string, $byte_counter, true);
						$dropone = " $a$b";
					}

					// rewind so the main loop starts from the beginning again
					// NOTE(review): presumably _next_char() advances
					// $byte_counter by reference - confirm in the parent class
					$byte_counter = 0;
					$a = ' ';
					$b = ' ';
				}
			}

			while ($byte_counter < $len) {
				$char = $this->_next_char($this->_string, $byte_counter, true);

				// language trigram detection
				if ($this->_compile_trigram) {
					// skip windows that are blank in the middle and on either side
					if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
						if (!isset($this->_trigram[$a . $b . $char])) {
							$this->_trigram[$a . $b . $char] = 1;
						} else {
							$this->_trigram[$a . $b . $char]++;
						}
					}

					// slide the window forward by one character
					$a = $b;
					$b = $char;
				}

				// unicode block detection
				if ($this->_compile_unicode) {
					if ($this->_unicode_skip_symbols
						&& strlen($char) == 1
						&& ($char < 'A' || $char > 'z'
						|| ($char > 'Z' && $char < 'a'))
						&& $char != "'"
					) {
						// does not skip the apostrophe
						// since it's included in the language
						// models
						continue;
					}

					// build an array of all the characters
					if (isset($unicode_chars[$char])) {
						$unicode_chars[$char]++;
					} else {
						$unicode_chars[$char] = 1;
					}
				}

				// todo: add byte detection here
			}

			// unicode cleanup
			if ($this->_compile_unicode) {
				foreach ($unicode_chars as $utf8_char => $count) {
					$search_result = $this->_unicode_block_name(
						$this->_utf8char2unicode($utf8_char), $blocks, $block_count
					);

					// -1 signals that no block matched this character
					if ($search_result != -1) {
						$block_name = $search_result[2];
					} else {
						$block_name = '[Malformatted]';
					}

					if (isset($this->_unicode_blocks[$block_name])) {
						$this->_unicode_blocks[$block_name] += $count;
					} else {
						$this->_unicode_blocks[$block_name] = $count;
					}
				}
			}

			// trigram cleanup
			if ($this->_compile_trigram) {
				// pad the end
				if ($b != ' ') {
					if (!isset($this->_trigram["$a$b "])) {
						$this->_trigram["$a$b "] = 1;
					} else {
						$this->_trigram["$a$b "]++;
					}
				}

				// perl compatibility; Language::Guess does not pad the beginning
				// kludge
				// guarded with isset() so that a sample too short to have
				// produced the leading trigram cannot trigger an undefined index
				if ($dropone !== null && isset($this->_trigram[$dropone])) {
					if ($this->_trigram[$dropone] == 1) {
						unset($this->_trigram[$dropone]);
					} else {
						$this->_trigram[$dropone]--;
					}
				}

				if (!empty($this->_trigram)) {
					$this->_trigram_ranks = $this->_arr_rank($this->_trigram);
				} else {
					$this->_trigram_ranks = array();
				}
			}
		}
	}
	346
	347	/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file