inc/3rdparty/libraries/language-detect/Parser.php

   1 <?php
   2
   3 /**
   4  * This class represents a text sample to be parsed.
   5  *
   6  * @category    Text
   7  * @package     Text_LanguageDetect
   8  * @author      Nicholas Pisarro
   9  * @copyright   2006
  10  * @license     BSD
  11  * @version     CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $
  12  * @link        http://pear.php.net/package/Text_LanguageDetect/
  13  * @link        http://langdetect.blogspot.com/
  14  */
  15
  16 /**
  17  * This class represents a text sample to be parsed.
  18  *
  19  * This separates the analysis of a text sample from the primary LanguageDetect
  20  * class. After a new profile has been built, the data can be retrieved using
  21  * the accessor functions.
  22  *
  23  * This class is intended to be used by the Text_LanguageDetect class, not
  24  * end-users.
  25  *
  26  * @category    Text
  27  * @package     Text_LanguageDetect
  28  * @author      Nicholas Pisarro
  29  * @copyright   2006
  30  * @license     BSD
  31  * @version     release: 0.2.3
  32  */
  33 class Text_LanguageDetect_Parser extends Text_LanguageDetect
  34 {
  35     /**
  36      * the piece of text being parsed
  37      *
  38      * @access  private
  39      * @var     string
  40      */
  41     var $_string;
  42
  43     /**
  44      * stores the trigram frequencies of the sample
  45      *
  46      * @access  private
  47      * @var     string
  48      */
  49     var $_trigrams = array();
  50
  51     /**
  52      * stores the trigram ranks of the sample
  53      *
  54      * @access  private
  55      * @var     array
  56      */
  57     var $_trigram_ranks = array();
  58
  59     /**
  60      * stores the unicode blocks of the sample
  61      *
  62      * @access  private
  63      * @var     array
  64      */
  65     var $_unicode_blocks = array();
  66
  67     /**
  68      * Whether the parser should compile the unicode ranges
  69      *
  70      * @access  private
  71      * @var     bool
  72      */
  73     var $_compile_unicode = false;
  74
  75     /**
  76      * Whether the parser should compile trigrams
  77      *
  78      * @access  private
  79      * @var     bool
  80      */
  81     var $_compile_trigram = false;
  82
  83     /**
  84      * Whether the trigram parser should pad the beginning of the string
  85      *
  86      * @access  private
  87      * @var     bool
  88      */
  89     var $_trigram_pad_start = false;
  90
  91     /**
  92      * Whether the unicode parser should skip non-alphabetical ascii chars
  93      *
  94      * @access  private
  95      * @var     bool
  96      */
  97     var $_unicode_skip_symbols = true;
  98
  99     /**
 100      * Constructor
 101      *
 102      * @access  private
 103      * @param   string  $string     string to be parsed
 104      */
 105     function Text_LanguageDetect_Parser($string, $db=null, $unicode_db=null) {
 106                 if (isset($db)) $this->_db_filename = $db;
 107                 if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
 108         $this->_string = $string;
 109     }
 110
 111     /**
 112      * Returns true if a string is suitable for parsing
 113      *
 114      * @static
 115      * @access  public
 116      * @param   string  $str    input string to test
 117      * @return  bool            true if acceptable, false if not
 118      */
 119     function validateString($str) {
 120         if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
 121             return true;
 122         } else {
 123             return false;
 124         }
 125     }
 126
 127     /**
 128      * turn on/off trigram counting
 129      *
 130      * @access  public
 131      * @param   bool    $bool true for on, false for off
 132      */
 133     function prepareTrigram($bool = true)
 134     {
 135         $this->_compile_trigram = $bool;
 136     }
 137
 138     /**
 139      * turn on/off unicode block counting
 140      *
 141      * @access  public
 142      * @param   bool    $bool true for on, false for off
 143      */
 144     function prepareUnicode($bool = true)
 145     {
 146         $this->_compile_unicode = $bool;
 147     }
 148
 149     /**
 150      * turn on/off padding the beginning of the sample string
 151      *
 152      * @access  public
 153      * @param   bool    $bool true for on, false for off
 154      */
 155     function setPadStart($bool = true)
 156     {
 157         $this->_trigram_pad_start = $bool;
 158     }
 159
 160     /**
 161      * Should the unicode block counter skip non-alphabetical ascii chars?
 162      *
 163      * @access  public
 164      * @param   bool    $bool true for on, false for off
 165      */
 166     function setUnicodeSkipSymbols($bool = true)
 167     {
 168         $this->_unicode_skip_symbols = $bool;
 169     }
 170
 171     /**
 172      * Returns the trigram ranks for the text sample
 173      *
 174      * @access  public
 175      * @return  array    trigram ranks in the text sample
 176      */
 177     function &getTrigramRanks()
 178     {
 179         return $this->_trigram_ranks;
 180     }
 181
 182     /**
 183      * Return the trigram freqency table
 184      *
 185      * only used in testing to make sure the parser is working
 186      *
 187      * @access  public
 188      * @return  array    trigram freqencies in the text sample
 189      */
 190     function &getTrigramFreqs()
 191     {
 192         return $this->_trigram;
 193     }
 194
 195     /**
 196      * returns the array of unicode blocks
 197      *
 198      * @access  public
 199      * @return  array   unicode blocks in the text sample
 200      */
 201     function &getUnicodeBlocks()
 202     {
 203         return $this->_unicode_blocks;
 204     }
 205
 206     /**
 207      * Executes the parsing operation
 208      *
 209      * Be sure to call the set*() functions to set options and the
 210      * prepare*() functions first to tell it what kind of data to compute
 211      *
 212      * Afterwards the get*() functions can be used to access the compiled
 213      * information.
 214      *
 215      * @access public
 216      */
 217     function analyze()
 218     {
 219         $len = strlen($this->_string);
 220         $byte_counter = 0;
 221
 222
 223         // unicode startup
 224         if ($this->_compile_unicode) {
 225             $blocks =& $this->_read_unicode_block_db();
 226
 227             $block_count = count($blocks);
 228
 229             $skipped_count = 0;
 230             $unicode_chars = array();
 231         }
 232
 233         // trigram startup
 234         if ($this->_compile_trigram) {
 235             // initialize them as blank so the parser will skip the first two
 236             // (since it skips trigrams with more than  2 contiguous spaces)
 237             $a = ' ';
 238             $b = ' ';
 239
 240             // kludge
 241             // if it finds a valid trigram to start and the start pad option is
 242             // off, then set a variable that will be used to reduce this
 243             // trigram after parsing has finished
 244             if (!$this->_trigram_pad_start) {
 245                 $a = $this->_next_char($this->_string, $byte_counter, true);
 246
 247                 if ($a != ' ') {
 248                     $b = $this->_next_char($this->_string, $byte_counter, true);
 249                     $dropone = " $a$b";
 250                 }
 251
 252                 $byte_counter = 0;
 253                 $a = ' ';
 254                 $b = ' ';
 255             }
 256         }
 257
 258         while ($byte_counter < $len) {
 259             $char = $this->_next_char($this->_string, $byte_counter, true);
 260
 261
 262             // language trigram detection
 263             if ($this->_compile_trigram) {
 264                 if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
 265                     if (!isset($this->_trigram[$a . $b . $char])) {
 266                        $this->_trigram[$a . $b . $char] = 1;
 267                     } else {
 268                        $this->_trigram[$a . $b . $char]++;
 269                     }
 270                 }
 271
 272                 $a = $b;
 273                 $b = $char;
 274             }
 275
 276             // unicode block detection
 277             if ($this->_compile_unicode) {
 278                 if ($this->_unicode_skip_symbols
 279                         && strlen($char) == 1
 280                         && ($char < 'A' || $char > 'z'
 281                         || ($char > 'Z' && $char < 'a'))
 282                         && $char != "'") {  // does not skip the apostrophe
 283                                             // since it's included in the language
 284                                             // models
 285
 286                     $skipped_count++;
 287                     continue;
 288                 }
 289
 290                 // build an array of all the characters
 291                 if (isset($unicode_chars[$char])) {
 292                     $unicode_chars[$char]++;
 293                 } else {
 294                     $unicode_chars[$char] = 1;
 295                 }
 296             }
 297
 298             // todo: add byte detection here
 299         }
 300
 301         // unicode cleanup
 302         if ($this->_compile_unicode) {
 303             foreach ($unicode_chars as $utf8_char => $count) {
 304                 $search_result = $this->_unicode_block_name(
 305                         $this->_utf8char2unicode($utf8_char), $blocks, $block_count);
 306
 307                 if ($search_result != -1) {
 308                     $block_name = $search_result[2];
 309                 } else {
 310                     $block_name = '[Malformatted]';
 311                 }
 312
 313                 if (isset($this->_unicode_blocks[$block_name])) {
 314                     $this->_unicode_blocks[$block_name] += $count;
 315                 } else {
 316                     $this->_unicode_blocks[$block_name] = $count;
 317                 }
 318             }
 319         }
 320
 321
 322         // trigram cleanup
 323         if ($this->_compile_trigram) {
 324             // pad the end
 325             if ($b != ' ') {
 326                 if (!isset($this->_trigram["$a$b "])) {
 327                     $this->_trigram["$a$b "] = 1;
 328                 } else {
 329                     $this->_trigram["$a$b "]++;
 330                 }
 331             }
 332
 333             // perl compatibility; Language::Guess does not pad the beginning
 334             // kludge
 335             if (isset($dropone)) {
 336                 if ($this->_trigram[$dropone] == 1) {
 337                     unset($this->_trigram[$dropone]);
 338                 } else {
 339                     $this->_trigram[$dropone]--;
 340                 }
 341             }
 342
 343             if (!empty($this->_trigram)) {
 344                 $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
 345             } else {
 346                 $this->_trigram_ranks = array();
 347             }
 348         }
 349     }
 350 }
 351
 352 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
 353
 354 ?>