inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php

   1 <?php
   2
   3 /**
   4  * Our in-house implementation of a parser.
   5  *
   6  * A pure PHP parser, DirectLex has absolutely no dependencies, making
   7  * it a reasonably good default for PHP4.  Written with efficiency in mind,
   8  * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
   9  * pales in comparison to HTMLPurifier_Lexer_DOMLex.
  10  *
  11  * @todo Reread XML spec and document differences.
  12  */
  13 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  14 {
  15     /**
  16      * @type bool
  17      */
  18     public $tracksLineNumbers = true;
  19
  20     /**
  21      * Whitespace characters for str(c)spn.
  22      * @type string
  23      */
  24     protected $_whitespace = "\x20\x09\x0D\x0A";
  25
  26     /**
  27      * Callback function for script CDATA fudge
  28      * @param array $matches, in form of array(opening tag, contents, closing tag)
  29      * @return string
  30      */
  31     protected function scriptCallback($matches)
  32     {
  33         return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
  34     }
  35
  36     /**
  37      * @param String $html
  38      * @param HTMLPurifier_Config $config
  39      * @param HTMLPurifier_Context $context
  40      * @return array|HTMLPurifier_Token[]
  41      */
  42     public function tokenizeHTML($html, $config, $context)
  43     {
  44         // special normalization for script tags without any armor
  45         // our "armor" heurstic is a < sign any number of whitespaces after
  46         // the first script tag
  47         if ($config->get('HTML.Trusted')) {
  48             $html = preg_replace_callback(
  49                 '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
  50                 array($this, 'scriptCallback'),
  51                 $html
  52             );
  53         }
  54
  55         $html = $this->normalize($html, $config, $context);
  56
  57         $cursor = 0; // our location in the text
  58         $inside_tag = false; // whether or not we're parsing the inside of a tag
  59         $array = array(); // result array
  60
  61         // This is also treated to mean maintain *column* numbers too
  62         $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
  63
  64         if ($maintain_line_numbers === null) {
  65             // automatically determine line numbering by checking
  66             // if error collection is on
  67             $maintain_line_numbers = $config->get('Core.CollectErrors');
  68         }
  69
  70         if ($maintain_line_numbers) {
  71             $current_line = 1;
  72             $current_col = 0;
  73             $length = strlen($html);
  74         } else {
  75             $current_line = false;
  76             $current_col = false;
  77             $length = false;
  78         }
  79         $context->register('CurrentLine', $current_line);
  80         $context->register('CurrentCol', $current_col);
  81         $nl = "\n";
  82         // how often to manually recalculate. This will ALWAYS be right,
  83         // but it's pretty wasteful. Set to 0 to turn off
  84         $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
  85
  86         $e = false;
  87         if ($config->get('Core.CollectErrors')) {
  88             $e =& $context->get('ErrorCollector');
  89         }
  90
  91         // for testing synchronization
  92         $loops = 0;
  93
  94         while (++$loops) {
  95             // $cursor is either at the start of a token, or inside of
  96             // a tag (i.e. there was a < immediately before it), as indicated
  97             // by $inside_tag
  98
  99             if ($maintain_line_numbers) {
 100                 // $rcursor, however, is always at the start of a token.
 101                 $rcursor = $cursor - (int)$inside_tag;
 102
 103                 // Column number is cheap, so we calculate it every round.
 104                 // We're interested at the *end* of the newline string, so
 105                 // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
 106                 // from our "rcursor" position.
 107                 $nl_pos = strrpos($html, $nl, $rcursor - $length);
 108                 $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
 109
 110                 // recalculate lines
 111                 if ($synchronize_interval && // synchronization is on
 112                     $cursor > 0 && // cursor is further than zero
 113                     $loops % $synchronize_interval === 0) { // time to synchronize!
 114                     $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
 115                 }
 116             }
 117
 118             $position_next_lt = strpos($html, '<', $cursor);
 119             $position_next_gt = strpos($html, '>', $cursor);
 120
 121             // triggers on "<b>asdf</b>" but not "asdf <b></b>"
 122             // special case to set up context
 123             if ($position_next_lt === $cursor) {
 124                 $inside_tag = true;
 125                 $cursor++;
 126             }
 127
 128             if (!$inside_tag && $position_next_lt !== false) {
 129                 // We are not inside tag and there still is another tag to parse
 130                 $token = new
 131                 HTMLPurifier_Token_Text(
 132                     $this->parseData(
 133                         substr(
 134                             $html,
 135                             $cursor,
 136                             $position_next_lt - $cursor
 137                         )
 138                     )
 139                 );
 140                 if ($maintain_line_numbers) {
 141                     $token->rawPosition($current_line, $current_col);
 142                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
 143                 }
 144                 $array[] = $token;
 145                 $cursor = $position_next_lt + 1;
 146                 $inside_tag = true;
 147                 continue;
 148             } elseif (!$inside_tag) {
 149                 // We are not inside tag but there are no more tags
 150                 // If we're already at the end, break
 151                 if ($cursor === strlen($html)) {
 152                     break;
 153                 }
 154                 // Create Text of rest of string
 155                 $token = new
 156                 HTMLPurifier_Token_Text(
 157                     $this->parseData(
 158                         substr(
 159                             $html,
 160                             $cursor
 161                         )
 162                     )
 163                 );
 164                 if ($maintain_line_numbers) {
 165                     $token->rawPosition($current_line, $current_col);
 166                 }
 167                 $array[] = $token;
 168                 break;
 169             } elseif ($inside_tag && $position_next_gt !== false) {
 170                 // We are in tag and it is well formed
 171                 // Grab the internals of the tag
 172                 $strlen_segment = $position_next_gt - $cursor;
 173
 174                 if ($strlen_segment < 1) {
 175                     // there's nothing to process!
 176                     $token = new HTMLPurifier_Token_Text('<');
 177                     $cursor++;
 178                     continue;
 179                 }
 180
 181                 $segment = substr($html, $cursor, $strlen_segment);
 182
 183                 if ($segment === false) {
 184                     // somehow, we attempted to access beyond the end of
 185                     // the string, defense-in-depth, reported by Nate Abele
 186                     break;
 187                 }
 188
 189                 // Check if it's a comment
 190                 if (substr($segment, 0, 3) === '!--') {
 191                     // re-determine segment length, looking for -->
 192                     $position_comment_end = strpos($html, '-->', $cursor);
 193                     if ($position_comment_end === false) {
 194                         // uh oh, we have a comment that extends to
 195                         // infinity. Can't be helped: set comment
 196                         // end position to end of string
 197                         if ($e) {
 198                             $e->send(E_WARNING, 'Lexer: Unclosed comment');
 199                         }
 200                         $position_comment_end = strlen($html);
 201                         $end = true;
 202                     } else {
 203                         $end = false;
 204                     }
 205                     $strlen_segment = $position_comment_end - $cursor;
 206                     $segment = substr($html, $cursor, $strlen_segment);
 207                     $token = new
 208                     HTMLPurifier_Token_Comment(
 209                         substr(
 210                             $segment,
 211                             3,
 212                             $strlen_segment - 3
 213                         )
 214                     );
 215                     if ($maintain_line_numbers) {
 216                         $token->rawPosition($current_line, $current_col);
 217                         $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
 218                     }
 219                     $array[] = $token;
 220                     $cursor = $end ? $position_comment_end : $position_comment_end + 3;
 221                     $inside_tag = false;
 222                     continue;
 223                 }
 224
 225                 // Check if it's an end tag
 226                 $is_end_tag = (strpos($segment, '/') === 0);
 227                 if ($is_end_tag) {
 228                     $type = substr($segment, 1);
 229                     $token = new HTMLPurifier_Token_End($type);
 230                     if ($maintain_line_numbers) {
 231                         $token->rawPosition($current_line, $current_col);
 232                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 233                     }
 234                     $array[] = $token;
 235                     $inside_tag = false;
 236                     $cursor = $position_next_gt + 1;
 237                     continue;
 238                 }
 239
 240                 // Check leading character is alnum, if not, we may
 241                 // have accidently grabbed an emoticon. Translate into
 242                 // text and go our merry way
 243                 if (!ctype_alpha($segment[0])) {
 244                     // XML:  $segment[0] !== '_' && $segment[0] !== ':'
 245                     if ($e) {
 246                         $e->send(E_NOTICE, 'Lexer: Unescaped lt');
 247                     }
 248                     $token = new HTMLPurifier_Token_Text('<');
 249                     if ($maintain_line_numbers) {
 250                         $token->rawPosition($current_line, $current_col);
 251                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 252                     }
 253                     $array[] = $token;
 254                     $inside_tag = false;
 255                     continue;
 256                 }
 257
 258                 // Check if it is explicitly self closing, if so, remove
 259                 // trailing slash. Remember, we could have a tag like <br>, so
 260                 // any later token processing scripts must convert improperly
 261                 // classified EmptyTags from StartTags.
 262                 $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
 263                 if ($is_self_closing) {
 264                     $strlen_segment--;
 265                     $segment = substr($segment, 0, $strlen_segment);
 266                 }
 267
 268                 // Check if there are any attributes
 269                 $position_first_space = strcspn($segment, $this->_whitespace);
 270
 271                 if ($position_first_space >= $strlen_segment) {
 272                     if ($is_self_closing) {
 273                         $token = new HTMLPurifier_Token_Empty($segment);
 274                     } else {
 275                         $token = new HTMLPurifier_Token_Start($segment);
 276                     }
 277                     if ($maintain_line_numbers) {
 278                         $token->rawPosition($current_line, $current_col);
 279                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 280                     }
 281                     $array[] = $token;
 282                     $inside_tag = false;
 283                     $cursor = $position_next_gt + 1;
 284                     continue;
 285                 }
 286
 287                 // Grab out all the data
 288                 $type = substr($segment, 0, $position_first_space);
 289                 $attribute_string =
 290                     trim(
 291                         substr(
 292                             $segment,
 293                             $position_first_space
 294                         )
 295                     );
 296                 if ($attribute_string) {
 297                     $attr = $this->parseAttributeString(
 298                         $attribute_string,
 299                         $config,
 300                         $context
 301                     );
 302                 } else {
 303                     $attr = array();
 304                 }
 305
 306                 if ($is_self_closing) {
 307                     $token = new HTMLPurifier_Token_Empty($type, $attr);
 308                 } else {
 309                     $token = new HTMLPurifier_Token_Start($type, $attr);
 310                 }
 311                 if ($maintain_line_numbers) {
 312                     $token->rawPosition($current_line, $current_col);
 313                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 314                 }
 315                 $array[] = $token;
 316                 $cursor = $position_next_gt + 1;
 317                 $inside_tag = false;
 318                 continue;
 319             } else {
 320                 // inside tag, but there's no ending > sign
 321                 if ($e) {
 322                     $e->send(E_WARNING, 'Lexer: Missing gt');
 323                 }
 324                 $token = new
 325                 HTMLPurifier_Token_Text(
 326                     '<' .
 327                     $this->parseData(
 328                         substr($html, $cursor)
 329                     )
 330                 );
 331                 if ($maintain_line_numbers) {
 332                     $token->rawPosition($current_line, $current_col);
 333                 }
 334                 // no cursor scroll? Hmm...
 335                 $array[] = $token;
 336                 break;
 337             }
 338             break;
 339         }
 340
 341         $context->destroy('CurrentLine');
 342         $context->destroy('CurrentCol');
 343         return $array;
 344     }
 345
 346     /**
 347      * PHP 5.0.x compatible substr_count that implements offset and length
 348      * @param string $haystack
 349      * @param string $needle
 350      * @param int $offset
 351      * @param int $length
 352      * @return int
 353      */
 354     protected function substrCount($haystack, $needle, $offset, $length)
 355     {
 356         static $oldVersion;
 357         if ($oldVersion === null) {
 358             $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
 359         }
 360         if ($oldVersion) {
 361             $haystack = substr($haystack, $offset, $length);
 362             return substr_count($haystack, $needle);
 363         } else {
 364             return substr_count($haystack, $needle, $offset, $length);
 365         }
 366     }
 367
 368     /**
 369      * Takes the inside of an HTML tag and makes an assoc array of attributes.
 370      *
 371      * @param string $string Inside of tag excluding name.
 372      * @param HTMLPurifier_Config $config
 373      * @param HTMLPurifier_Context $context
 374      * @return array Assoc array of attributes.
 375      */
 376     public function parseAttributeString($string, $config, $context)
 377     {
 378         $string = (string)$string; // quick typecast
 379
 380         if ($string == '') {
 381             return array();
 382         } // no attributes
 383
 384         $e = false;
 385         if ($config->get('Core.CollectErrors')) {
 386             $e =& $context->get('ErrorCollector');
 387         }
 388
 389         // let's see if we can abort as quickly as possible
 390         // one equal sign, no spaces => one attribute
 391         $num_equal = substr_count($string, '=');
 392         $has_space = strpos($string, ' ');
 393         if ($num_equal === 0 && !$has_space) {
 394             // bool attribute
 395             return array($string => $string);
 396         } elseif ($num_equal === 1 && !$has_space) {
 397             // only one attribute
 398             list($key, $quoted_value) = explode('=', $string);
 399             $quoted_value = trim($quoted_value);
 400             if (!$key) {
 401                 if ($e) {
 402                     $e->send(E_ERROR, 'Lexer: Missing attribute key');
 403                 }
 404                 return array();
 405             }
 406             if (!$quoted_value) {
 407                 return array($key => '');
 408             }
 409             $first_char = @$quoted_value[0];
 410             $last_char = @$quoted_value[strlen($quoted_value) - 1];
 411
 412             $same_quote = ($first_char == $last_char);
 413             $open_quote = ($first_char == '"' || $first_char == "'");
 414
 415             if ($same_quote && $open_quote) {
 416                 // well behaved
 417                 $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
 418             } else {
 419                 // not well behaved
 420                 if ($open_quote) {
 421                     if ($e) {
 422                         $e->send(E_ERROR, 'Lexer: Missing end quote');
 423                     }
 424                     $value = substr($quoted_value, 1);
 425                 } else {
 426                     $value = $quoted_value;
 427                 }
 428             }
 429             if ($value === false) {
 430                 $value = '';
 431             }
 432             return array($key => $this->parseData($value));
 433         }
 434
 435         // setup loop environment
 436         $array = array(); // return assoc array of attributes
 437         $cursor = 0; // current position in string (moves forward)
 438         $size = strlen($string); // size of the string (stays the same)
 439
 440         // if we have unquoted attributes, the parser expects a terminating
 441         // space, so let's guarantee that there's always a terminating space.
 442         $string .= ' ';
 443
 444         $old_cursor = -1;
 445         while ($cursor < $size) {
 446             if ($old_cursor >= $cursor) {
 447                 throw new Exception("Infinite loop detected");
 448             }
 449             $old_cursor = $cursor;
 450
 451             $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
 452             // grab the key
 453
 454             $key_begin = $cursor; //we're currently at the start of the key
 455
 456             // scroll past all characters that are the key (not whitespace or =)
 457             $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
 458
 459             $key_end = $cursor; // now at the end of the key
 460
 461             $key = substr($string, $key_begin, $key_end - $key_begin);
 462
 463             if (!$key) {
 464                 if ($e) {
 465                     $e->send(E_ERROR, 'Lexer: Missing attribute key');
 466                 }
 467                 $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
 468                 continue; // empty key
 469             }
 470
 471             // scroll past all whitespace
 472             $cursor += strspn($string, $this->_whitespace, $cursor);
 473
 474             if ($cursor >= $size) {
 475                 $array[$key] = $key;
 476                 break;
 477             }
 478
 479             // if the next character is an equal sign, we've got a regular
 480             // pair, otherwise, it's a bool attribute
 481             $first_char = @$string[$cursor];
 482
 483             if ($first_char == '=') {
 484                 // key="value"
 485
 486                 $cursor++;
 487                 $cursor += strspn($string, $this->_whitespace, $cursor);
 488
 489                 if ($cursor === false) {
 490                     $array[$key] = '';
 491                     break;
 492                 }
 493
 494                 // we might be in front of a quote right now
 495
 496                 $char = @$string[$cursor];
 497
 498                 if ($char == '"' || $char == "'") {
 499                     // it's quoted, end bound is $char
 500                     $cursor++;
 501                     $value_begin = $cursor;
 502                     $cursor = strpos($string, $char, $cursor);
 503                     $value_end = $cursor;
 504                 } else {
 505                     // it's not quoted, end bound is whitespace
 506                     $value_begin = $cursor;
 507                     $cursor += strcspn($string, $this->_whitespace, $cursor);
 508                     $value_end = $cursor;
 509                 }
 510
 511                 // we reached a premature end
 512                 if ($cursor === false) {
 513                     $cursor = $size;
 514                     $value_end = $cursor;
 515                 }
 516
 517                 $value = substr($string, $value_begin, $value_end - $value_begin);
 518                 if ($value === false) {
 519                     $value = '';
 520                 }
 521                 $array[$key] = $this->parseData($value);
 522                 $cursor++;
 523             } else {
 524                 // boolattr
 525                 if ($key !== '') {
 526                     $array[$key] = $key;
 527                 } else {
 528                     // purely theoretical
 529                     if ($e) {
 530                         $e->send(E_ERROR, 'Lexer: Missing attribute key');
 531                     }
 532                 }
 533             }
 534         }
 535         return $array;
 536     }
 537 }
 538
 539 // vim: et sw=4 sts=4