HTMLPurifier/Lexer/DirectLex.php

0001 <?php
0002
0003 /**
0004  * Our in-house implementation of a parser.
0005  *
0006  * A pure PHP parser, DirectLex has absolutely no dependencies, making
0007  * it a reasonably good default for PHP4.  Written with efficiency in mind,
0008  * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
0009  * pales in comparison to HTMLPurifier_Lexer_DOMLex.
0010  *
0011  * @todo Reread XML spec and document differences.
0012  */
0013 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
0014 {
    /**
     * NOTE(review): presumably advertises that this lexer can record line
     * numbers on the tokens it emits (tokenizeHTML() does so when line
     * numbers are being maintained). It is not read anywhere in this
     * class; confirm how callers use it.
     * @type bool
     */
    public $tracksLineNumbers = true;

    /**
     * Whitespace characters for str(c)spn: space, horizontal tab,
     * carriage return and line feed.
     * @type string
     */
    protected $_whitespace = "\x20\x09\x0D\x0A";
0025
0026     /**
0027      * Callback function for script CDATA fudge
0028      * @param array $matches, in form of array(opening tag, contents, closing tag)
0029      * @return string
0030      */
0031     protected function scriptCallback($matches)
0032     {
0033         return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
0034     }
0035
0036     /**
0037      * @param String $html
0038      * @param HTMLPurifier_Config $config
0039      * @param HTMLPurifier_Context $context
0040      * @return array|HTMLPurifier_Token[]
0041      */
0042     public function tokenizeHTML($html, $config, $context)
0043     {
0044         // special normalization for script tags without any armor
0045         // our "armor" heurstic is a < sign any number of whitespaces after
0046         // the first script tag
0047         if ($config->get('HTML.Trusted')) {
0048             $html = preg_replace_callback(
0049                 '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
0050                 array($this, 'scriptCallback'),
0051                 $html
0052             );
0053         }
0054
0055         $html = $this->normalize($html, $config, $context);
0056
0057         $cursor = 0; // our location in the text
0058         $inside_tag = false; // whether or not we're parsing the inside of a tag
0059         $array = array(); // result array
0060
0061         // This is also treated to mean maintain *column* numbers too
0062         $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
0063
0064         if ($maintain_line_numbers === null) {
0065             // automatically determine line numbering by checking
0066             // if error collection is on
0067             $maintain_line_numbers = $config->get('Core.CollectErrors');
0068         }
0069
0070         if ($maintain_line_numbers) {
0071             $current_line = 1;
0072             $current_col = 0;
0073             $length = strlen($html);
0074         } else {
0075             $current_line = false;
0076             $current_col = false;
0077             $length = false;
0078         }
0079         $context->register('CurrentLine', $current_line);
0080         $context->register('CurrentCol', $current_col);
0081         $nl = "\n";
0082         // how often to manually recalculate. This will ALWAYS be right,
0083         // but it's pretty wasteful. Set to 0 to turn off
0084         $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
0085
0086         $e = false;
0087         if ($config->get('Core.CollectErrors')) {
0088             $e =& $context->get('ErrorCollector');
0089         }
0090
0091         // for testing synchronization
0092         $loops = 0;
0093
0094         while (++$loops) {
0095             // $cursor is either at the start of a token, or inside of
0096             // a tag (i.e. there was a < immediately before it), as indicated
0097             // by $inside_tag
0098
0099             if ($maintain_line_numbers) {
0100                 // $rcursor, however, is always at the start of a token.
0101                 $rcursor = $cursor - (int)$inside_tag;
0102
0103                 // Column number is cheap, so we calculate it every round.
0104                 // We're interested at the *end* of the newline string, so
0105                 // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
0106                 // from our "rcursor" position.
0107                 $nl_pos = strrpos($html, $nl, $rcursor - $length);
0108                 $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
0109
0110                 // recalculate lines
0111                 if ($synchronize_interval && // synchronization is on
0112                     $cursor > 0 && // cursor is further than zero
0113                     $loops % $synchronize_interval === 0) { // time to synchronize!
0114                     $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
0115                 }
0116             }
0117
0118             $position_next_lt = strpos($html, '<', $cursor);
0119             $position_next_gt = strpos($html, '>', $cursor);
0120
0121             // triggers on "<b>asdf</b>" but not "asdf <b></b>"
0122             // special case to set up context
0123             if ($position_next_lt === $cursor) {
0124                 $inside_tag = true;
0125                 $cursor++;
0126             }
0127
0128             if (!$inside_tag && $position_next_lt !== false) {
0129                 // We are not inside tag and there still is another tag to parse
0130                 $token = new
0131                 HTMLPurifier_Token_Text(
0132                     $this->parseText(
0133                         substr(
0134                             $html,
0135                             $cursor,
0136                             $position_next_lt - $cursor
0137                         ), $config
0138                     )
0139                 );
0140                 if ($maintain_line_numbers) {
0141                     $token->rawPosition($current_line, $current_col);
0142                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
0143                 }
0144                 $array[] = $token;
0145                 $cursor = $position_next_lt + 1;
0146                 $inside_tag = true;
0147                 continue;
0148             } elseif (!$inside_tag) {
0149                 // We are not inside tag but there are no more tags
0150                 // If we're already at the end, break
0151                 if ($cursor === strlen($html)) {
0152                     break;
0153                 }
0154                 // Create Text of rest of string
0155                 $token = new
0156                 HTMLPurifier_Token_Text(
0157                     $this->parseText(
0158                         substr(
0159                             $html,
0160                             $cursor
0161                         ), $config
0162                     )
0163                 );
0164                 if ($maintain_line_numbers) {
0165                     $token->rawPosition($current_line, $current_col);
0166                 }
0167                 $array[] = $token;
0168                 break;
0169             } elseif ($inside_tag && $position_next_gt !== false) {
0170                 // We are in tag and it is well formed
0171                 // Grab the internals of the tag
0172                 $strlen_segment = $position_next_gt - $cursor;
0173
0174                 if ($strlen_segment < 1) {
0175                     // there's nothing to process!
0176                     $token = new HTMLPurifier_Token_Text('<');
0177                     $cursor++;
0178                     continue;
0179                 }
0180
0181                 $segment = substr($html, $cursor, $strlen_segment);
0182
0183                 if ($segment === false) {
0184                     // somehow, we attempted to access beyond the end of
0185                     // the string, defense-in-depth, reported by Nate Abele
0186                     break;
0187                 }
0188
0189                 // Check if it's a comment
0190                 if (substr($segment, 0, 3) === '!--') {
0191                     // re-determine segment length, looking for -->
0192                     $position_comment_end = strpos($html, '-->', $cursor);
0193                     if ($position_comment_end === false) {
0194                         // uh oh, we have a comment that extends to
0195                         // infinity. Can't be helped: set comment
0196                         // end position to end of string
0197                         if ($e) {
0198                             $e->send(E_WARNING, 'Lexer: Unclosed comment');
0199                         }
0200                         $position_comment_end = strlen($html);
0201                         $end = true;
0202                     } else {
0203                         $end = false;
0204                     }
0205                     $strlen_segment = $position_comment_end - $cursor;
0206                     $segment = substr($html, $cursor, $strlen_segment);
0207                     $token = new
0208                     HTMLPurifier_Token_Comment(
0209                         substr(
0210                             $segment,
0211                             3,
0212                             $strlen_segment - 3
0213                         )
0214                     );
0215                     if ($maintain_line_numbers) {
0216                         $token->rawPosition($current_line, $current_col);
0217                         $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
0218                     }
0219                     $array[] = $token;
0220                     $cursor = $end ? $position_comment_end : $position_comment_end + 3;
0221                     $inside_tag = false;
0222                     continue;
0223                 }
0224
0225                 // Check if it's an end tag
0226                 $is_end_tag = (strpos($segment, '/') === 0);
0227                 if ($is_end_tag) {
0228                     $type = substr($segment, 1);
0229                     $token = new HTMLPurifier_Token_End($type);
0230                     if ($maintain_line_numbers) {
0231                         $token->rawPosition($current_line, $current_col);
0232                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
0233                     }
0234                     $array[] = $token;
0235                     $inside_tag = false;
0236                     $cursor = $position_next_gt + 1;
0237                     continue;
0238                 }
0239
0240                 // Check leading character is alnum, if not, we may
0241                 // have accidently grabbed an emoticon. Translate into
0242                 // text and go our merry way
0243                 if (!ctype_alpha($segment[0])) {
0244                     // XML:  $segment[0] !== '_' && $segment[0] !== ':'
0245                     if ($e) {
0246                         $e->send(E_NOTICE, 'Lexer: Unescaped lt');
0247                     }
0248                     $token = new HTMLPurifier_Token_Text('<');
0249                     if ($maintain_line_numbers) {
0250                         $token->rawPosition($current_line, $current_col);
0251                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
0252                     }
0253                     $array[] = $token;
0254                     $inside_tag = false;
0255                     continue;
0256                 }
0257
0258                 // Check if it is explicitly self closing, if so, remove
0259                 // trailing slash. Remember, we could have a tag like <br>, so
0260                 // any later token processing scripts must convert improperly
0261                 // classified EmptyTags from StartTags.
0262                 $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
0263                 if ($is_self_closing) {
0264                     $strlen_segment--;
0265                     $segment = substr($segment, 0, $strlen_segment);
0266                 }
0267
0268                 // Check if there are any attributes
0269                 $position_first_space = strcspn($segment, $this->_whitespace);
0270
0271                 if ($position_first_space >= $strlen_segment) {
0272                     if ($is_self_closing) {
0273                         $token = new HTMLPurifier_Token_Empty($segment);
0274                     } else {
0275                         $token = new HTMLPurifier_Token_Start($segment);
0276                     }
0277                     if ($maintain_line_numbers) {
0278                         $token->rawPosition($current_line, $current_col);
0279                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
0280                     }
0281                     $array[] = $token;
0282                     $inside_tag = false;
0283                     $cursor = $position_next_gt + 1;
0284                     continue;
0285                 }
0286
0287                 // Grab out all the data
0288                 $type = substr($segment, 0, $position_first_space);
0289                 $attribute_string =
0290                     trim(
0291                         substr(
0292                             $segment,
0293                             $position_first_space
0294                         )
0295                     );
0296                 if ($attribute_string) {
0297                     $attr = $this->parseAttributeString(
0298                         $attribute_string,
0299                         $config,
0300                         $context
0301                     );
0302                 } else {
0303                     $attr = array();
0304                 }
0305
0306                 if ($is_self_closing) {
0307                     $token = new HTMLPurifier_Token_Empty($type, $attr);
0308                 } else {
0309                     $token = new HTMLPurifier_Token_Start($type, $attr);
0310                 }
0311                 if ($maintain_line_numbers) {
0312                     $token->rawPosition($current_line, $current_col);
0313                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
0314                 }
0315                 $array[] = $token;
0316                 $cursor = $position_next_gt + 1;
0317                 $inside_tag = false;
0318                 continue;
0319             } else {
0320                 // inside tag, but there's no ending > sign
0321                 if ($e) {
0322                     $e->send(E_WARNING, 'Lexer: Missing gt');
0323                 }
0324                 $token = new
0325                 HTMLPurifier_Token_Text(
0326                     '<' .
0327                     $this->parseText(
0328                         substr($html, $cursor), $config
0329                     )
0330                 );
0331                 if ($maintain_line_numbers) {
0332                     $token->rawPosition($current_line, $current_col);
0333                 }
0334                 // no cursor scroll? Hmm...
0335                 $array[] = $token;
0336                 break;
0337             }
0338             break;
0339         }
0340
0341         $context->destroy('CurrentLine');
0342         $context->destroy('CurrentCol');
0343         return $array;
0344     }
0345
0346     /**
0347      * PHP 5.0.x compatible substr_count that implements offset and length
0348      * @param string $haystack
0349      * @param string $needle
0350      * @param int $offset
0351      * @param int $length
0352      * @return int
0353      */
0354     protected function substrCount($haystack, $needle, $offset, $length)
0355     {
0356         static $oldVersion;
0357         if ($oldVersion === null) {
0358             $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
0359         }
0360         if ($oldVersion) {
0361             $haystack = substr($haystack, $offset, $length);
0362             return substr_count($haystack, $needle);
0363         } else {
0364             return substr_count($haystack, $needle, $offset, $length);
0365         }
0366     }
0367
0368     /**
0369      * Takes the inside of an HTML tag and makes an assoc array of attributes.
0370      *
0371      * @param string $string Inside of tag excluding name.
0372      * @param HTMLPurifier_Config $config
0373      * @param HTMLPurifier_Context $context
0374      * @return array Assoc array of attributes.
0375      */
0376     public function parseAttributeString($string, $config, $context)
0377     {
0378         $string = (string)$string; // quick typecast
0379
0380         if ($string == '') {
0381             return array();
0382         } // no attributes
0383
0384         $e = false;
0385         if ($config->get('Core.CollectErrors')) {
0386             $e =& $context->get('ErrorCollector');
0387         }
0388
0389         // let's see if we can abort as quickly as possible
0390         // one equal sign, no spaces => one attribute
0391         $num_equal = substr_count($string, '=');
0392         $has_space = strpos($string, ' ');
0393         if ($num_equal === 0 && !$has_space) {
0394             // bool attribute
0395             return array($string => $string);
0396         } elseif ($num_equal === 1 && !$has_space) {
0397             // only one attribute
0398             list($key, $quoted_value) = explode('=', $string);
0399             $quoted_value = trim($quoted_value);
0400             if (!$key) {
0401                 if ($e) {
0402                     $e->send(E_ERROR, 'Lexer: Missing attribute key');
0403                 }
0404                 return array();
0405             }
0406             if (!$quoted_value) {
0407                 return array($key => '');
0408             }
0409             $first_char = @$quoted_value[0];
0410             $last_char = @$quoted_value[strlen($quoted_value) - 1];
0411
0412             $same_quote = ($first_char == $last_char);
0413             $open_quote = ($first_char == '"' || $first_char == "'");
0414
0415             if ($same_quote && $open_quote) {
0416                 // well behaved
0417                 $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
0418             } else {
0419                 // not well behaved
0420                 if ($open_quote) {
0421                     if ($e) {
0422                         $e->send(E_ERROR, 'Lexer: Missing end quote');
0423                     }
0424                     $value = substr($quoted_value, 1);
0425                 } else {
0426                     $value = $quoted_value;
0427                 }
0428             }
0429             if ($value === false) {
0430                 $value = '';
0431             }
0432             return array($key => $this->parseAttr($value, $config));
0433         }
0434
0435         // setup loop environment
0436         $array = array(); // return assoc array of attributes
0437         $cursor = 0; // current position in string (moves forward)
0438         $size = strlen($string); // size of the string (stays the same)
0439
0440         // if we have unquoted attributes, the parser expects a terminating
0441         // space, so let's guarantee that there's always a terminating space.
0442         $string .= ' ';
0443
0444         $old_cursor = -1;
0445         while ($cursor < $size) {
0446             if ($old_cursor >= $cursor) {
0447                 throw new Exception("Infinite loop detected");
0448             }
0449             $old_cursor = $cursor;
0450
0451             $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
0452             // grab the key
0453
0454             $key_begin = $cursor; //we're currently at the start of the key
0455
0456             // scroll past all characters that are the key (not whitespace or =)
0457             $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
0458
0459             $key_end = $cursor; // now at the end of the key
0460
0461             $key = substr($string, $key_begin, $key_end - $key_begin);
0462
0463             if (!$key) {
0464                 if ($e) {
0465                     $e->send(E_ERROR, 'Lexer: Missing attribute key');
0466                 }
0467                 $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
0468                 continue; // empty key
0469             }
0470
0471             // scroll past all whitespace
0472             $cursor += strspn($string, $this->_whitespace, $cursor);
0473
0474             if ($cursor >= $size) {
0475                 $array[$key] = $key;
0476                 break;
0477             }
0478
0479             // if the next character is an equal sign, we've got a regular
0480             // pair, otherwise, it's a bool attribute
0481             $first_char = @$string[$cursor];
0482
0483             if ($first_char == '=') {
0484                 // key="value"
0485
0486                 $cursor++;
0487                 $cursor += strspn($string, $this->_whitespace, $cursor);
0488
0489                 if ($cursor === false) {
0490                     $array[$key] = '';
0491                     break;
0492                 }
0493
0494                 // we might be in front of a quote right now
0495
0496                 $char = @$string[$cursor];
0497
0498                 if ($char == '"' || $char == "'") {
0499                     // it's quoted, end bound is $char
0500                     $cursor++;
0501                     $value_begin = $cursor;
0502                     $cursor = strpos($string, $char, $cursor);
0503                     $value_end = $cursor;
0504                 } else {
0505                     // it's not quoted, end bound is whitespace
0506                     $value_begin = $cursor;
0507                     $cursor += strcspn($string, $this->_whitespace, $cursor);
0508                     $value_end = $cursor;
0509                 }
0510
0511                 // we reached a premature end
0512                 if ($cursor === false) {
0513                     $cursor = $size;
0514                     $value_end = $cursor;
0515                 }
0516
0517                 $value = substr($string, $value_begin, $value_end - $value_begin);
0518                 if ($value === false) {
0519                     $value = '';
0520                 }
0521                 $array[$key] = $this->parseAttr($value, $config);
0522                 $cursor++;
0523             } else {
0524                 // boolattr
0525                 if ($key !== '') {
0526                     $array[$key] = $key;
0527                 } else {
0528                     // purely theoretical
0529                     if ($e) {
0530                         $e->send(E_ERROR, 'Lexer: Missing attribute key');
0531                     }
0532                 }
0533             }
0534         }
0535         return $array;
0536     }
0537 }
0538
0539 // vim: et sw=4 sts=4