File indexing completed on 2024-12-22 05:36:20
<?php

/**
 * Our in-house implementation of a parser.
 *
 * A pure PHP parser, DirectLex has absolutely no dependencies, making
 * it a reasonably good default for PHP4.  Written with efficiency in mind,
 * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
 * pales in comparison to HTMLPurifier_Lexer_DOMLex.
 *
 * @todo Reread XML spec and document differences.
 */
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{
    /**
     * @type bool
     */
    public $tracksLineNumbers = true;

    /**
     * Whitespace characters for str(c)spn: space, tab, CR and LF.
     * @type string
     */
    protected $_whitespace = "\x20\x09\x0D\x0A";

    /**
     * Callback function for script CDATA fudge: escapes the contents of a
     * script element while leaving its opening and closing tags untouched.
     * @param array $matches, in form of array(opening tag, contents, closing tag)
     *        (offsets 1, 2 and 3; offset 0 is the full match)
     * @return string
     */
    protected function scriptCallback($matches)
    {
        return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
    }

    /**
     * Splits an HTML string into a flat list of text, comment, start, end
     * and empty-tag tokens by scanning for '<' and '>' with string functions.
     *
     * When line numbers are maintained (Core.MaintainLineNumbers, or
     * Core.CollectErrors if the former is null), every token is stamped via
     * rawPosition() and the 'CurrentLine'/'CurrentCol' context variables are
     * registered for the duration of the call.
     *
     * @param String $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        // special normalization for script tags without any armor
        // our "armor" heuristic is a < sign any number of whitespaces after
        // the first script tag
        if ($config->get('HTML.Trusted')) {
            $html = preg_replace_callback(
                '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                array($this, 'scriptCallback'),
                $html
            );
        }

        $html = $this->normalize($html, $config, $context);

        $cursor = 0; // our location in the text
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array

        // This is also treated to mean maintain *column* numbers too
        $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

        if ($maintain_line_numbers === null) {
            // automatically determine line numbering by checking
            // if error collection is on
            $maintain_line_numbers = $config->get('Core.CollectErrors');
        }

        if ($maintain_line_numbers) {
            $current_line = 1;
            $current_col = 0;
            $length = strlen($html);
        } else {
            $current_line = false;
            $current_col = false;
            $length = false;
        }
        $context->register('CurrentLine', $current_line);
        $context->register('CurrentCol', $current_col);
        $nl = "\n";
        // how often to manually recalculate. This will ALWAYS be right,
        // but it's pretty wasteful. Set to 0 to turn off
        $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // for testing synchronization
        $loops = 0;

        while (++$loops) {
            // $cursor is either at the start of a token, or inside of
            // a tag (i.e. there was a < immediately before it), as indicated
            // by $inside_tag

            if ($maintain_line_numbers) {
                // $rcursor, however, is always at the start of a token.
                $rcursor = $cursor - (int)$inside_tag;

                // Column number is cheap, so we calculate it every round.
                // We're interested at the *end* of the newline string, so
                // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
                // from our "rcursor" position.
                $nl_pos = strrpos($html, $nl, $rcursor - $length);
                $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

                // recalculate lines
                if ($synchronize_interval && // synchronization is on
                    $cursor > 0 && // cursor is further than zero
                    $loops % $synchronize_interval === 0) { // time to synchronize!
                    $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
                }
            }

            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);

            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
            }

            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
                $token = new HTMLPurifier_Token_Text(
                    $this->parseText(
                        substr($html, $cursor, $position_next_lt - $cursor),
                        $config
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_lt + 1;
                $inside_tag = true;
                continue;
            } elseif (!$inside_tag) {
                // We are not inside tag but there are no more tags
                // If we're already at the end, break
                if ($cursor === strlen($html)) {
                    break;
                }
                // Create Text of rest of string
                $token = new HTMLPurifier_Token_Text(
                    $this->parseText(
                        substr($html, $cursor),
                        $config
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
                // Grab the internals of the tag
                $strlen_segment = $position_next_gt - $cursor;

                if ($strlen_segment < 1) {
                    // "<>": there is no tag name to process. Handle it the
                    // same way as the "Unescaped lt" case further down: emit
                    // the '<' as text and leave tag mode WITHOUT advancing
                    // the cursor, so the '>' is lexed as ordinary text on the
                    // next round. Previously the token was built but never
                    // stored, the '>' was skipped and $inside_tag stayed
                    // true, which made the following text parse as a tag.
                    if ($e) {
                        $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    }
                    $token = new HTMLPurifier_Token_Text('<');
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    continue;
                }

                $segment = substr($html, $cursor, $strlen_segment);

                if ($segment === false) {
                    // somehow, we attempted to access beyond the end of
                    // the string, defense-in-depth, reported by Nate Abele
                    break;
                }

                // Check if it's a comment
                if (substr($segment, 0, 3) === '!--') {
                    // re-determine segment length, looking for -->
                    $position_comment_end = strpos($html, '-->', $cursor);
                    if ($position_comment_end === false) {
                        // uh oh, we have a comment that extends to
                        // infinity. Can't be helped: set comment
                        // end position to end of string
                        if ($e) {
                            $e->send(E_WARNING, 'Lexer: Unclosed comment');
                        }
                        $position_comment_end = strlen($html);
                        $end = true;
                    } else {
                        $end = false;
                    }
                    $strlen_segment = $position_comment_end - $cursor;
                    $segment = substr($html, $cursor, $strlen_segment);
                    // drop the leading "!--" from the comment body
                    $token = new HTMLPurifier_Token_Comment(
                        substr($segment, 3, $strlen_segment - 3)
                    );
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                    }
                    $array[] = $token;
                    // skip the closing "-->" only when one was actually found
                    $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                    $inside_tag = false;
                    continue;
                }

                // Check if it's an end tag
                $is_end_tag = (strpos($segment, '/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
                    $token = new HTMLPurifier_Token_End($type);
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Check leading character is alphabetic, if not, we may
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alpha($segment[0])) {
                    // XML: $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) {
                        $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    }
                    $token = new HTMLPurifier_Token_Text('<');
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        // NOTE(review): the cursor is not advanced in this
                        // branch, so the newlines counted here look like they
                        // are counted again when the following text token is
                        // produced -- confirm before relying on exact line
                        // numbers without periodic synchronization.
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    continue;
                }

                // Check if it is explicitly self closing, if so, remove
                // trailing slash. Remember, we could have a tag like <br>, so
                // any later token processing scripts must convert improperly
                // classified EmptyTags from StartTags.
                $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
                if ($is_self_closing) {
                    $strlen_segment--;
                    $segment = substr($segment, 0, $strlen_segment);
                }

                // Check if there are any attributes
                $position_first_space = strcspn($segment, $this->_whitespace);

                if ($position_first_space >= $strlen_segment) {
                    // no whitespace at all: the whole segment is the tag name
                    if ($is_self_closing) {
                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
                        $token = new HTMLPurifier_Token_Start($segment);
                    }
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Grab out all the data
                $type = substr($segment, 0, $position_first_space);
                $attribute_string = trim(substr($segment, $position_first_space));
                if ($attribute_string) {
                    $attr = $this->parseAttributeString(
                        $attribute_string,
                        $config,
                        $context
                    );
                } else {
                    $attr = array();
                }

                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
                // inside tag, but there's no ending > sign
                if ($e) {
                    $e->send(E_WARNING, 'Lexer: Missing gt');
                }
                $token = new HTMLPurifier_Token_Text(
                    '<' . $this->parseText(substr($html, $cursor), $config)
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                // no cursor scroll? Hmm...
                $array[] = $token;
                break;
            }
            // Defensive only: every branch above ends in continue or break.
            break;
        }

        $context->destroy('CurrentLine');
        $context->destroy('CurrentCol');
        return $array;
    }

    /**
     * PHP 5.0.x compatible substr_count that implements offset and length
     * @param string $haystack
     * @param string $needle
     * @param int $offset
     * @param int $length
     * @return int
     */
    protected function substrCount($haystack, $needle, $offset, $length)
    {
        // the version comparison is done once and cached across calls
        static $oldVersion;
        if ($oldVersion === null) {
            $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
        }
        if ($oldVersion) {
            // emulate offset/length by slicing the haystack first
            $haystack = substr($haystack, $offset, $length);
            return substr_count($haystack, $needle);
        } else {
            return substr_count($haystack, $needle, $offset, $length);
        }
    }

    /**
     * Takes the inside of an HTML tag and makes an assoc array of attributes.
     *
     * Strings with at most one '=' and no whitespace are handled by a fast
     * path; everything else is scanned attribute by attribute. Values are
     * passed through parseAttr(); valueless attributes map to their own name.
     *
     * @param string $string Inside of tag excluding name.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array Assoc array of attributes.
     * @throws Exception if the scan loop fails to advance its cursor.
     */
    public function parseAttributeString($string, $config, $context)
    {
        $string = (string)$string; // quick typecast

        if ($string === '') {
            return array();
        } // no attributes

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // let's see if we can abort as quickly as possible
        // one equal sign, no whitespace => one attribute
        $num_equal = substr_count($string, '=');
        // Test against every whitespace character the scanner below splits
        // on. Looking for a literal ' ' only (and treating offset 0 as "not
        // found") sent tab/newline separated attributes down the fast path.
        $has_space = strcspn($string, $this->_whitespace) < strlen($string);
        if ($num_equal === 0 && !$has_space) {
            // bool attribute
            return array($string => $string);
        } elseif ($num_equal === 1 && !$has_space) {
            // only one attribute
            list($key, $quoted_value) = explode('=', $string);
            $quoted_value = trim($quoted_value);
            if (!$key) {
                if ($e) {
                    $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }
                return array();
            }
            // strict comparison: the value "0" is not an empty value
            if ($quoted_value === '') {
                return array($key => '');
            }
            // $quoted_value is non-empty here, so both offsets exist
            $first_char = $quoted_value[0];
            $last_char = $quoted_value[strlen($quoted_value) - 1];

            $same_quote = ($first_char === $last_char);
            $open_quote = ($first_char === '"' || $first_char === "'");

            if ($same_quote && $open_quote) {
                // well behaved
                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
            } else {
                // not well behaved
                if ($open_quote) {
                    if ($e) {
                        $e->send(E_ERROR, 'Lexer: Missing end quote');
                    }
                    $value = substr($quoted_value, 1);
                } else {
                    $value = $quoted_value;
                }
            }
            if ($value === false) {
                $value = '';
            }
            return array($key => $this->parseAttr($value, $config));
        }

        // setup loop environment
        $array = array(); // return assoc array of attributes
        $cursor = 0; // current position in string (moves forward)
        $size = strlen($string); // size of the string (stays the same)

        // if we have unquoted attributes, the parser expects a terminating
        // space, so let's guarantee that there's always a terminating space.
        $string .= ' ';

        $old_cursor = -1;
        while ($cursor < $size) {
            if ($old_cursor >= $cursor) {
                throw new Exception("Infinite loop detected");
            }
            $old_cursor = $cursor;

            // skip leading whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            // grab the key
            $key_begin = $cursor; //we're currently at the start of the key

            // scroll past all characters that are the key (not whitespace or =)
            $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

            $key_end = $cursor; // now at the end of the key

            $key = substr($string, $key_begin, $key_end - $key_begin);

            if (!$key) {
                if ($e) {
                    $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }
                $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
                continue; // empty key
            }

            // scroll past all whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                $array[$key] = $key;
                break;
            }

            // if the next character is an equal sign, we've got a regular
            // pair, otherwise, it's a bool attribute
            $first_char = $string[$cursor];

            if ($first_char === '=') {
                // key="value"

                $cursor++;
                $cursor += strspn($string, $this->_whitespace, $cursor);

                // Ran off the end of the input: "key=" with nothing after it.
                // (This used to compare $cursor against false, which an
                // integer sum can never be, so the branch was unreachable.)
                if ($cursor >= $size) {
                    $array[$key] = '';
                    break;
                }

                // we might be in front of a quote right now

                $char = $string[$cursor];

                if ($char === '"' || $char === "'") {
                    // it's quoted, end bound is $char
                    $cursor++;
                    $value_begin = $cursor;
                    $cursor = strpos($string, $char, $cursor);
                    $value_end = $cursor;
                } else {
                    // it's not quoted, end bound is whitespace
                    $value_begin = $cursor;
                    $cursor += strcspn($string, $this->_whitespace, $cursor);
                    $value_end = $cursor;
                }

                // we reached a premature end (no closing quote was found)
                if ($cursor === false) {
                    $cursor = $size;
                    $value_end = $cursor;
                }

                $value = substr($string, $value_begin, $value_end - $value_begin);
                if ($value === false) {
                    $value = '';
                }
                $array[$key] = $this->parseAttr($value, $config);
                // step over the closing quote / terminating whitespace
                $cursor++;
            } else {
                // boolattr; $key cannot be empty here because empty keys
                // were rejected above
                $array[$key] = $key;
            }
        }
        return $array;
    }
}

// vim: et sw=4 sts=4