File indexing completed on 2024-12-22 05:36:20
<?php

/**
 * Parser that uses PHP 5's DOM extension (part of the core).
 *
 * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
 * It gives us a forgiving HTML parser, which we use to transform the HTML
 * into a DOM, and then into the tokens.  It is blazingly fast (for large
 * documents, it performs twenty times faster than
 * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
 *
 * @note Any empty elements will have empty tokens associated with them, even if
 * this is prohibited by the spec. This cannot be fixed until the spec
 * comes into play.
 *
 * @note PHP's DOM extension does not actually parse any entities, we use
 * our own function to do that.
 *
 * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
 * If this is a huge problem, due to the fact that HTML is hand
 * edited and you are unable to get a parser cache that caches
 * the output of HTML Purifier while keeping the original HTML lying
 * around, you may want to run Tidy on the resulting output or use
 * HTMLPurifier_Lexer_DirectLex
 */

class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /**
     * Factory used to build every token this lexer emits.
     * @type HTMLPurifier_TokenFactory
     */
    private $factory;

    public function __construct()
    {
        // setup the factory
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Tokenizes an HTML fragment by loading it into a DOMDocument and
     * walking the resulting tree.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            // Protect ampersands inside comments first, so the undo pass
            // below can tell our own "&lt;" apart from one the author wrote.
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // Escape every "<" that is not followed by a letter, "!" or "/".
            // Repeat until stable: a run such as "<<" needs more than one
            // pass because matches cannot overlap.
            do {
                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            // Restore the original text of comments.
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // Silence the warnings loadHTML() raises for malformed markup.
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
                      getElementsByTagName('body')->item(0);  // <body>

        $div = $body->getElementsByTagName('div')->item(0); // <div>
        $tokens = array();
        $this->tokenizeDOM($div, $tokens, $config);
        // If the div has a sibling, that means we tripped across
        // a premature </div> tag.  So remove the div we parsed,
        // and then tokenize the rest of body.  We can't tokenize
        // the sibling directly as we'll lose the tags in that case.
        if ($div->nextSibling) {
            $body->removeChild($div);
            $this->tokenizeDOM($body, $tokens, $config);
        }
        return $tokens;
    }

    /**
     * Iterative function that tokenizes a node, putting it into an accumulator.
     * To iterate is human, to recurse divine - L. Peter Deutsch
     *
     * Nothing is returned; tokens are appended to $tokens by reference.
     * The node passed in (level 0) is treated as a wrapper: no start or end
     * token is emitted for it, only for its descendants.
     *
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
     * @param HTMLPurifier_Config $config
     */
    protected function tokenizeDOM($node, &$tokens, $config)
    {
        $level = 0;
        // One FIFO queue of pending nodes per depth level.
        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
        // Per level, the elements whose end tags are still owed.
        $closingNodes = array();
        do {
            while (!$nodes[$level]->isEmpty()) {
                $node = $nodes[$level]->shift(); // FIFO
                // Level 0 holds only the wrapper node, which emits no tokens.
                $collect = $level > 0;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    // Descend: the children are processed before the
                    // remaining siblings of the current level.
                    $level++;
                    $nodes[$level] = new HTMLPurifier_Queue();
                    foreach ($node->childNodes as $childNode) {
                        $nodes[$level]->push($childNode);
                    }
                }
            }
            $level--;
            // Never close level 0: that is the wrapper node.
            if ($level && isset($closingNodes[$level])) {
                while ($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /**
     * Appends the token(s) that open a DOM node and reports whether a
     * matching end token must be emitted later.
     *
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
     * @param bool $collect  Says whether or not start and close are collected;
     *                       set to false at first recursion because it's the
     *                       implicit DIV tag you're dealing with.
     * @param HTMLPurifier_Config $config
     * @return bool if the token needs an endtoken
     * @todo data and tagName properties don't seem to exist in DOMNode?
     */
    protected function createStartNode($node, &$tokens, $collect, $config)
    {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return false;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
                // Strip a surrounding "<!-- ... -->" wrapper from the contents.
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    $data = substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    } else {
                        // Highly suspicious! Not sure what to do...
                    }
                }
            }
            $tokens[] = $this->factory->createText($this->parseText($data, $config));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this code is only invoked for comments in script/style in versions
            // of libxml pre-2.6.28 (regular comments, of course, are still
            // handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        } elseif ($node->nodeType !== XML_ELEMENT_NODE) {
            // not-well tested: there may be other nodes we have to grab
            return false;
        }

        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
            }
            return false;
        } else {
            if ($collect) {
                $tokens[] = $this->factory->createStart(
                    $tag_name = $node->tagName, // somehow, it gets dropped
                    $attr
                );
            }
            return true;
        }
    }

    /**
     * Appends the end token for an element node.
     *
     * @param DOMNode $node
     * @param HTMLPurifier_Token[] $tokens
     */
    protected function createEndNode($node, &$tokens)
    {
        $tokens[] = $this->factory->createEnd($node->tagName);
    }


    /**
     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
     *
     * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
     * @return array Associative array of attributes.
     */
    protected function transformAttrToAssoc($node_map)
    {
        // NamedNodeMap is not documented very well, so we're using
        // undocumented features, namely, the fact that it implements
        // Iterator and has a ->length attribute
        if ($node_map->length === 0) {
            return array();
        }
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * An error handler that mutes all errors
     * @param int $errno
     * @param string $errstr
     */
    public function muteErrorHandler($errno, $errstr)
    {
    }

    /**
     * Callback function for undoing escaping of stray angled brackets
     * in comments
     * @param array $matches Regex match: [1] is the comment body, [2] the
     *                       terminator ("-->" or empty at end of input).
     * @return string
     */
    public function callbackUndoCommentSubst($matches)
    {
        // strtr() with an array does not rescan its own output, so an
        // armored "&amp;lt;" comes back as the author's literal "&lt;".
        return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
    }

    /**
     * Callback function that entity-izes ampersands in comments so that
     * callbackUndoCommentSubst doesn't clobber them
     * @param array $matches Regex match: [1] is the comment body, [2] the
     *                       terminator ("-->" or empty at end of input).
     * @return string
     */
    public function callbackArmorCommentEntities($matches)
    {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in the necessary HTML
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @param bool $use_div Whether to enclose the fragment in a <div>.
     * @return string
     */
    protected function wrapHTML($html, $config, $context, $use_div = true)
    {
        $def = $config->getDefinition('HTML');
        $ret = '';

        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) {
                $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            }
            if (!empty($def->doctype->dtdSystem)) {
                $ret .= '"' . $def->doctype->dtdSystem . '" ';
            }
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // No protection if $html contains a stray </div>!
        $ret .= '</head><body>';
        if ($use_div) {
            $ret .= '<div>';
        }
        $ret .= $html;
        if ($use_div) {
            $ret .= '</div>';
        }
        $ret .= '</body></html>';
        return $ret;
    }
}

// vim: et sw=4 sts=4