HTMLPurifier/Lexer/DOMLex.php

0001 <?php
0002
0003 /**
0004  * Parser that uses PHP 5's DOM extension (part of the core).
0005  *
0006  * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
0007  * It gives us a forgiving HTML parser, which we use to transform the HTML
0008  * into a DOM, and then into the tokens.  It is blazingly fast (for large
0009  * documents, it performs twenty times faster than
0010  * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
0011  *
0012  * @note Any empty elements will have empty tokens associated with them, even if
0013  * this is prohibited by the spec. This cannot be fixed until the spec
0014  * comes into play.
0015  *
0016  * @note PHP's DOM extension does not actually parse any entities, we use
0017  *       our own function to do that.
0018  *
0019  * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
0020  *          If this is a huge problem, due to the fact that HTML is hand
0021  *          edited and you are unable to get a parser cache that caches the
0022  *          output of HTML Purifier while keeping the original HTML lying
0023  *          around, you may want to run Tidy on the resulting output or use
0024  *          HTMLPurifier_Lexer_DirectLex.
0025  */
0026
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /**
     * Factory used to build every token this lexer emits.
     * @type HTMLPurifier_TokenFactory
     */
    private $factory;

    public function __construct()
    {
        // setup the factory
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Turns an HTML fragment into a flat list of tokens by letting the DOM
     * extension parse it and then walking the resulting tree.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";

            // The preg_* functions return null on a PCRE error (for example
            // when the backtrack limit is hit on a very large comment).
            // Work on a copy so that a failure at any step leaves $html
            // untouched instead of silently discarding the whole document.
            $fixed = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            while ($fixed !== null) {
                // repeat until a pass no longer changes anything
                $next = preg_replace("/<($char)/i", '&lt;\\1', $fixed);
                if ($next === $fixed) {
                    break;
                }
                $fixed = $next;
            }
            if ($fixed !== null) {
                // fix comments
                $fixed = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $fixed);
            }
            if ($fixed !== null) {
                $html = $fixed;
            }
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // libxml reports malformed markup through PHP errors; mute them
        // for the duration of the parse only
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $tokens = array();

        // item() yields null when the element is absent, so check each
        // step instead of chaining calls on a possibly-null value
        $root = $doc->getElementsByTagName('html')->item(0); // <html>
        if ($root === null) {
            return $tokens;
        }
        $body = $root->getElementsByTagName('body')->item(0); // <body>
        if ($body === null) {
            return $tokens;
        }

        $div = $body->getElementsByTagName('div')->item(0); // <div>
        if ($div === null) {
            // the wrapper <div> was not found: fall back to whatever
            // ended up inside <body> rather than failing
            $this->tokenizeDOM($body, $tokens, $config);
            return $tokens;
        }

        $this->tokenizeDOM($div, $tokens, $config);
        // If the div has a sibling, that means we tripped across
        // a premature </div> tag.  So remove the div we parsed,
        // and then tokenize the rest of body.  We can't tokenize
        // the sibling directly as we'll lose the tags in that case.
        if ($div->nextSibling) {
            $body->removeChild($div);
            $this->tokenizeDOM($body, $tokens, $config);
        }
        return $tokens;
    }

    /**
     * Iterative function that tokenizes a node, putting it into an accumulator.
     * To iterate is human, to recurse divine - L. Peter Deutsch
     *
     * The node passed in is treated as an implicit container: no start or
     * end token is emitted for it, only for its descendants.
     *
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens;
     *                                       new tokens are appended to it by reference.
     * @param HTMLPurifier_Config $config
     */
    protected function tokenizeDOM($node, &$tokens, $config)
    {
        $level = 0;
        // one queue of pending nodes per depth level
        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
        // elements per depth level that still need an end token
        $closingNodes = array();
        do {
            while (!$nodes[$level]->isEmpty()) {
                $node = $nodes[$level]->shift(); // FIFO
                // level 0 is the implicit container, which is not collected
                $collect = $level > 0;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    // descend: the children are processed before the
                    // remaining siblings of the current node
                    $level++;
                    $nodes[$level] = new HTMLPurifier_Queue();
                    foreach ($node->childNodes as $childNode) {
                        $nodes[$level]->push($childNode);
                    }
                }
            }
            $level--;
            // never emit an end token at level 0 (the implicit container)
            if ($level && isset($closingNodes[$level])) {
                while ($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /**
     * Appends the token that opens (or fully represents) a single DOM node.
     *
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens.
     * @param bool $collect  Says whether or not start and close are collected, set to
     *                    false for the first node because it's the implicit DIV
     *                    tag you're dealing with.
     * @param HTMLPurifier_Config $config
     * @return bool if the token needs an endtoken
     * @todo data and tagName properties don't seem to exist in DOMNode?
     */
    protected function createStartNode($node, &$tokens, $collect, $config)
    {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return false;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start
                && ($last->name === 'script' || $last->name === 'style')
            ) {
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    // older PHP versions return false from substr() when
                    // nothing is left after the offset; keep $data a string
                    $data = (string)substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = (string)substr($data, 0, -3);
                    }
                    // An opener without the matching closer is highly
                    // suspicious, but the data is passed on as-is.
                }
            }
            $tokens[] = $this->factory->createText($this->parseText($data, $config));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this code is only invoked for comments in script/style in versions
            // of libxml pre-2.6.28 (regular comments, of course, are still
            // handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        } elseif ($node->nodeType !== XML_ELEMENT_NODE) {
            // not-well tested: there may be other nodes we have to grab
            return false;
        }

        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
        // copy the name into a local first: the original code noted that
        // it "somehow gets dropped" when passed along directly
        $tag_name = $node->tagName;

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($tag_name, $attr);
            }
            return false;
        }

        if ($collect) {
            $tokens[] = $this->factory->createStart($tag_name, $attr);
        }
        return true;
    }

    /**
     * Appends the end token for an element node.
     *
     * @param DOMNode $node
     * @param HTMLPurifier_Token[] $tokens
     */
    protected function createEndNode($node, &$tokens)
    {
        $tokens[] = $this->factory->createEnd($node->tagName);
    }


    /**
     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
     *
     * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
     * @return array Associative array of attributes (name => value).
     */
    protected function transformAttrToAssoc($node_map)
    {
        // NamedNodeMap is documented very poorly, so we're relying on
        // the fact that it can be iterated with foreach; iterating an
        // empty map simply yields an empty array
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * An error handler that mutes all errors
     * @param int $errno
     * @param string $errstr
     */
    public function muteErrorHandler($errno, $errstr)
    {
    }

    /**
     * Callback function for undoing escaping of stray angled brackets
     * in comments
     * @param array $matches
     * @return string
     */
    public function callbackUndoCommentSubst($matches)
    {
        return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
    }

    /**
     * Callback function that entity-izes ampersands in comments so that
     * callbackUndoCommentSubst doesn't clobber them
     * @param array $matches
     * @return string
     */
    public function callbackArmorCommentEntities($matches)
    {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in the necessary HTML
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @param bool $use_div Whether to enclose the fragment in a <div> inside <body>
     * @return string
     */
    protected function wrapHTML($html, $config, $context, $use_div = true)
    {
        $def = $config->getDefinition('HTML');
        $ret = '';

        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) {
                $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            } else {
                // a system identifier without a public one has to be
                // introduced by the SYSTEM keyword
                $ret .= 'SYSTEM ';
            }
            if (!empty($def->doctype->dtdSystem)) {
                $ret .= '"' . $def->doctype->dtdSystem . '" ';
            }
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // No protection if $html contains a stray </div>!
        $ret .= '</head><body>';
        if ($use_div) {
            $ret .= '<div>';
        }
        $ret .= $html;
        if ($use_div) {
            $ret .= '</div>';
        }
        $ret .= '</body></html>';
        return $ret;
    }
}
0290
0291 // vim: et sw=4 sts=4