File indexing completed on 2024-05-12 06:02:04

0001 <?php
0002 
0003 // why is this a top level function? Because PHP 5.2.0 doesn't seem to
0004 // understand how to interpret this filter if it's a static method.
0005 // It's all really silly, but if we go this route it might be reasonable
0006 // to coalesce all of these methods into one.
/**
 * Error handler that deliberately ignores whatever it is given.
 *
 * HTMLPurifier_Filter_ExtractStyleBlocks::cleanCSS() installs this with
 * set_error_handler() for the duration of the CSSTidy parse so that
 * diagnostics raised by the parser are discarded. It declares no
 * parameters; PHP allows the error arguments it is called with to be
 * passed anyway and they are simply unused.
 *
 * @return null Never false, so PHP's built-in handler is not invoked.
 */
function htmlpurifier_filter_extractstyleblocks_muteerrorhandler()
{
    // Intentionally empty: swallowing the error is the entire job.
    return null;
}
0010 
0011 /**
0012  * This filter extracts <style> blocks from input HTML, cleans them up
0013  * using CSSTidy, and then places them in $purifier->context->get('StyleBlocks')
0014  * so they can be used elsewhere in the document.
0015  *
0016  * @note
0017  *      See tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php for
0018  *      sample usage.
0019  *
0020  * @note
0021  *      This filter can also be used on stylesheets not included in the
0022  *      document--something purists would probably prefer. Just directly
0023  *      call HTMLPurifier_Filter_ExtractStyleBlocks->cleanCSS()
0024  */
class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter
{
    /**
     * Name of this filter.
     * @type string
     */
    public $name = 'ExtractStyleBlocks';

    /**
     * Raw <style> block contents collected by styleCallback() during the
     * current preFilter() run; emptied again before preFilter() returns.
     * @type array
     */
    private $_styleMatches = array();

    /**
     * CSS parser/serializer. Replaced in preFilter() when the
     * Filter.ExtractStyleBlocks.TidyImpl directive is non-null.
     * @type csstidy
     */
    private $_tidy;

    /**
     * Validates the part of a selector that follows '#'.
     * @type HTMLPurifier_AttrDef_HTML_ID
     */
    private $_id_attrdef;

    /**
     * Validates the part of a selector that follows '.'.
     * @type HTMLPurifier_AttrDef_CSS_Ident
     */
    private $_class_attrdef;

    /**
     * Validates the part of a selector that follows ':' against the
     * whitelist of supported pseudo-classes.
     * @type HTMLPurifier_AttrDef_Enum
     */
    private $_enum_attrdef;

    /**
     * Creates the default CSSTidy instance and the validators used for
     * the id, class and pseudo-class parts of selectors.
     */
    public function __construct()
    {
        $this->_tidy = new csstidy();
        // NOTE(review): presumably keeps CSSTidy from lowercasing
        // selectors -- confirm against the CSSTidy configuration docs.
        $this->_tidy->set_cfg('lowercase_s', false);
        $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true);
        $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident();
        $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(
            array(
                'first-child',
                'link',
                'visited',
                'active',
                'hover',
                'focus'
            )
        );
    }

    /**
     * Save the contents of CSS blocks to style matches
     * @param array $matches preg_replace style $matches array
     * @return string Always the empty string, so the matched <style>
     *      element is removed from the HTML.
     */
    protected function styleCallback($matches)
    {
        $this->_styleMatches[] = $matches[1];
        // Be explicit about the replacement text instead of relying on
        // an implicit null being converted to ''.
        return '';
    }

    /**
     * Removes inline <style> tags from HTML, saves them for later use
     *
     * The extracted blocks are registered in $context under the name
     * 'StyleBlocks' and, when a CSSTidy implementation is available,
     * each one is passed through cleanCSS().
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string The HTML with <style> elements removed. If the
     *      regular expression engine fails, the input is returned
     *      unchanged and no style blocks are reported.
     * @todo Extend to indicate non-text/css style blocks
     */
    public function preFilter($html, $config, $context)
    {
        $tidy = $config->get('Filter.ExtractStyleBlocks.TidyImpl');
        if ($tidy !== null) {
            $this->_tidy = $tidy;
        }
        // NB: this must be NON-greedy because if we have
        // <style>foo</style>  <style>bar</style>
        // we must not grab foo</style>  <style>bar
        $stripped = preg_replace_callback(
            '#<style(?:\s.*)?>(.*)<\/style>#isU',
            array($this, 'styleCallback'),
            $html
        );
        $style_blocks = $this->_styleMatches;
        $this->_styleMatches = array(); // reset
        if ($stripped === null) {
            // preg_replace_callback() returns null on a PCRE error (for
            // example an exhausted backtrack limit). Returning that null
            // would silently discard the whole document, so keep the
            // input as-is and drop whatever was captured before the
            // failure, since those blocks are still present in $html.
            $style_blocks = array();
        } else {
            $html = $stripped;
        }
        // NOTE(review): the cleanup below is only visible through the
        // context if register() stores its argument by reference --
        // confirm against HTMLPurifier_Context::register().
        $context->register('StyleBlocks', $style_blocks); // $context must not be reused
        if ($this->_tidy) {
            foreach ($style_blocks as &$style) {
                $style = $this->cleanCSS($style, $config, $context);
            }
            // break the reference left behind by the by-reference loop
            unset($style);
        }
        return $html;
    }

    /**
     * Takes CSS (the stuff found in <style>) and cleans it.
     *
     * Selectors are reduced to a whitelisted subset (see filterSelector()),
     * optionally prefixed with every scope listed in
     * Filter.ExtractStyleBlocks.Scope, and every declaration is validated
     * against the CSS definition. Imports, charset and namespace
     * information are dropped before the stylesheet is re-serialized.
     *
     * @warning Requires CSSTidy <http://csstidy.sourceforge.net/>
     * @param string $css CSS styling to clean
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @throws HTMLPurifier_Exception
     * @return string Cleaned CSS
     */
    public function cleanCSS($css, $config, $context)
    {
        // prepare scope
        $scope = $config->get('Filter.ExtractStyleBlocks.Scope');
        if ($scope !== null) {
            $scopes = array_map('trim', explode(',', $scope));
        } else {
            $scopes = array();
        }
        // remove the HTML comment wrapper (<!-- ... -->) from the CSS
        $css = trim($css);
        if (strncmp('<!--', $css, 4) === 0) {
            $css = substr($css, 4);
        }
        if (strlen($css) > 3 && substr($css, -3) == '-->') {
            $css = substr($css, 0, -3);
        }
        $css = trim($css);
        $this->parseSilently($css);
        $css_definition = $config->getDefinition('CSS');
        $html_definition = $config->getDefinition('HTML');
        $new_css = array();
        foreach ($this->_tidy->css as $k => $decls) {
            // $decls are all CSS declarations inside an @ selector
            $new_decls = array();
            foreach ($decls as $selector => $style) {
                $selector = trim($selector);
                if ($selector === '') {
                    continue;
                } // should not happen
                $new_selectors = $this->filterSelectorGroup(
                    $selector,
                    $scopes,
                    $html_definition,
                    $config,
                    $context
                );
                if (empty($new_selectors)) {
                    // nothing valid to attach the declarations to
                    continue;
                }
                $selector = implode(', ', $new_selectors);
                $new_decls[$selector] = $this->filterDeclarations(
                    $style,
                    $css_definition,
                    $config,
                    $context
                );
            }
            $new_css[$k] = $new_decls;
        }
        // remove stuff that shouldn't be used, could be reenabled
        // after security risks are analyzed
        $this->_tidy->css = $new_css;
        $this->_tidy->import = array();
        $this->_tidy->charset = null;
        $this->_tidy->namespace = null;
        $css = $this->_tidy->print->plain();
        // we are going to escape any special characters <>& to ensure
        // that no funny business occurs (i.e. </style> in a font-family prop).
        if ($config->get('Filter.ExtractStyleBlocks.Escaping')) {
            $css = str_replace(
                array('<', '>', '&'),
                array('\3C ', '\3E ', '\26 '),
                $css
            );
        }
        return $css;
    }

    /**
     * Runs the CSSTidy parser with PHP errors muted, restoring the
     * previous error handler even when the parser throws.
     * @param string $css
     */
    private function parseSilently($css)
    {
        set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler');
        // try/finally would need PHP 5.5, so restore explicitly on each
        // path. Naming Throwable is harmless on PHP 5: a catch clause for
        // a class that does not exist simply never matches.
        try {
            $this->_tidy->parse($css);
        } catch (Exception $e) {
            restore_error_handler();
            throw $e;
        } catch (Throwable $e) {
            restore_error_handler();
            throw $e;
        }
        restore_error_handler();
    }

    /**
     * Cleans a comma separated selector group and applies the scopes.
     * @param string $selector Trimmed, non-empty selector group
     * @param array $scopes Scope prefixes; may be empty
     * @param HTMLPurifier_HTMLDefinition $html_definition
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @throws HTMLPurifier_Exception
     * @return array List of cleaned selectors; one entry per scope for
     *      every selector that survived, or one entry per surviving
     *      selector when no scopes are configured.
     */
    private function filterSelectorGroup($selector, $scopes, $html_definition, $config, $context)
    {
        $new_selectors = array();
        foreach (array_map('trim', explode(',', $selector)) as $sel) {
            $nsel = $this->filterSelector($sel, $html_definition, $config, $context);
            if ($nsel === null) {
                continue;
            }
            if (!empty($scopes)) {
                foreach ($scopes as $s) {
                    $new_selectors[] = "$s $nsel";
                }
            } else {
                $new_selectors[] = $nsel;
            }
        }
        return $new_selectors;
    }

    /**
     * Cleans one selector (simple selectors joined by combinators).
     * @param string $sel
     * @param HTMLPurifier_HTMLDefinition $html_definition
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @throws HTMLPurifier_Exception
     * @return string|null Cleaned selector, or null if no part was valid
     */
    private function filterSelector($sel, $html_definition, $config, $context)
    {
        // Here is the relevant part of the CSS grammar:
        //
        // ruleset
        //   : selector [ ',' S* selector ]* '{' ...
        // selector
        //   : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
        // combinator
        //   : '+' S*
        //   : '>' S*
        // simple_selector
        //   : element_name [ HASH | class | attrib | pseudo ]*
        //   | [ HASH | class | attrib | pseudo ]+
        // element_name
        //   : IDENT | '*'
        //   ;
        // class
        //   : '.' IDENT
        //   ;
        // attrib
        //   : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
        //     [ IDENT | STRING ] S* ]? ']'
        //   ;
        // pseudo
        //   : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
        //   ;
        //
        // For reference, here are the relevant tokens:
        //
        // HASH         #{name}
        // IDENT        {ident}
        // INCLUDES     ~=
        // DASHMATCH    |=
        // STRING       {string}
        // FUNCTION     {ident}\(
        //
        // And the lexical scanner tokens
        //
        // name         {nmchar}+
        // nmchar       [_a-z0-9-]|{nonascii}|{escape}
        // nonascii     [\240-\377]
        // escape       {unicode}|\\[^\r\n\f0-9a-f]
        // unicode      \\{h}{1,6}(\r\n|[ \t\r\n\f])?
        // ident        -?{nmstart}{nmchar}*
        // nmstart      [_a-z]|{nonascii}|{escape}
        // string       {string1}|{string2}
        // string1      \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
        // string2      \'([^\n\r\f\\']|\\{nl}|{escape})*\'
        //
        // We'll implement a subset (in order to reduce attack
        // surface); in particular:
        //
        //      - No Unicode support
        //      - No escapes support
        //      - No string support (by proxy no attrib support)
        //      - element_name is matched against allowed
        //        elements (some people might find this
        //        annoying...)
        //      - Pseudo-classes one of :first-child, :link,
        //        :visited, :active, :hover, :focus

        // split on +, > and spaces
        $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);
        // even indices are chunks, odd indices are delimiters
        $nsel = null;
        $delim = null; // guaranteed to be non-null after two loop iterations
        for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
            $x = $basic_selectors[$i];
            if ($i % 2) {
                // delimiter: descendant combinator stays a single space,
                // '+' and '>' get padded with spaces
                $delim = ($x === ' ') ? ' ' : ' ' . $x . ' ';
                continue;
            }
            $nx = $this->filterSimpleSelector($x, $html_definition, $config, $context);
            if ($nx === null) {
                // delimiters to the left of invalid
                // basic selector ignored
                continue;
            }
            $nsel = ($nsel === null) ? $nx : $nsel . $delim . $nx;
        }
        return $nsel;
    }

    /**
     * Cleans one simple selector: an optional element name followed by
     * any number of #id, .class and :pseudo-class parts. Parts that fail
     * validation are dropped individually.
     * @param string $x
     * @param HTMLPurifier_HTMLDefinition $html_definition
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @throws HTMLPurifier_Exception
     * @return string|null Cleaned simple selector, or null if neither the
     *      element name nor any part was valid
     */
    private function filterSimpleSelector($x, $html_definition, $config, $context)
    {
        // index 0 is the element name (possibly empty), then
        // delimiter/value pairs alternate
        $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
        $sdelim = null;
        $nx = null;
        for ($j = 0, $cc = count($components); $j < $cc; $j++) {
            $y = $components[$j];
            if ($j === 0) {
                // element names are looked up in lowercase; '*' is
                // always accepted
                if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
                    $nx = $y;
                }
                // otherwise $nx stays null; this matters if we don't
                // manage to find any valid selector content, in which
                // case the caller ignores the outer delimiter
            } elseif ($j % 2) {
                // set delimiter
                $sdelim = $y;
            } else {
                if ($sdelim === '#') {
                    $attrdef = $this->_id_attrdef;
                } elseif ($sdelim === '.') {
                    $attrdef = $this->_class_attrdef;
                } elseif ($sdelim === ':') {
                    $attrdef = $this->_enum_attrdef;
                } else {
                    throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
                }
                $r = $attrdef->validate($y, $config, $context);
                if ($r !== false) {
                    if ($r !== true) {
                        // the validator handed back a replacement value
                        $y = $r;
                    }
                    if ($nx === null) {
                        $nx = '';
                    }
                    $nx .= $sdelim . $y;
                }
            }
        }
        return $nx;
    }

    /**
     * Drops declarations whose property is unknown to the CSS definition
     * or whose value fails validation; replaces the remaining values with
     * their validated form.
     * @param array $style Map of property name to value
     * @param HTMLPurifier_CSSDefinition $css_definition
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array Filtered map of property name to value
     */
    private function filterDeclarations($style, $css_definition, $config, $context)
    {
        foreach ($style as $name => $value) {
            if (!isset($css_definition->info[$name])) {
                unset($style[$name]);
                continue;
            }
            $def = $css_definition->info[$name];
            $ret = $def->validate($value, $config, $context);
            if ($ret === false) {
                unset($style[$name]);
            } else {
                $style[$name] = $ret;
            }
        }
        return $style;
    }
}
0340 
0341 // vim: et sw=4 sts=4