File indexing completed on 2025-01-05 05:24:24
<?php

// why is this a top level function? Because PHP 5.2.0 doesn't seem to
// understand how to interpret this filter if it's a static method.
// It's all really silly, but if we go this route it might be reasonable
// to coalesce all of these methods into one.
/**
 * No-op error handler, installed around the CSSTidy parse call in
 * HTMLPurifier_Filter_ExtractStyleBlocks::cleanCSS().
 *
 * The body is intentionally empty: the handler does nothing with the
 * error it is handed.
 */
function htmlpurifier_filter_extractstyleblocks_muteerrorhandler()
{
}

/**
 * This filter extracts <style> blocks from input HTML, cleans them up
 * using CSSTidy, and then places them in $purifier->context->get('StyleBlocks')
 * so they can be used elsewhere in the document.
 *
 * @note
 *      See tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php for
 *      sample usage.
 *
 * @note
 *      This filter can also be used on stylesheets not included in the
 *      document--something purists would probably prefer. Just directly
 *      call HTMLPurifier_Filter_ExtractStyleBlocks->cleanCSS()
 */
class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter
{
    /**
     * @type string
     */
    public $name = 'ExtractStyleBlocks';

    /**
     * Scratch buffer filled by styleCallback() during a preFilter() run.
     * @type array
     */
    private $_styleMatches = array();

    /**
     * CSS parser/printer; may be replaced per call by the
     * Filter.ExtractStyleBlocks.TidyImpl configuration directive.
     * @type csstidy
     */
    private $_tidy;

    /**
     * Validator applied to the part of a selector following '#'.
     * @type HTMLPurifier_AttrDef_HTML_ID
     */
    private $_id_attrdef;

    /**
     * Validator applied to the part of a selector following '.'.
     * @type HTMLPurifier_AttrDef_CSS_Ident
     */
    private $_class_attrdef;

    /**
     * Whitelist applied to the part of a selector following ':'.
     * @type HTMLPurifier_AttrDef_Enum
     */
    private $_enum_attrdef;

    public function __construct()
    {
        $this->_tidy = new csstidy();
        $this->_tidy->set_cfg('lowercase_s', false);
        $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true);
        $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident();
        $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(
            array(
                'first-child',
                'link',
                'visited',
                'active',
                'hover',
                'focus'
            )
        );
    }

    /**
     * Save the contents of CSS blocks to style matches
     * @param array $matches preg_replace style $matches array
     * @return string Always the empty string, so that the matched
     *                <style> element is removed from the HTML.
     */
    protected function styleCallback($matches)
    {
        $this->_styleMatches[] = $matches[1];
        // Explicit replacement text instead of relying on an implicit
        // null return being cast to ''.
        return '';
    }

    /**
     * Removes inline <style> tags from HTML, saves them for later use
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     * @todo Extend to indicate non-text/css style blocks
     */
    public function preFilter($html, $config, $context)
    {
        $tidy = $config->get('Filter.ExtractStyleBlocks.TidyImpl');
        if ($tidy !== null) {
            $this->_tidy = $tidy;
        }
        // NB: this must be NON-greedy because if we have
        // <style>foo</style> <style>bar</style>
        // we must not grab foo</style> <style>bar
        $html = preg_replace_callback('#<style(?:\s.*)?>(.*)<\/style>#isU', array($this, 'styleCallback'), $html);
        $style_blocks = $this->_styleMatches;
        $this->_styleMatches = array(); // reset
        // NOTE(review): the blocks are cleaned *after* being registered, so
        // this presumably relies on register() keeping a reference to
        // $style_blocks -- confirm against HTMLPurifier_Context.
        $context->register('StyleBlocks', $style_blocks); // $context must not be reused
        if ($this->_tidy) {
            foreach ($style_blocks as &$style) {
                $style = $this->cleanCSS($style, $config, $context);
            }
            // break the reference so $style does not keep aliasing the
            // last element of $style_blocks
            unset($style);
        }
        return $html;
    }

    /**
     * Takes CSS (the stuff found in <style>) and cleans it.
     *
     * Selectors are rebuilt from a restricted subset of the CSS grammar
     * (optionally prefixed with each configured scope), and every
     * declaration is validated against the CSS definition; anything that
     * does not validate is dropped.
     *
     * @warning Requires CSSTidy <http://csstidy.sourceforge.net/>
     * @param string $css CSS styling to clean
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @throws HTMLPurifier_Exception
     * @return string Cleaned CSS
     */
    public function cleanCSS($css, $config, $context)
    {
        // prepare scope
        $scope = $config->get('Filter.ExtractStyleBlocks.Scope');
        if ($scope !== null) {
            $scopes = array_map('trim', explode(',', $scope));
        } else {
            $scopes = array();
        }
        // remove comments from CSS
        $css = trim($css);
        if (strncmp('<!--', $css, 4) === 0) {
            $css = substr($css, 4);
        }
        if (strlen($css) > 3 && substr($css, -3) == '-->') {
            $css = substr($css, 0, -3);
        }
        $css = trim($css);
        set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler');
        try {
            $this->_tidy->parse($css);
        } catch (Exception $e) {
            // Do not leave the muting handler installed if the parser
            // bails out. (catch/rethrow rather than finally to stay
            // compatible with the old PHP versions this file targets.)
            restore_error_handler();
            throw $e;
        }
        restore_error_handler();
        $css_definition = $config->getDefinition('CSS');
        $html_definition = $config->getDefinition('HTML');
        $new_css = array();
        foreach ($this->_tidy->css as $k => $decls) {
            // $decls are all CSS declarations inside an @ selector
            $new_decls = array();
            foreach ($decls as $selector => $style) {
                $selector = trim($selector);
                if ($selector === '') {
                    continue;
                } // should not happen
                // Parse the selector
                // Here is the relevant part of the CSS grammar:
                //
                // ruleset
                //   : selector [ ',' S* selector ]* '{' ...
                // selector
                //   : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
                // combinator
                //   : '+' S*
                //   | '>' S*
                // simple_selector
                //   : element_name [ HASH | class | attrib | pseudo ]*
                //   | [ HASH | class | attrib | pseudo ]+
                // element_name
                //   : IDENT | '*'
                //   ;
                // class
                //   : '.' IDENT
                //   ;
                // attrib
                //   : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
                //     [ IDENT | STRING ] S* ]? ']'
                //   ;
                // pseudo
                //   : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
                //   ;
                //
                // For reference, here are the relevant tokens:
                //
                // HASH         #{name}
                // IDENT        {ident}
                // INCLUDES     ~=
                // DASHMATCH    |=
                // STRING       {string}
                // FUNCTION     {ident}\(
                //
                // And the lexical scanner tokens
                //
                // name         {nmchar}+
                // nmchar       [_a-z0-9-]|{nonascii}|{escape}
                // nonascii     [\240-\377]
                // escape       {unicode}|\\[^\r\n\f0-9a-f]
                // unicode      \\{h}{1,6}(\r\n|[ \t\r\n\f])?
                // ident        -?{nmstart}{nmchar}*
                // nmstart      [_a-z]|{nonascii}|{escape}
                // string       {string1}|{string2}
                // string1      \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
                // string2      \'([^\n\r\f\\']|\\{nl}|{escape})*\'
                //
                // We'll implement a subset (in order to reduce attack
                // surface); in particular:
                //
                //      - No Unicode support
                //      - No escapes support
                //      - No string support (by proxy no attrib support)
                //      - element_name is matched against allowed
                //        elements (some people might find this
                //        annoying...)
                //      - Pseudo-classes one of :first-child, :link,
                //        :visited, :active, :hover, :focus

                // handle ruleset
                $selectors = array_map('trim', explode(',', $selector));
                $new_selectors = array();
                foreach ($selectors as $sel) {
                    // split on +, > and spaces
                    $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);
                    // even indices are chunks, odd indices are
                    // delimiters
                    $nsel = null;
                    $delim = null; // guaranteed to be non-null after
                    // two loop iterations
                    for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
                        $x = $basic_selectors[$i];
                        if ($i % 2) {
                            // delimiter
                            if ($x === ' ') {
                                $delim = ' ';
                            } else {
                                $delim = ' ' . $x . ' ';
                            }
                        } else {
                            // simple selector
                            $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
                            $sdelim = null;
                            $nx = null;
                            for ($j = 0, $cc = count($components); $j < $cc; $j++) {
                                $y = $components[$j];
                                if ($j === 0) {
                                    // leading chunk is the element name:
                                    // '*' or a (lowercased) element known
                                    // to the HTML definition
                                    if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
                                        $nx = $y;
                                    } else {
                                        // $nx stays null; this matters
                                        // if we don't manage to find
                                        // any valid selector content,
                                        // in which case we ignore the
                                        // outer $delim
                                    }
                                } elseif ($j % 2) {
                                    // set delimiter
                                    $sdelim = $y;
                                } else {
                                    $attrdef = null;
                                    if ($sdelim === '#') {
                                        $attrdef = $this->_id_attrdef;
                                    } elseif ($sdelim === '.') {
                                        $attrdef = $this->_class_attrdef;
                                    } elseif ($sdelim === ':') {
                                        $attrdef = $this->_enum_attrdef;
                                    } else {
                                        throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
                                    }
                                    $r = $attrdef->validate($y, $config, $context);
                                    if ($r !== false) {
                                        // a validator may hand back a
                                        // normalized replacement value
                                        if ($r !== true) {
                                            $y = $r;
                                        }
                                        if ($nx === null) {
                                            $nx = '';
                                        }
                                        $nx .= $sdelim . $y;
                                    }
                                }
                            }
                            if ($nx !== null) {
                                if ($nsel === null) {
                                    $nsel = $nx;
                                } else {
                                    $nsel .= $delim . $nx;
                                }
                            } else {
                                // delimiters to the left of invalid
                                // basic selector ignored
                            }
                        }
                    }
                    if ($nsel !== null) {
                        if (!empty($scopes)) {
                            // emit one copy of the selector per scope
                            foreach ($scopes as $s) {
                                $new_selectors[] = "$s $nsel";
                            }
                        } else {
                            $new_selectors[] = $nsel;
                        }
                    }
                }
                if (empty($new_selectors)) {
                    // nothing valid survived: drop the whole rule
                    continue;
                }
                $selector = implode(', ', $new_selectors);
                foreach ($style as $name => $value) {
                    if (!isset($css_definition->info[$name])) {
                        unset($style[$name]);
                        continue;
                    }
                    $def = $css_definition->info[$name];
                    $ret = $def->validate($value, $config, $context);
                    if ($ret === false) {
                        unset($style[$name]);
                    } else {
                        $style[$name] = $ret;
                    }
                }
                $new_decls[$selector] = $style;
            }
            $new_css[$k] = $new_decls;
        }
        // remove stuff that shouldn't be used, could be reenabled
        // after security risks are analyzed
        $this->_tidy->css = $new_css;
        $this->_tidy->import = array();
        $this->_tidy->charset = null;
        $this->_tidy->namespace = null;
        $css = $this->_tidy->print->plain();
        // we are going to escape any special characters <>& to ensure
        // that no funny business occurs (i.e. </style> in a font-family prop).
        if ($config->get('Filter.ExtractStyleBlocks.Escaping')) {
            $css = str_replace(
                array('<', '>', '&'),
                array('\3C ', '\3E ', '\26 '),
                $css
            );
        }
        return $css;
    }
}

// vim: et sw=4 sts=4