File indexing completed on 2024-12-22 05:36:21
<?php

/**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 *
 * A lexer parses a string of SGML-style markup and converts them into
 * corresponding tokens.  It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
 * from.
 *
 * A lexer is HTML-oriented: it might work with XML, but it's not
 * recommended, as we adhere to a subset of the specification for optimization
 * reasons. This might change in the future. Also, most tokenizers are not
 * expected to handle DTDs or PIs.
 *
 * This class should not be directly instantiated, but you may use create() to
 * retrieve a default copy of the lexer.  Being a supertype, this class
 * does not actually define any implementation, but offers commonly used
 * convenience functions for subclasses.
 *
 * @note The unit tests will instantiate this class for testing purposes, as
 *       many of the utility functions require a class to be instantiated.
 *       This means that, even though this class is not runnable, it will
 *       not be declared abstract.
 *
 * @par
 *
 * @note
 * We use tokens rather than create a DOM representation because DOM would:
 *
 * @par
 *  -# Require more processing and memory to create,
 *  -# Is not streamable, and
 *  -# Has the entire document structure (html and body not needed).
 *
 * @par
 * However, DOM is helpful in that it makes it easy to move around nodes
 * without a lot of lookaheads to see when a tag is closed. This is a
 * limitation of the token system and some workarounds would be nice.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implements line-number/column-number tracking.
     * If it does, set to true.
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser used by parseData() and normalize() for entities that
     * are not covered by $_special_entity2str. Assigned in the constructor.
     *
     * Declared explicitly so that the constructor does not create a dynamic
     * property (deprecated as of PHP 8.2).
     *
     * @type HTMLPurifier_EntityParser
     */
    protected $_entity_parser;

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
     * a few exceptions involving special features that only DirectLex
     * implements.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @param HTMLPurifier_Config $config Configuration object. Passing a
     *        prototype lexer (or lexer name) instead is deprecated and
     *        raises an E_USER_WARNING.
     * @return HTMLPurifier_Lexer
     * @throws HTMLPurifier_Exception If the lexer name is not recognized,
     *         no lexer could be instantiated, or the chosen lexer cannot
     *         track line numbers although the configuration requires it.
     */
    public static function create($config)
    {
        if (!($config instanceof HTMLPurifier_Config)) {
            $lexer = $config;
            trigger_error(
                "Passing a prototype to
                HTMLPurifier_Lexer::create() is deprecated, please instead
                use %Core.LexerImpl",
                E_USER_WARNING
            );
            // There is no configuration object on this deprecated path, so
            // line tracking cannot have been requested. (Previously get()
            // was called on the prototype itself, which is a fatal error.)
            $needs_tracking = false;
        } else {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        }

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {
            if (is_null($lexer)) {
                do {
                    // auto-detection algorithm
                    if ($needs_tracking) {
                        $lexer = 'DirectLex';
                        break;
                    }

                    if (class_exists('DOMDocument', false) &&
                        method_exists('DOMDocument', 'loadHTML') &&
                        !extension_loaded('domxml')
                    ) {
                        // check for DOM support, because while it's part of the
                        // core, it can be disabled compile time. Also, the PECL
                        // domxml extension overrides the default DOM, and is evil
                        // and nasty and we shan't bother to support it
                        $lexer = 'DOMLex';
                    } else {
                        $lexer = 'DirectLex';
                    }
                } while (0);
            } // do..while so we can break

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($lexer)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception(
                'Cannot use lexer that does not support line numbers with ' .
                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
            );
        }

        return $inst;

    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Sets up the entity parser shared by the convenience methods.
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     * Keys are the escaped forms; values are the literal characters.
     * @type array
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;' => '&',
            '&lt;' => '<',
            '&gt;' => '>',
            '&#39;' => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Parses entities in character data found in a text context.
     *
     * @param string $string String character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseText($string, $config) {
        return $this->parseData($string, false, $config);
    }

    /**
     * Parses entities in character data found in an attribute value.
     *
     * @param string $string String character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseAttr($string, $config) {
        return $this->parseData($string, true, $config);
    }

    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones.
     *
     * @param string $string String character data to be parsed.
     * @param bool $is_attr Whether the data comes from an attribute value
     *        (true) or from text content (false); selects which entity
     *        parser routine handles uncommon entities.
     * @param HTMLPurifier_Config $config Only consulted when the string
     *        contains entities beyond the special-entity table.
     * @return string Parsed character data.
     */
    public function parseData($string, $is_attr, $config)
    {
        // following functions require at least one character
        if ($string === '') {
            return '';
        }

        // subtracts amps that cannot possibly be escaped
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            return $string;
        } // abort if no entities

        // every '&amp;' turns into a bare '&' below; remember how many so
        // those are not mistaken for still-unparsed entities afterwards
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // hmm... now we have some uncommon entities. Use the callback.
        if ($config->get('Core.LegacyEntityDecoder')) {
            $string = $this->_entity_parser->substituteSpecialEntities($string);
        } else {
            if ($is_attr) {
                $string = $this->_entity_parser->substituteAttrEntities($string);
            } else {
                $string = $this->_entity_parser->substituteTextEntities($string);
            }
        }
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an E_USER_ERROR; subclasses are
     * expected to override it.
     *
     * @param $string String HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[] array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special Internet Explorer conditional comments should be removed.
     * @param string $string HTML string to process.
     * @return string HTML with conditional comments removed.
     */
    protected static function removeIEConditional($string)
    {
        return preg_replace(
            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
            '',
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() and escapeCommentedCDATA() that
     * does the work.
     *
     * @warning This method is protected; it is only meant to be invoked as
     *          the preg_replace_callback() callback from within this class.
     *          Calling it directly is not recommended.
     * @param array $matches PCRE matches array, with index 0 the entire match
     *                  and 1 the inside of the CDATA section.
     * @return string Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches)
    {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     * @param string $html HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context)
    {
        // normalize newlines to \n
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", $html);
            $html = str_replace("\r", "\n", $html);
        }

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        $html = $this->removeIEConditional($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            if ($e && $new_html != $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // expand entities that aren't the big five
        if ($config->get('Core.LegacyEntityDecoder')) {
            $html = $this->_entity_parser->substituteNonSpecialEntities($html);
        }

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // if processing instructions are to be removed, remove them now
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        // Aggressive script removal only applies when the input is not
        // trusted, script contents are to be removed, and "script" is listed
        // among the hidden elements.
        // NOTE(review): the pattern has no "s" modifier, so "." does not
        // match newlines and <script> blocks spanning several lines are not
        // matched here -- confirm whether that is intended.
        $hidden_elements = $config->get('Core.HiddenElements');
        if ($config->get('Core.AggressivelyRemoveScript') &&
            !($config->get('HTML.Trusted') || !$config->get('Core.RemoveScriptContents')
            || empty($hidden_elements["script"]))) {
            $html = preg_replace('#<script[^>]*>.*?</script>#i', '', $html);
        }

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its body element, or the input unchanged when no body element is
     * found or the opening body tag appears to sit inside a comment.
     *
     * @param string $html HTML fragment or document.
     * @return string Body contents, or $html itself.
     * @todo Consider making protected
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches);
        if ($result) {
            // Make sure it's not in a comment: the text before <body> must
            // either contain no comment opener, or its last opener must be
            // followed by a closer.
            $comment_start = strrpos($matches[1], '<!--');
            $comment_end   = strrpos($matches[1], '-->');
            if ($comment_start === false ||
                ($comment_end !== false && $comment_end > $comment_start)) {
                return $matches[2];
            }
        }
        return $html;
    }
}

// vim: et sw=4 sts=4