File indexing completed on 2024-12-22 05:36:21
<?php

// if want to implement error collecting here, we'll need to use some sort
// of global data (probably trigger_error) because it's impossible to pass
// $config or $context to the callback functions.

/**
 * Handles referencing and dereferencing character entities
 */
class HTMLPurifier_EntityParser
{

    /**
     * Reference to entity lookup table. Populated lazily, the first time a
     * named entity has to be resolved.
     * @type HTMLPurifier_EntityLookup
     */
    protected $_entity_lookup;

    /**
     * Callback regex string for entities in text.
     * @type string
     */
    protected $_textEntitiesRegex;

    /**
     * Callback regex string for entities in attributes.
     * @type string
     */
    protected $_attrEntitiesRegex;

    /**
     * Regex matching an ampersand followed by one of the entity names whose
     * trailing semicolon is optional. It carries three empty capture groups
     * so that the name lands in group 4, the same slot it occupies in the
     * text and attribute regexes.
     * @type string
     */
    protected $_semiOptionalPrefixRegex;

    /**
     * Builds the regexes used by substituteTextEntities() and
     * substituteAttrEntities().
     */
    public function __construct()
    {
        // From
        // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
        $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";

        // NB: three empty captures to put the fourth match in the right
        // place
        $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";

        $this->_textEntitiesRegex =
            '/&(?:'.
            // hex
            '[#]x([a-fA-F0-9]+);?|'.
            // dec
            '[#]0*(\d+);?|'.
            // string (mandatory semicolon)
            // NB: order matters: match semicolon preferentially
            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
            // string (optional semicolon)
            "($semi_optional)".
            ')/';

        $this->_attrEntitiesRegex =
            '/&(?:'.
            // hex
            '[#]x([a-fA-F0-9]+);?|'.
            // dec
            '[#]0*(\d+);?|'.
            // string (mandatory semicolon)
            // NB: order matters: match semicolon preferentially
            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
            // string (optional semicolon)
            // don't match if the next character is '=', ';' or
            // alphanumeric (URL like)
            "($semi_optional)(?![=;A-Za-z0-9])".
            ')/';
    }

    /**
     * Substitute entities with the parsed equivalents. Use this on
     * textual data in an HTML document (as opposed to attributes.)
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteTextEntities($string)
    {
        return preg_replace_callback(
            $this->_textEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Substitute entities with the parsed equivalents. Use this on
     * attribute contents in documents.
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteAttrEntities($string)
    {
        return preg_replace_callback(
            $this->_attrEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteTextEntities() and
     * substituteAttrEntities() that does the work.
     *
     * @param array $matches PCRE matches array, with 0 the entire match, and
     *                  one of index 1 (hex value), 2 (dec value),
     *                  3 (name terminated by a semicolon) or
     *                  4 (semicolon-optional name) non-empty.
     * @return string Replacement string.
     */
    protected function entityCallback($matches)
    {
        $entity = $matches[0];

        // PCRE drops trailing capture groups that did not participate in
        // the match, so every optional index is tested with isset() rather
        // than read blindly.
        $hex_part = isset($matches[1]) ? $matches[1] : '';
        if ($hex_part !== '') {
            return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
        }

        $dec_part = isset($matches[2]) ? $matches[2] : '';
        if ($dec_part !== '') {
            return HTMLPurifier_Encoder::unichr((int) $dec_part);
        }

        $has_strict_name = !empty($matches[3]);
        if ($has_strict_name) {
            $named_part = $matches[3];
        } else {
            $named_part = isset($matches[4]) ? $matches[4] : '';
        }

        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$named_part])) {
            return $this->_entity_lookup->table[$named_part];
        }

        // exact match didn't match anything, so test if
        // any of the semicolon optional match the prefix.
        // Test that this is an EXACT match is important to
        // prevent infinite loop: the prefix regex only ever fills
        // group 4, so the nested call cannot reach this branch again.
        if ($has_strict_name) {
            return preg_replace_callback(
                $this->_semiOptionalPrefixRegex,
                array($this, 'entityCallback'),
                $entity
            );
        }
        return $entity;
    }

    // LEGACY CODE BELOW

    /**
     * Callback regex string for parsing entities.
     * @type string
     */
    protected $_substituteEntitiesRegex =
        '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
        //     1. hex             2. dec      3. string (XML style)

    /**
     * Decimal to parsed string conversion table for special entities.
     * @type array
     */
    protected $_special_dec2str =
        array(
            34 => '"',
            38 => '&',
            39 => "'",
            60 => '<',
            62 => '>'
        );

    /**
     * Stripped entity names to decimal conversion table for special entities.
     * @type array
     */
    protected $_special_ent2dec =
        array(
            'quot' => 34,
            'amp'  => 38,
            'lt'   => 60,
            'gt'   => 62
        );

    /**
     * Substitutes non-special entities with their parsed equivalents. Since
     * running this whenever you have parsed character is t3h 5uck, we run
     * it before everything else.
     *
     * @param string $string String to have non-special entities parsed.
     * @return string Parsed string.
     */
    public function substituteNonSpecialEntities($string)
    {
        // it will try to detect missing semicolons, but don't rely on it
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'nonSpecialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteNonSpecialEntities() that does the work.
     *
     * Numeric references whose code point is a key of $_special_dec2str and
     * names that are keys of $_special_ent2dec are returned untouched;
     * everything else is decoded where possible.
     *
     * @param array $matches PCRE matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @return string Replacement string.
     */
    protected function nonSpecialEntityCallback($matches)
    {
        $entity = $matches[0];
        $is_num = isset($entity[1]) && $entity[1] === '#';
        if ($is_num) {
            $is_hex = isset($entity[2]) && $entity[2] === 'x';
            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
            // abort for special characters
            if (isset($this->_special_dec2str[$code])) {
                return $entity;
            }
            return HTMLPurifier_Encoder::unichr($code);
        }

        $name = $matches[3];
        if (isset($this->_special_ent2dec[$name])) {
            return $entity;
        }
        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$name])) {
            return $this->_entity_lookup->table[$name];
        }
        return $entity;
    }

    /**
     * Substitutes only special entities with their parsed equivalents.
     *
     * @notice We try to avoid calling this function because otherwise, it
     * would have to be called a lot (for every parsed section).
     *
     * @param string $string String to have special entities parsed.
     * @return string Parsed string.
     */
    public function substituteSpecialEntities($string)
    {
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'specialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteSpecialEntities() that does the work.
     *
     * This callback has same syntax as nonSpecialEntityCallback().
     *
     * @param array $matches PCRE-style matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @return string Replacement string.
     */
    protected function specialEntityCallback($matches)
    {
        $entity = $matches[0];
        $is_num = isset($entity[1]) && $entity[1] === '#';
        if ($is_num) {
            $is_hex = isset($entity[2]) && $entity[2] === 'x';
            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
            if (isset($this->_special_dec2str[$int])) {
                return $this->_special_dec2str[$int];
            }
            return $entity;
        }

        $name = $matches[3];
        if (isset($this->_special_ent2dec[$name])) {
            return $this->_special_dec2str[$this->_special_ent2dec[$name]];
        }
        return $entity;
    }
}

// vim: et sw=4 sts=4