File indexing completed on 2024-12-22 05:36:21

0001 <?php
0002 
0003 // If we want to implement error collecting here, we'll need to use some sort
0004 // of global data (probably trigger_error) because it's impossible to pass
0005 // $config or $context to the callback functions.
0006 
/**
 * Handles referencing and dereferencing character entities.
 *
 * The class offers two generations of API:
 *  - substituteTextEntities() / substituteAttrEntities(), which decode every
 *    recognised entity and tolerate a missing semicolon on a fixed list of
 *    entity names;
 *  - the legacy substituteNonSpecialEntities() / substituteSpecialEntities()
 *    pair, which split the work between the five "special" characters
 *    (", &, ', <, >) and everything else.
 */
class HTMLPurifier_EntityParser
{

    /**
     * Reference to entity lookup table. Populated lazily the first time a
     * named entity has to be resolved.
     * @type HTMLPurifier_EntityLookup
     */
    protected $_entity_lookup;

    /**
     * Callback regex string for entities in text.
     * @type string
     */
    protected $_textEntitiesRegex;

    /**
     * Callback regex string for entities in attributes.
     * @type string
     */
    protected $_attrEntitiesRegex;

    /**
     * Regex matching an ampersand followed by one of the entity names whose
     * trailing semicolon is optional. It carries three empty capture groups
     * so that the name lands in index 4, the same index the text/attribute
     * regexes use for semicolon-optional names.
     * @type string
     */
    protected $_semiOptionalPrefixRegex;

    /**
     * Builds the three regexes used by the text/attribute entity substitution.
     */
    public function __construct()
    {
        // From
        // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
        $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";

        // NB: three empty captures to put the fourth match in the right
        // place
        $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";

        $this->_textEntitiesRegex =
            '/&(?:'.
            // hex
            '[#]x([a-fA-F0-9]+);?|'.
            // dec
            '[#]0*(\d+);?|'.
            // string (mandatory semicolon)
            // NB: order matters: match semicolon preferentially
            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
            // string (optional semicolon)
            "($semi_optional)".
            ')/';

        $this->_attrEntitiesRegex =
            '/&(?:'.
            // hex
            '[#]x([a-fA-F0-9]+);?|'.
            // dec
            '[#]0*(\d+);?|'.
            // string (mandatory semicolon)
            // NB: order matters: match semicolon preferentially
            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
            // string (optional semicolon)
            // don't match if trailing is equals or alphanumeric (URL
            // like)
            "($semi_optional)(?![=;A-Za-z0-9])".
            ')/';
    }

    /**
     * Substitute entities with the parsed equivalents.  Use this on
     * textual data in an HTML document (as opposed to attributes.)
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteTextEntities($string)
    {
        return preg_replace_callback(
            $this->_textEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Substitute entities with the parsed equivalents.  Use this on
     * attribute contents in documents.
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteAttrEntities($string)
    {
        return preg_replace_callback(
            $this->_attrEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteTextEntities() and
     * substituteAttrEntities() that does the work.
     *
     * @param array $matches  PCRE matches array, with 0 the entire match, and
     *                  one of: index 1 set with a hex value, index 2 with a
     *                  dec value, index 3 with a name that was followed by
     *                  a semicolon, or index 4 with a semicolon-optional
     *                  name. Trailing groups that did not take part in the
     *                  match may be absent from the array.
     * @return string Replacement string.
     */
    protected function entityCallback($matches)
    {
        $entity = $matches[0];
        // PCRE drops trailing unmatched groups, so test for presence
        // explicitly rather than silencing the warning with "@".
        $hex_part = isset($matches[1]) ? $matches[1] : '';
        $dec_part = isset($matches[2]) ? $matches[2] : '';
        if (!empty($matches[3])) {
            $named_part = $matches[3];
        } elseif (isset($matches[4])) {
            $named_part = $matches[4];
        } else {
            $named_part = '';
        }

        if ($hex_part !== '') {
            return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
        }
        if ($dec_part !== '') {
            return HTMLPurifier_Encoder::unichr((int) $dec_part);
        }

        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$named_part])) {
            return $this->_entity_lookup->table[$named_part];
        }

        // exact match didn't match anything, so test if
        // any of the semicolon optional match the prefix.
        // Test that this is an EXACT match is important to
        // prevent infinite loop: the retry reports its name in group 4
        // (group 3 is empty), so it can never reach this branch again.
        if (!empty($matches[3])) {
            return preg_replace_callback(
                $this->_semiOptionalPrefixRegex,
                array($this, 'entityCallback'),
                $entity
            );
        }
        return $entity;
    }

    // LEGACY CODE BELOW

    /**
     * Callback regex string for parsing entities.
     * @type string
     */
    protected $_substituteEntitiesRegex =
        '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
        //     1. hex             2. dec      3. string (XML style)

    /**
     * Decimal to parsed string conversion table for special entities.
     * @type array
     */
    protected $_special_dec2str =
            array(
                    34 => '"',
                    38 => '&',
                    39 => "'",
                    60 => '<',
                    62 => '>'
            );

    /**
     * Stripped entity names to decimal conversion table for special entities.
     * @type array
     */
    protected $_special_ent2dec =
            array(
                    'quot' => 34,
                    'amp'  => 38,
                    'lt'   => 60,
                    'gt'   => 62
            );

    /**
     * Substitutes non-special entities with their parsed equivalents. Since
     * running this whenever you have parsed character is t3h 5uck, we run
     * it before everything else.
     *
     * @param string $string String to have non-special entities parsed.
     * @return string Parsed string.
     */
    public function substituteNonSpecialEntities($string)
    {
        // it will try to detect missing semicolons, but don't rely on it
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'nonSpecialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteNonSpecialEntities() that does the work.
     *
     * @param array $matches  PCRE matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @return string Replacement string.
     */
    protected function nonSpecialEntityCallback($matches)
    {
        // replaces all but big five
        $entity = $matches[0];
        $is_num = (isset($entity[1]) && $entity[1] === '#');
        if ($is_num) {
            $is_hex = (isset($entity[2]) && $entity[2] === 'x');
            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
            // abort for special characters
            if (isset($this->_special_dec2str[$code])) {
                return $entity;
            }
            return HTMLPurifier_Encoder::unichr($code);
        }

        // named entity: leave the special ones untouched
        if (isset($this->_special_ent2dec[$matches[3]])) {
            return $entity;
        }
        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$matches[3]])) {
            return $this->_entity_lookup->table[$matches[3]];
        }
        // unknown name: pass the original text through
        return $entity;
    }

    /**
     * Substitutes only special entities with their parsed equivalents.
     *
     * @notice We try to avoid calling this function because otherwise, it
     * would have to be called a lot (for every parsed section).
     *
     * @param string $string String to have special entities parsed.
     * @return string Parsed string.
     */
    public function substituteSpecialEntities($string)
    {
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'specialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteSpecialEntities() that does the work.
     *
     * This callback has same syntax as nonSpecialEntityCallback().
     *
     * @param array $matches  PCRE-style matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @return string Replacement string.
     */
    protected function specialEntityCallback($matches)
    {
        $entity = $matches[0];
        $is_num = (isset($entity[1]) && $entity[1] === '#');
        if ($is_num) {
            $is_hex = (isset($entity[2]) && $entity[2] === 'x');
            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
            return isset($this->_special_dec2str[$int]) ?
                $this->_special_dec2str[$int] :
                $entity;
        }

        // named entity: map name -> code point -> character
        return isset($this->_special_ent2dec[$matches[3]]) ?
            $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
            $entity;
    }
}
0284 
0285 // vim: et sw=4 sts=4