File indexing completed on 2024-12-22 05:36:22

0001 <?php
0002 
/**
 * Parses a URI into the components and fragment identifier as specified
 * by RFC 3986.
 *
 * Parsing is purely syntactic: the pieces are split apart with regular
 * expressions and handed to HTMLPurifier_URI without being validated.
 */
class HTMLPurifier_URIParser
{

    /**
     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
     */
    protected $percentEncoder;

    /**
     * Creates the parser together with the percent-encoder it uses to
     * normalize every URI before splitting it.
     */
    public function __construct()
    {
        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
    }

    /**
     * Parses a URI.
     * @param $uri string URI to parse
     * @return HTMLPurifier_URI|false Representation of URI, or false when the
     *         top-level pattern does not match. This representation has
     *         not been validated yet and may not conform to RFC.
     */
    public function parse($uri)
    {
        $uri = $this->percentEncoder->normalize($uri);

        // Regexp is as per Appendix B.
        // Note that ["<>] are an addition to the RFC's recommended
        // characters, because they represent external delimiters.
        $r_URI = '!'.
            '(([a-zA-Z0-9\.\+\-]+):)?'. // 2. Scheme
            '(//([^/?#"<>]*))?'. // 4. Authority
            '([^?#"<>]*)'.       // 5. Path
            '(\?([^#"<>]*))?'.   // 7. Query
            '(#([^"<>]*))?'.     // 8. Fragment
            '!';

        $matches = array();
        $result = preg_match($r_URI, $uri, $matches);

        if (!$result) return false; // *really* invalid URI

        // Separate out parts. Each odd-numbered outer group includes its
        // delimiter (":", "//", "?", "#"), so it can never be the string "0";
        // empty() therefore reliably means "component absent" here, and it
        // also tolerates trailing groups that preg_match() left unset.
        $scheme     = !empty($matches[1]) ? $matches[2] : null;
        $authority  = !empty($matches[3]) ? $matches[4] : null;
        $path       = $matches[5]; // always present, can be empty
        $query      = !empty($matches[6]) ? $matches[7] : null;
        $fragment   = !empty($matches[8]) ? $matches[9] : null;

        // further parse authority
        if ($authority !== null) {
            $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
            $matches = array();
            preg_match($r_authority, $authority, $matches);
            // Group 1 carries the trailing "@", so empty() is safe for it.
            $userinfo   = !empty($matches[1]) ? $matches[2] : null;
            // The host group has no delimiter attached, so it may legitimately
            // be the string "0". empty("0") is true in PHP, which would drop
            // that host; test for presence of the capture instead.
            $host       = isset($matches[3]) ? $matches[3] : '';
            // Group 4 carries the leading ":"; a bare ":" with no digits
            // casts to port 0.
            $port       = !empty($matches[4]) ? (int) $matches[5] : null;
        } else {
            // No "//authority" section at all: every sub-component is absent.
            $port = $host = $userinfo = null;
        }

        return new HTMLPurifier_URI(
            $scheme, $userinfo, $host, $port, $path, $query, $fragment);
    }

}
0070 
0071 // vim: et sw=4 sts=4