File indexing completed on 2024-06-23 05:55:39

0001 <?php
0002 /**
0003  * Zend Framework
0004  *
0005  * LICENSE
0006  *
0007  * This source file is subject to the new BSD license that is bundled
0008  * with this package in the file LICENSE.txt.
0009  * It is also available through the world-wide-web at this URL:
0010  * http://framework.zend.com/license/new-bsd
0011  * If you did not receive a copy of the license and are unable to
0012  * obtain it through the world-wide-web, please send an email
0013  * to license@zend.com so we can send you a copy immediately.
0014  *
0015  * @category   Zend
0016  * @package    Zend_Search_Lucene
0017  * @subpackage Search
0018  * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
0019  * @license    http://framework.zend.com/license/new-bsd     New BSD License
0020  * @version    $Id$
0021  */
0022 
0023 
0024 /**
0025  * @category   Zend
0026  * @package    Zend_Search_Lucene
0027  * @subpackage Search
0028  * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
0029  * @license    http://framework.zend.com/license/new-bsd     New BSD License
0030  */
abstract class Zend_Search_Lucene_Search_Similarity
{
    /**
     * The Similarity implementation used by default.
     *
     * Stays unset until setDefault() is called or getDefault() lazily
     * creates a Zend_Search_Lucene_Search_Similarity_Default instance.
     *
     * @var Zend_Search_Lucene_Search_Similarity
     */
    private static $_defaultImpl;

    /**
     * Cache of decoded bytes.
     *
     * Maps every norm byte (0..255) to the float it represents.  The values
     * are strictly increasing, which is what allows _floatToByte() to binary
     * search this table when encoding.
     *
     * @var array
     */
    private static $_normTable = array( 0   => 0.0,
                                        1   => 5.820766E-10,
                                        2   => 6.9849193E-10,
                                        3   => 8.1490725E-10,
                                        4   => 9.313226E-10,
                                        5   => 1.1641532E-9,
                                        6   => 1.3969839E-9,
                                        7   => 1.6298145E-9,
                                        8   => 1.8626451E-9,
                                        9   => 2.3283064E-9,
                                        10  => 2.7939677E-9,
                                        11  => 3.259629E-9,
                                        12  => 3.7252903E-9,
                                        13  => 4.656613E-9,
                                        14  => 5.5879354E-9,
                                        15  => 6.519258E-9,
                                        16  => 7.4505806E-9,
                                        17  => 9.313226E-9,
                                        18  => 1.1175871E-8,
                                        19  => 1.3038516E-8,
                                        20  => 1.4901161E-8,
                                        21  => 1.8626451E-8,
                                        22  => 2.2351742E-8,
                                        23  => 2.6077032E-8,
                                        24  => 2.9802322E-8,
                                        25  => 3.7252903E-8,
                                        26  => 4.4703484E-8,
                                        27  => 5.2154064E-8,
                                        28  => 5.9604645E-8,
                                        29  => 7.4505806E-8,
                                        30  => 8.940697E-8,
                                        31  => 1.0430813E-7,
                                        32  => 1.1920929E-7,
                                        33  => 1.4901161E-7,
                                        34  => 1.7881393E-7,
                                        35  => 2.0861626E-7,
                                        36  => 2.3841858E-7,
                                        37  => 2.9802322E-7,
                                        38  => 3.5762787E-7,
                                        39  => 4.172325E-7,
                                        40  => 4.7683716E-7,
                                        41  => 5.9604645E-7,
                                        42  => 7.1525574E-7,
                                        43  => 8.34465E-7,
                                        44  => 9.536743E-7,
                                        45  => 1.1920929E-6,
                                        46  => 1.4305115E-6,
                                        47  => 1.66893E-6,
                                        48  => 1.9073486E-6,
                                        49  => 2.3841858E-6,
                                        50  => 2.861023E-6,
                                        51  => 3.33786E-6,
                                        52  => 3.8146973E-6,
                                        53  => 4.7683716E-6,
                                        54  => 5.722046E-6,
                                        55  => 6.67572E-6,
                                        56  => 7.6293945E-6,
                                        57  => 9.536743E-6,
                                        58  => 1.1444092E-5,
                                        59  => 1.335144E-5,
                                        60  => 1.5258789E-5,
                                        61  => 1.9073486E-5,
                                        62  => 2.2888184E-5,
                                        63  => 2.670288E-5,
                                        64  => 3.0517578E-5,
                                        65  => 3.8146973E-5,
                                        66  => 4.5776367E-5,
                                        67  => 5.340576E-5,
                                        68  => 6.1035156E-5,
                                        69  => 7.6293945E-5,
                                        70  => 9.1552734E-5,
                                        71  => 1.0681152E-4,
                                        72  => 1.2207031E-4,
                                        73  => 1.5258789E-4,
                                        74  => 1.8310547E-4,
                                        75  => 2.1362305E-4,
                                        76  => 2.4414062E-4,
                                        77  => 3.0517578E-4,
                                        78  => 3.6621094E-4,
                                        79  => 4.272461E-4,
                                        80  => 4.8828125E-4,
                                        81  => 6.1035156E-4,
                                        82  => 7.324219E-4,
                                        83  => 8.544922E-4,
                                        84  => 9.765625E-4,
                                        85  => 0.0012207031,
                                        86  => 0.0014648438,
                                        87  => 0.0017089844,
                                        88  => 0.001953125,
                                        89  => 0.0024414062,
                                        90  => 0.0029296875,
                                        91  => 0.0034179688,
                                        92  => 0.00390625,
                                        93  => 0.0048828125,
                                        94  => 0.005859375,
                                        95  => 0.0068359375,
                                        96  => 0.0078125,
                                        97  => 0.009765625,
                                        98  => 0.01171875,
                                        99  => 0.013671875,
                                        100 => 0.015625,
                                        101 => 0.01953125,
                                        102 => 0.0234375,
                                        103 => 0.02734375,
                                        104 => 0.03125,
                                        105 => 0.0390625,
                                        106 => 0.046875,
                                        107 => 0.0546875,
                                        108 => 0.0625,
                                        109 => 0.078125,
                                        110 => 0.09375,
                                        111 => 0.109375,
                                        112 => 0.125,
                                        113 => 0.15625,
                                        114 => 0.1875,
                                        115 => 0.21875,
                                        116 => 0.25,
                                        117 => 0.3125,
                                        118 => 0.375,
                                        119 => 0.4375,
                                        120 => 0.5,
                                        121 => 0.625,
                                        122 => 0.75,
                                        123 => 0.875,
                                        124 => 1.0,
                                        125 => 1.25,
                                        126 => 1.5,
                                        127 => 1.75,
                                        128 => 2.0,
                                        129 => 2.5,
                                        130 => 3.0,
                                        131 => 3.5,
                                        132 => 4.0,
                                        133 => 5.0,
                                        134 => 6.0,
                                        135 => 7.0,
                                        136 => 8.0,
                                        137 => 10.0,
                                        138 => 12.0,
                                        139 => 14.0,
                                        140 => 16.0,
                                        141 => 20.0,
                                        142 => 24.0,
                                        143 => 28.0,
                                        144 => 32.0,
                                        145 => 40.0,
                                        146 => 48.0,
                                        147 => 56.0,
                                        148 => 64.0,
                                        149 => 80.0,
                                        150 => 96.0,
                                        151 => 112.0,
                                        152 => 128.0,
                                        153 => 160.0,
                                        154 => 192.0,
                                        155 => 224.0,
                                        156 => 256.0,
                                        157 => 320.0,
                                        158 => 384.0,
                                        159 => 448.0,
                                        160 => 512.0,
                                        161 => 640.0,
                                        162 => 768.0,
                                        163 => 896.0,
                                        164 => 1024.0,
                                        165 => 1280.0,
                                        166 => 1536.0,
                                        167 => 1792.0,
                                        168 => 2048.0,
                                        169 => 2560.0,
                                        170 => 3072.0,
                                        171 => 3584.0,
                                        172 => 4096.0,
                                        173 => 5120.0,
                                        174 => 6144.0,
                                        175 => 7168.0,
                                        176 => 8192.0,
                                        177 => 10240.0,
                                        178 => 12288.0,
                                        179 => 14336.0,
                                        180 => 16384.0,
                                        181 => 20480.0,
                                        182 => 24576.0,
                                        183 => 28672.0,
                                        184 => 32768.0,
                                        185 => 40960.0,
                                        186 => 49152.0,
                                        187 => 57344.0,
                                        188 => 65536.0,
                                        189 => 81920.0,
                                        190 => 98304.0,
                                        191 => 114688.0,
                                        192 => 131072.0,
                                        193 => 163840.0,
                                        194 => 196608.0,
                                        195 => 229376.0,
                                        196 => 262144.0,
                                        197 => 327680.0,
                                        198 => 393216.0,
                                        199 => 458752.0,
                                        200 => 524288.0,
                                        201 => 655360.0,
                                        202 => 786432.0,
                                        203 => 917504.0,
                                        204 => 1048576.0,
                                        205 => 1310720.0,
                                        206 => 1572864.0,
                                        207 => 1835008.0,
                                        208 => 2097152.0,
                                        209 => 2621440.0,
                                        210 => 3145728.0,
                                        211 => 3670016.0,
                                        212 => 4194304.0,
                                        213 => 5242880.0,
                                        214 => 6291456.0,
                                        215 => 7340032.0,
                                        216 => 8388608.0,
                                        217 => 1.048576E7,
                                        218 => 1.2582912E7,
                                        219 => 1.4680064E7,
                                        220 => 1.6777216E7,
                                        221 => 2.097152E7,
                                        222 => 2.5165824E7,
                                        223 => 2.9360128E7,
                                        224 => 3.3554432E7,
                                        225 => 4.194304E7,
                                        226 => 5.0331648E7,
                                        227 => 5.8720256E7,
                                        228 => 6.7108864E7,
                                        229 => 8.388608E7,
                                        230 => 1.00663296E8,
                                        231 => 1.17440512E8,
                                        232 => 1.34217728E8,
                                        233 => 1.6777216E8,
                                        234 => 2.01326592E8,
                                        235 => 2.34881024E8,
                                        236 => 2.68435456E8,
                                        237 => 3.3554432E8,
                                        238 => 4.02653184E8,
                                        239 => 4.69762048E8,
                                        240 => 5.3687091E8,
                                        241 => 6.7108864E8,
                                        242 => 8.0530637E8,
                                        243 => 9.395241E8,
                                        244 => 1.07374182E9,
                                        245 => 1.34217728E9,
                                        246 => 1.61061274E9,
                                        247 => 1.87904819E9,
                                        248 => 2.14748365E9,
                                        249 => 2.68435456E9,
                                        250 => 3.22122547E9,
                                        251 => 3.75809638E9,
                                        252 => 4.2949673E9,
                                        253 => 5.3687091E9,
                                        254 => 6.4424509E9,
                                        255 => 7.5161928E9 );


    /**
     * Set the default Similarity implementation used by indexing and search
     * code.
     *
     * @param Zend_Search_Lucene_Search_Similarity $similarity
     */
    public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
    {
        self::$_defaultImpl = $similarity;
    }


    /**
     * Return the default Similarity implementation used by indexing and search
     * code.
     *
     * If no implementation has been registered through setDefault(), a
     * Zend_Search_Lucene_Search_Similarity_Default instance is created,
     * remembered and returned.
     *
     * @return Zend_Search_Lucene_Search_Similarity
     */
    public static function getDefault()
    {
        if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
            // require_once 'Zend/Search/Lucene/Search/Similarity/Default.php';
            self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();
        }

        return self::$_defaultImpl;
    }


    /**
     * Computes the normalization value for a field given the total number of
     * terms contained in a field.  These values, together with field boosts, are
     * stored in an index and multiplied into scores for hits on each field by the
     * search code.
     *
     * Matches in longer fields are less precise, so implementations of this
     * method usually return smaller values when 'numTokens' is large,
     * and larger values when 'numTokens' is small.
     *
     * Note that these values are computed under
     * IndexWriter::addDocument(Document) and then stored using
     * encodeNorm(float).  Thus they have limited precision, and documents
     * must be re-indexed if this method is altered.
     *
     * fieldName - name of field
     * numTokens - the total number of tokens contained in fields named
     *             'fieldName' of 'doc'.
     * Returns a normalization factor for hits on this field of this document
     *
     * @param string $fieldName
     * @param integer $numTokens
     * @return float
     */
    abstract public function lengthNorm($fieldName, $numTokens);

    /**
     * Computes the normalization value for a query given the sum of the squared
     * weights of each of the query terms.  This value is then multiplied into the
     * weight of each query term.
     *
     * This does not affect ranking, but rather just attempts to make scores
     * from different queries comparable.
     *
     * sumOfSquaredWeights - the sum of the squares of query term weights
     * Returns a normalization factor for query weights
     *
     * @param float $sumOfSquaredWeights
     * @return float
     */
    abstract public function queryNorm($sumOfSquaredWeights);


    /**
     * Decodes a normalization factor stored in an index.
     *
     * Only the low eight bits of $byte are used, so the lookup always lands
     * inside the 256-entry norm table.
     *
     * @param integer $byte
     * @return float
     */
    public static function decodeNorm($byte)
    {
        return self::$_normTable[$byte & 0xFF];
    }


    /**
     * Encodes a normalization factor for storage in an index.
     *
     * The encoding uses a five-bit exponent and three-bit mantissa, thus
     * representing values from around 7x10^9 down to around 6x10^-10 with
     * about one significant decimal digit of accuracy.  Zero is also
     * represented.  Negative numbers are rounded up to zero.  Values too large
     * to represent are rounded down to the largest representable value.
     * Positive values too small to represent are rounded up to the smallest
     * positive representable value.
     *
     * @param float $f
     * @return integer
     */
    public static function encodeNorm($f)
    {
        return self::_floatToByte($f);
    }

    /**
     * Float to byte conversion
     *
     * Finds the norm table index whose value is closest to $f:
     *  - zero and negative input yields 0;
     *  - an exact table value yields its own index;
     *  - positive input below the smallest positive table value yields 1,
     *    so that a non-zero norm is never stored as zero;
     *  - input above the largest table value yields 255;
     *  - anything else yields the nearer of the two neighbouring indexes
     *    (the lower one on a tie).
     *
     * NOTE(review): NAN compares neither below nor above any table value, so
     * it leaves the search through the exact-match branch on the first probe
     * (index 127).  Callers are assumed never to pass NAN - confirm.
     *
     * @param float $f
     * @return integer
     */
    private static function _floatToByte($f)
    {
        // round negatives up to zero
        if ($f <= 0.0) {
            return 0;
        }

        // binary search over the strictly increasing norm table
        $lowIndex  = 0;
        $highIndex = 255;
        while ($highIndex >= $lowIndex) {
            $mid   = ($highIndex + $lowIndex) >> 1;
            $delta = $f - self::$_normTable[$mid];

            if ($delta < 0) {
                $highIndex = $mid - 1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid + 1;
            } else {
                return $mid; // exact table value
            }
        }

        // No exact match: $highIndex is now the last index whose value is
        // below $f.  It cannot be negative because $f > 0 = table[0].

        // 0 < $f < smallest positive value: round up to the smallest positive
        // value rather than letting the nearest-neighbour rule pick zero.
        if ($highIndex === 0) {
            return 1;
        }

        // $f is above the largest value: clamp to it.
        if ($highIndex === 255) {
            return 255;
        }

        // round to closest value; a tie stays on the lower index
        $distanceBelow = $f - self::$_normTable[$highIndex];
        $distanceAbove = self::$_normTable[$highIndex + 1] - $f;

        return ($distanceBelow > $distanceAbove) ? $highIndex + 1 : $highIndex;
    }


    /**
     * Computes a score factor based on a term or phrase's frequency in a
     * document.  This value is multiplied by the idf(Term, Searcher)
     * factor for each term in the query and these products are then summed to
     * form the initial score for a document.
     *
     * Terms and phrases repeated in a document indicate the topic of the
     * document, so implementations of this method usually return larger values
     * when 'freq' is large, and smaller values when 'freq'
     * is small.
     *
     * freq - the frequency of a term within a document
     * Returns a score factor based on a term's within-document frequency
     *
     * @param float $freq
     * @return float
     */
    abstract public function tf($freq);

    /**
     * Computes the amount of a sloppy phrase match, based on an edit distance.
     * This value is summed for each sloppy phrase match in a document to form
     * the frequency that is passed to tf(float).
     *
     * A phrase match with a small edit distance to a document passage more
     * closely matches the document, so implementations of this method usually
     * return larger values when the edit distance is small and smaller values
     * when it is large.
     *
     * distance - the edit distance of this sloppy phrase match
     * Returns the frequency increment for this match
     *
     * @param integer $distance
     * @return float
     */
    abstract public function sloppyFreq($distance);


    /**
     * Computes a score factor for a simple term or a phrase.
     *
     * For a single term this is
     *   idfFreq($reader->docFreq($input), $reader->count())
     * and for an array of terms it is the sum of that expression over every
     * term in the array (0.0 for an empty array).
     *
     * input - the term in question or array of terms
     * reader - reader of the document collection being searched
     * Returns a score factor for the term
     *
     * @param mixed $input
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function idf($input, Zend_Search_Lucene_Interface $reader)
    {
        // Single term: nothing to accumulate.
        if (!is_array($input)) {
            return $this->idfFreq($reader->docFreq($input), $reader->count());
        }

        // Phrase: add up the per-term factors.
        $idf = 0.0;
        foreach ($input as $term) {
            $idf += $this->idfFreq($reader->docFreq($term), $reader->count());
        }

        return $idf;
    }

    /**
     * Computes a score factor based on a term's document frequency (the number
     * of documents which contain the term).  This value is multiplied by the
     * tf(int) factor for each term in the query and these products are
     * then summed to form the initial score for a document.
     *
     * Terms that occur in fewer documents are better indicators of topic, so
     * implementations of this method usually return larger values for rare terms,
     * and smaller values for common terms.
     *
     * docFreq - the number of documents which contain the term
     * numDocs - the total number of documents in the collection
     * Returns a score factor based on the term's document frequency
     *
     * @param integer $docFreq
     * @param integer $numDocs
     * @return float
     */
    abstract public function idfFreq($docFreq, $numDocs);

    /**
     * Computes a score factor based on the fraction of all query terms that a
     * document contains.  This value is multiplied into scores.
     *
     * The presence of a large portion of the query terms indicates a better
     * match with the query, so implementations of this method usually return
     * larger values when the ratio between these parameters is large and smaller
     * values when the ratio between them is small.
     *
     * overlap - the number of query terms matched in the document
     * maxOverlap - the total number of terms in the query
     * Returns a score factor based on term overlap with the query
     *
     * @param integer $overlap
     * @param integer $maxOverlap
     * @return float
     */
    abstract public function coord($overlap, $maxOverlap);
}
0551