File indexing completed on 2024-06-23 05:55:39
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id$
 */


/**
 * Base class for scoring ("similarity") implementations.
 *
 * Besides declaring the abstract scoring hooks (lengthNorm, queryNorm, tf,
 * sloppyFreq, idfFreq, coord), this class holds the process-wide default
 * implementation and the one-byte encoding/decoding of normalization factors.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
abstract class Zend_Search_Lucene_Search_Similarity
{
    /**
     * The Similarity implementation used by default.
     *
     * Lazily created by getDefault() when nothing was set through setDefault().
     *
     * @var Zend_Search_Lucene_Search_Similarity
     */
    private static $_defaultImpl;

    /**
     * Cache of decoded bytes.
     * Array of floats, indexed by the encoded byte value (0..255).
     *
     * The values are strictly increasing, which is what allows
     * _floatToByte() to binary-search this table.
     *
     * @var array
     */
    private static $_normTable = array(
        0   => 0.0,
        1   => 5.820766E-10,
        2   => 6.9849193E-10,
        3   => 8.1490725E-10,
        4   => 9.313226E-10,
        5   => 1.1641532E-9,
        6   => 1.3969839E-9,
        7   => 1.6298145E-9,
        8   => 1.8626451E-9,
        9   => 2.3283064E-9,
        10  => 2.7939677E-9,
        11  => 3.259629E-9,
        12  => 3.7252903E-9,
        13  => 4.656613E-9,
        14  => 5.5879354E-9,
        15  => 6.519258E-9,
        16  => 7.4505806E-9,
        17  => 9.313226E-9,
        18  => 1.1175871E-8,
        19  => 1.3038516E-8,
        20  => 1.4901161E-8,
        21  => 1.8626451E-8,
        22  => 2.2351742E-8,
        23  => 2.6077032E-8,
        24  => 2.9802322E-8,
        25  => 3.7252903E-8,
        26  => 4.4703484E-8,
        27  => 5.2154064E-8,
        28  => 5.9604645E-8,
        29  => 7.4505806E-8,
        30  => 8.940697E-8,
        31  => 1.0430813E-7,
        32  => 1.1920929E-7,
        33  => 1.4901161E-7,
        34  => 1.7881393E-7,
        35  => 2.0861626E-7,
        36  => 2.3841858E-7,
        37  => 2.9802322E-7,
        38  => 3.5762787E-7,
        39  => 4.172325E-7,
        40  => 4.7683716E-7,
        41  => 5.9604645E-7,
        42  => 7.1525574E-7,
        43  => 8.34465E-7,
        44  => 9.536743E-7,
        45  => 1.1920929E-6,
        46  => 1.4305115E-6,
        47  => 1.66893E-6,
        48  => 1.9073486E-6,
        49  => 2.3841858E-6,
        50  => 2.861023E-6,
        51  => 3.33786E-6,
        52  => 3.8146973E-6,
        53  => 4.7683716E-6,
        54  => 5.722046E-6,
        55  => 6.67572E-6,
        56  => 7.6293945E-6,
        57  => 9.536743E-6,
        58  => 1.1444092E-5,
        59  => 1.335144E-5,
        60  => 1.5258789E-5,
        61  => 1.9073486E-5,
        62  => 2.2888184E-5,
        63  => 2.670288E-5,
        64  => 3.0517578E-5,
        65  => 3.8146973E-5,
        66  => 4.5776367E-5,
        67  => 5.340576E-5,
        68  => 6.1035156E-5,
        69  => 7.6293945E-5,
        70  => 9.1552734E-5,
        71  => 1.0681152E-4,
        72  => 1.2207031E-4,
        73  => 1.5258789E-4,
        74  => 1.8310547E-4,
        75  => 2.1362305E-4,
        76  => 2.4414062E-4,
        77  => 3.0517578E-4,
        78  => 3.6621094E-4,
        79  => 4.272461E-4,
        80  => 4.8828125E-4,
        81  => 6.1035156E-4,
        82  => 7.324219E-4,
        83  => 8.544922E-4,
        84  => 9.765625E-4,
        85  => 0.0012207031,
        86  => 0.0014648438,
        87  => 0.0017089844,
        88  => 0.001953125,
        89  => 0.0024414062,
        90  => 0.0029296875,
        91  => 0.0034179688,
        92  => 0.00390625,
        93  => 0.0048828125,
        94  => 0.005859375,
        95  => 0.0068359375,
        96  => 0.0078125,
        97  => 0.009765625,
        98  => 0.01171875,
        99  => 0.013671875,
        100 => 0.015625,
        101 => 0.01953125,
        102 => 0.0234375,
        103 => 0.02734375,
        104 => 0.03125,
        105 => 0.0390625,
        106 => 0.046875,
        107 => 0.0546875,
        108 => 0.0625,
        109 => 0.078125,
        110 => 0.09375,
        111 => 0.109375,
        112 => 0.125,
        113 => 0.15625,
        114 => 0.1875,
        115 => 0.21875,
        116 => 0.25,
        117 => 0.3125,
        118 => 0.375,
        119 => 0.4375,
        120 => 0.5,
        121 => 0.625,
        122 => 0.75,
        123 => 0.875,
        124 => 1.0,
        125 => 1.25,
        126 => 1.5,
        127 => 1.75,
        128 => 2.0,
        129 => 2.5,
        130 => 3.0,
        131 => 3.5,
        132 => 4.0,
        133 => 5.0,
        134 => 6.0,
        135 => 7.0,
        136 => 8.0,
        137 => 10.0,
        138 => 12.0,
        139 => 14.0,
        140 => 16.0,
        141 => 20.0,
        142 => 24.0,
        143 => 28.0,
        144 => 32.0,
        145 => 40.0,
        146 => 48.0,
        147 => 56.0,
        148 => 64.0,
        149 => 80.0,
        150 => 96.0,
        151 => 112.0,
        152 => 128.0,
        153 => 160.0,
        154 => 192.0,
        155 => 224.0,
        156 => 256.0,
        157 => 320.0,
        158 => 384.0,
        159 => 448.0,
        160 => 512.0,
        161 => 640.0,
        162 => 768.0,
        163 => 896.0,
        164 => 1024.0,
        165 => 1280.0,
        166 => 1536.0,
        167 => 1792.0,
        168 => 2048.0,
        169 => 2560.0,
        170 => 3072.0,
        171 => 3584.0,
        172 => 4096.0,
        173 => 5120.0,
        174 => 6144.0,
        175 => 7168.0,
        176 => 8192.0,
        177 => 10240.0,
        178 => 12288.0,
        179 => 14336.0,
        180 => 16384.0,
        181 => 20480.0,
        182 => 24576.0,
        183 => 28672.0,
        184 => 32768.0,
        185 => 40960.0,
        186 => 49152.0,
        187 => 57344.0,
        188 => 65536.0,
        189 => 81920.0,
        190 => 98304.0,
        191 => 114688.0,
        192 => 131072.0,
        193 => 163840.0,
        194 => 196608.0,
        195 => 229376.0,
        196 => 262144.0,
        197 => 327680.0,
        198 => 393216.0,
        199 => 458752.0,
        200 => 524288.0,
        201 => 655360.0,
        202 => 786432.0,
        203 => 917504.0,
        204 => 1048576.0,
        205 => 1310720.0,
        206 => 1572864.0,
        207 => 1835008.0,
        208 => 2097152.0,
        209 => 2621440.0,
        210 => 3145728.0,
        211 => 3670016.0,
        212 => 4194304.0,
        213 => 5242880.0,
        214 => 6291456.0,
        215 => 7340032.0,
        216 => 8388608.0,
        217 => 1.048576E7,
        218 => 1.2582912E7,
        219 => 1.4680064E7,
        220 => 1.6777216E7,
        221 => 2.097152E7,
        222 => 2.5165824E7,
        223 => 2.9360128E7,
        224 => 3.3554432E7,
        225 => 4.194304E7,
        226 => 5.0331648E7,
        227 => 5.8720256E7,
        228 => 6.7108864E7,
        229 => 8.388608E7,
        230 => 1.00663296E8,
        231 => 1.17440512E8,
        232 => 1.34217728E8,
        233 => 1.6777216E8,
        234 => 2.01326592E8,
        235 => 2.34881024E8,
        236 => 2.68435456E8,
        237 => 3.3554432E8,
        238 => 4.02653184E8,
        239 => 4.69762048E8,
        240 => 5.3687091E8,
        241 => 6.7108864E8,
        242 => 8.0530637E8,
        243 => 9.395241E8,
        244 => 1.07374182E9,
        245 => 1.34217728E9,
        246 => 1.61061274E9,
        247 => 1.87904819E9,
        248 => 2.14748365E9,
        249 => 2.68435456E9,
        250 => 3.22122547E9,
        251 => 3.75809638E9,
        252 => 4.2949673E9,
        253 => 5.3687091E9,
        254 => 6.4424509E9,
        255 => 7.5161928E9
    );


    /**
     * Set the default Similarity implementation used by indexing and search
     * code.
     *
     * @param Zend_Search_Lucene_Search_Similarity $similarity
     */
    public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
    {
        self::$_defaultImpl = $similarity;
    }


    /**
     * Return the default Similarity implementation used by indexing and search
     * code.
     *
     * If no implementation has been set yet, a
     * Zend_Search_Lucene_Search_Similarity_Default instance is created,
     * remembered and returned.
     *
     * @return Zend_Search_Lucene_Search_Similarity
     */
    public static function getDefault()
    {
        if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
            // The explicit include is disabled here, so the class below has to
            // be resolvable by other means (e.g. an autoloader).
            // require_once 'Zend/Search/Lucene/Search/Similarity/Default.php';
            self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();
        }

        return self::$_defaultImpl;
    }


    /**
     * Computes the normalization value for a field given the total number of
     * terms contained in a field.  These values, together with field boosts, are
     * stored in an index and multiplied into scores for hits on each field by the
     * search code.
     *
     * Matches in longer fields are less precise, so implementations of this
     * method usually return smaller values when 'numTokens' is large,
     * and larger values when 'numTokens' is small.
     *
     * Note that these values are computed under
     * IndexWriter::addDocument(Document) and stored then using
     * encodeNorm(float).  Thus they have limited precision, and documents
     * must be re-indexed if this method is altered.
     *
     * fieldName - name of field
     * numTokens - the total number of tokens contained in fields named
     *             'fieldName' of 'doc'.
     * Returns a normalization factor for hits on this field of this document
     *
     * @param string $fieldName
     * @param integer $numTokens
     * @return float
     */
    abstract public function lengthNorm($fieldName, $numTokens);

    /**
     * Computes the normalization value for a query given the sum of the squared
     * weights of each of the query terms.  This value is then multiplied into the
     * weight of each query term.
     *
     * This does not affect ranking, but rather just attempts to make scores
     * from different queries comparable.
     *
     * sumOfSquaredWeights - the sum of the squares of query term weights
     * Returns a normalization factor for query weights
     *
     * @param float $sumOfSquaredWeights
     * @return float
     */
    abstract public function queryNorm($sumOfSquaredWeights);


    /**
     * Decodes a normalization factor stored in an index.
     *
     * Only the low eight bits of $byte are used as the table index.
     *
     * @param integer $byte
     * @return float
     */
    public static function decodeNorm($byte)
    {
        return self::$_normTable[$byte & 0xFF];
    }


    /**
     * Encodes a normalization factor for storage in an index.
     *
     * The result is a single byte value (0..255) that decodeNorm() maps back
     * to one of 256 representable floats, covering zero and positive values
     * from roughly 6x10^-10 to 7.5x10^9 with about one significant decimal
     * digit of accuracy.
     * Negative numbers are rounded up to zero.  Values too large to represent
     * are rounded down to the largest representable value.  Positive values too
     * small to represent are rounded up to the smallest positive representable
     * value.  Everything else is rounded to the closest representable value.
     *
     * @param float $f
     * @return integer
     */
    public static function encodeNorm($f)
    {
        return self::_floatToByte($f);
    }

    /**
     * Float to byte conversion
     *
     * Binary-searches the norm table for $f and returns the index of the
     * matching (or closest) entry.
     *
     * @param float $f
     * @return integer
     */
    private static function _floatToByte($f)
    {
        // round negatives up to zero
        if ($f <= 0.0) {
            return 0;
        }

        // search for appropriate value
        $lowIndex  = 0;
        $highIndex = 255;
        while ($highIndex >= $lowIndex) {
            $mid   = ($highIndex + $lowIndex) >> 1;
            $delta = $f - self::$_normTable[$mid];

            if ($delta < 0) {
                $highIndex = $mid - 1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid + 1;
            } else {
                return $mid; // We got it!
            }
        }

        // No exact match: $highIndex now points at the largest table entry
        // that is smaller than $f (it cannot drop below 0 because $f > 0).

        // Underflow: $f is positive but below the smallest positive entry.
        // Round up to that entry instead of collapsing the value to zero,
        // which would silently wipe out the factor when decoded.
        if ($highIndex == 0) {
            return 1;
        }

        // Overflow: clamp to the largest representable value.
        if ($highIndex == 255) {
            return 255;
        }

        // round to closest value
        if ($f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex + 1] - $f) {
            return $highIndex + 1;
        }

        return $highIndex;
    }


    /**
     * Computes a score factor based on a term or phrase's frequency in a
     * document.  This value is multiplied by the idf(Term, Searcher)
     * factor for each term in the query and these products are then summed to
     * form the initial score for a document.
     *
     * Terms and phrases repeated in a document indicate the topic of the
     * document, so implementations of this method usually return larger values
     * when 'freq' is large, and smaller values when 'freq'
     * is small.
     *
     * freq - the frequency of a term within a document
     * Returns a score factor based on a term's within-document frequency
     *
     * @param float $freq
     * @return float
     */
    abstract public function tf($freq);

    /**
     * Computes the amount of a sloppy phrase match, based on an edit distance.
     * This value is summed for each sloppy phrase match in a document to form
     * the frequency that is passed to tf(float).
     *
     * A phrase match with a small edit distance to a document passage more
     * closely matches the document, so implementations of this method usually
     * return larger values when the edit distance is small and smaller values
     * when it is large.
     *
     * distance - the edit distance of this sloppy phrase match
     * Returns the frequency increment for this match
     *
     * @param integer $distance
     * @return float
     */
    abstract public function sloppyFreq($distance);


    /**
     * Computes a score factor for a simple term or a phrase.
     *
     * For a single term this is
     *   idfFreq($reader->docFreq($input), $reader->count());
     * for an array of terms it is the sum of that expression over all terms
     * (0.0 for an empty array).
     *
     * input - the term in question or array of terms
     * reader - reader the document collection being searched
     * Returns a score factor for the term
     *
     * @param mixed $input
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function idf($input, Zend_Search_Lucene_Interface $reader)
    {
        // The collection size is the same for every term, so fetch it once.
        $numDocs = $reader->count();

        if (!is_array($input)) {
            return $this->idfFreq($reader->docFreq($input), $numDocs);
        }

        $idf = 0.0;
        foreach ($input as $term) {
            $idf += $this->idfFreq($reader->docFreq($term), $numDocs);
        }

        return $idf;
    }

    /**
     * Computes a score factor based on a term's document frequency (the number
     * of documents which contain the term).  This value is multiplied by the
     * tf(int) factor for each term in the query and these products are
     * then summed to form the initial score for a document.
     *
     * Terms that occur in fewer documents are better indicators of topic, so
     * implementations of this method usually return larger values for rare terms,
     * and smaller values for common terms.
     *
     * docFreq - the number of documents which contain the term
     * numDocs - the total number of documents in the collection
     * Returns a score factor based on the term's document frequency
     *
     * @param integer $docFreq
     * @param integer $numDocs
     * @return float
     */
    abstract public function idfFreq($docFreq, $numDocs);

    /**
     * Computes a score factor based on the fraction of all query terms that a
     * document contains.  This value is multiplied into scores.
     *
     * The presence of a large portion of the query terms indicates a better
     * match with the query, so implementations of this method usually return
     * larger values when the ratio between these parameters is large and smaller
     * values when the ratio between them is small.
     *
     * overlap - the number of query terms matched in the document
     * maxOverlap - the total number of terms in the query
     * Returns a score factor based on term overlap with the query
     *
     * @param integer $overlap
     * @param integer $maxOverlap
     * @return float
     */
    abstract public function coord($overlap, $maxOverlap);
}