File indexing completed on 2025-03-02 05:29:43
0001 <?php 0002 /** 0003 * Zend Framework 0004 * 0005 * LICENSE 0006 * 0007 * This source file is subject to the new BSD license that is bundled 0008 * with this package in the file LICENSE.txt. 0009 * It is also available through the world-wide-web at this URL: 0010 * http://framework.zend.com/license/new-bsd 0011 * If you did not receive a copy of the license and are unable to 0012 * obtain it through the world-wide-web, please send an email 0013 * to license@zend.com so we can send you a copy immediately. 0014 * 0015 * @category Zend 0016 * @package Zend_Search_Lucene 0017 * @subpackage Index 0018 * @copyright Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com) 0019 * @license http://framework.zend.com/license/new-bsd New BSD License 0020 * @version $Id$ 0021 */ 0022 0023 /** Zend_Search_Lucene_Index_TermsStream_Interface */ 0024 // require_once 'Zend/Search/Lucene/Index/TermsStream/Interface.php'; 0025 0026 0027 /** Zend_Search_Lucene_Search_Similarity */ 0028 // require_once 'Zend/Search/Lucene/Search/Similarity.php'; 0029 0030 /** Zend_Search_Lucene_Index_FieldInfo */ 0031 // require_once 'Zend/Search/Lucene/Index/FieldInfo.php'; 0032 0033 /** Zend_Search_Lucene_Index_Term */ 0034 // require_once 'Zend/Search/Lucene/Index/Term.php'; 0035 0036 /** Zend_Search_Lucene_Index_TermInfo */ 0037 // require_once 'Zend/Search/Lucene/Index/TermInfo.php'; 0038 0039 /** 0040 * @category Zend 0041 * @package Zend_Search_Lucene 0042 * @subpackage Index 0043 * @copyright Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com) 0044 * @license http://framework.zend.com/license/new-bsd New BSD License 0045 */ 0046 class Zend_Search_Lucene_Index_SegmentInfo implements Zend_Search_Lucene_Index_TermsStream_Interface 0047 { 0048 /** 0049 * "Full scan vs fetch" boundary. 0050 * 0051 * If filter selectivity is less than this value, then full scan is performed 0052 * (since term entries fetching has some additional overhead). 
0053 */ 0054 const FULL_SCAN_VS_FETCH_BOUNDARY = 5; 0055 0056 /** 0057 * Number of docs in a segment 0058 * 0059 * @var integer 0060 */ 0061 private $_docCount; 0062 0063 /** 0064 * Segment name 0065 * 0066 * @var string 0067 */ 0068 private $_name; 0069 0070 /** 0071 * Term Dictionary Index 0072 * 0073 * Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because 0074 * of performance considerations) 0075 * [0] -> $termFieldNum 0076 * [1] -> $termValue 0077 * 0078 * Corresponding Zend_Search_Lucene_Index_TermInfo data is stored in $_termDictionaryInfos under the same index 0079 * 0080 * @var array 0081 */ 0082 private $_termDictionary; 0083 0084 /** 0085 * Term Dictionary Index TermInfos 0086 * 0087 * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because 0088 * of performance considerations) 0089 * [0] -> $docFreq 0090 * [1] -> $freqPointer 0091 * [2] -> $proxPointer 0092 * [3] -> $skipOffset 0093 * [4] -> $indexPointer 0094 * 0095 * @var array 0096 */ 0097 private $_termDictionaryInfos; 0098 0099 /** 0100 * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment 0101 * 0102 * @var array 0103 */ 0104 private $_fields; 0105 0106 /** 0107 * Field positions in a dictionary. 0108 * (Term dictionary contains fields ordered by names) 0109 * 0110 * @var array 0111 */ 0112 private $_fieldsDicPositions; 0113 0114 0115 /** 0116 * Associative array where the key is the file name and the value is data offset 0117 * in a compound segment file (.cfs). 0118 * 0119 * @var array 0120 */ 0121 private $_segFiles; 0122 0123 /** 0124 * Associative array where the key is the file name and the value is file size within the compound segment file (.cfs). 
0125 * 0126 * @var array 0127 */ 0128 private $_segFileSizes; 0129 0130 /** 0131 * Delete file generation number 0132 * 0133 * -2 means autodetect latest delete generation 0134 * -1 means 'there is no delete file' 0135 * 0 means pre-2.1 format delete file 0136 * X specifies used delete file 0137 * 0138 * @var integer 0139 */ 0140 private $_delGen; 0141 0142 /** 0143 * Segment has single norms file 0144 * 0145 * If true then one .nrm file is used for all fields 0146 * Otherwise .fN files are used 0147 * 0148 * @var boolean 0149 */ 0150 private $_hasSingleNormFile; 0151 0152 /** 0153 * Use compound segment file (*.cfs) to collect all other segment files 0154 * (excluding .del files) 0155 * 0156 * @var boolean 0157 */ 0158 private $_isCompound; 0159 0160 0161 /** 0162 * Storage directory adapter. 0163 * 0164 * @var Zend_Search_Lucene_Storage_Directory 0165 */ 0166 private $_directory; 0167 0168 /** 0169 * Normalization factors. 0170 * An array fieldNum => normVector 0171 * normVector is a binary string. 0172 * Each byte corresponds to an indexed document in a segment and 0173 * encodes normalization factor (float value, encoded by 0174 * Zend_Search_Lucene_Search_Similarity::encodeNorm()) 0175 * 0176 * @var array 0177 */ 0178 private $_norms = array(); 0179 0180 /** 0181 * List of deleted documents. 0182 * bitset if bitset extension is loaded or array otherwise. 0183 * 0184 * @var mixed 0185 */ 0186 private $_deleted = null; 0187 0188 /** 0189 * $this->_deleted update flag 0190 * 0191 * @var boolean 0192 */ 0193 private $_deletedDirty = false; 0194 0195 /** 0196 * True if segment uses shared doc store 0197 * 0198 * @var boolean 0199 */ 0200 private $_usesSharedDocStore; 0201 0202 /* 0203 * Shared doc store options. 
* It's an associative array with the following items:
* - 'offset'     => $docStoreOffset          The starting document in the shared doc store files where this segment's documents begin
* - 'segment'    => $docStoreSegment         The name of the segment that has the shared doc store files.
* - 'isCompound' => $docStoreIsCompoundFile  True, if compound file format is used for the shared doc store files (.cfx file).
*
* When 'isCompound' is true the constructor additionally stores:
* - 'files'      => array(fileName => data offset inside the .cfx file)
* - 'fileSizes'  => array(fileName => data length inside the .cfx file)
*/
private $_sharedDocStoreOptions;


/**
 * Zend_Search_Lucene_Index_SegmentInfo constructor
 *
 * Reads the compound-file tables (shared doc store .cfx and segment .cfs,
 * when used), the field infos (.fnm) and the deletions file of the segment.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @param string     $name
 * @param integer    $docCount
 * @param integer    $delGen             -2: autodetect, -1: no delete file, 0: pre-2.1 delete file, X: generation
 * @param array|null $docStoreOptions    shared doc store options or null if the segment has its own doc store
 * @param boolean    $hasSingleNormFile
 * @param boolean    $isCompound         null means 'unknown' (detected by probing for the .cfs file)
 * @throws Zend_Search_Lucene_Exception
 */
public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name, $docCount, $delGen = 0, $docStoreOptions = null, $hasSingleNormFile = false, $isCompound = null)
{
    $this->_directory = $directory;
    $this->_name      = $name;
    $this->_docCount  = $docCount;

    // Always a real boolean (previously stayed null for segments with their own doc store)
    $this->_usesSharedDocStore = ($docStoreOptions !== null);
    if ($this->_usesSharedDocStore) {
        $this->_sharedDocStoreOptions = $docStoreOptions;

        if ($docStoreOptions['isCompound']) {
            list($cfxFiles, $cfxFileSizes) = $this->_readCompoundFileTable($docStoreOptions['segment'] . '.cfx');

            $this->_sharedDocStoreOptions['files']     = $cfxFiles;
            $this->_sharedDocStoreOptions['fileSizes'] = $cfxFileSizes;
        }
    }

    $this->_hasSingleNormFile = $hasSingleNormFile;
    $this->_delGen            = $delGen;
    $this->_termDictionary    = null;

    if ($isCompound !== null) {
        $this->_isCompound = $isCompound;
    } else {
        // It's a pre-2.1 segment or isCompound is set to 'unknown':
        // detect compound mode by trying to open the compound file
        try {
            $this->_directory->getFileObject($name . '.cfs');

            // Compound file is found
            $this->_isCompound = true;
        } catch (Zend_Search_Lucene_Exception $e) {
            if (strpos($e->getMessage(), 'is not readable') === false) {
                // Some other failure: pass it on
                throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
            }

            // Compound file is not found or is not readable
            $this->_isCompound = false;
        }
    }

    // Both tables are always arrays, also for non-compound segments
    $this->_segFiles     = array();
    $this->_segFileSizes = array();
    if ($this->_isCompound) {
        list($this->_segFiles, $this->_segFileSizes) = $this->_readCompoundFileTable($name . '.cfs');
    }

    $fnmFile     = $this->openCompoundFile('.fnm');
    $fieldsCount = $fnmFile->readVInt();
    $fieldNames  = array();
    $fieldNums   = array();
    $this->_fields = array();

    for ($count = 0; $count < $fieldsCount; $count++) {
        $fieldName = $fnmFile->readString();
        $fieldBits = $fnmFile->readByte();
        $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
                                                                        $fieldBits & 0x01 /* field is indexed */,
                                                                        $count,
                                                                        $fieldBits & 0x02 /* termvectors are stored */,
                                                                        $fieldBits & 0x10 /* norms are omitted */,
                                                                        $fieldBits & 0x20 /* payloads are stored */);
        if ($fieldBits & 0x10) {
            // norms are omitted for the indexed field: use the encoded norm of 1.0 for every document
            $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
        }

        $fieldNums[$count]  = $count;
        $fieldNames[$count] = $fieldName;
    }

    // Order field numbers by field name, then invert: fieldNum => position in name order
    array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums);
    $this->_fieldsDicPositions = array_flip($fieldNums);

    if ($this->_delGen == -2) {
        // SegmentInfo constructor is invoked from index writer
        // Autodetect current delete file generation number
        $this->_delGen = $this->_detectLatestDelGen();
    }

    // Load deletions
    $this->_deleted = $this->_loadDelFile();
}

/**
 * Reads the file table stored at the start of a compound file (.cfs/.cfx).
 *
 * The table is a VInt entry count followed by (Long data offset, String file name)
 * pairs. The size of an entry is the distance to the next entry's offset, or to
 * the end of the compound file for the last entry.
 *
 * @param string $compoundFileName
 * @return array array(array(fileName => offset), array(fileName => size))
 */
private function _readCompoundFileTable($compoundFileName)
{
    $compoundFile = $this->_directory->getFileObject($compoundFileName);
    $entryCount   = $compoundFile->readVInt();

    $offsets  = array();
    $sizes    = array();
    $prevName = null;

    for ($entry = 0; $entry < $entryCount; $entry++) {
        $dataOffset = $compoundFile->readLong();
        if ($prevName !== null) {
            $sizes[$prevName] = $dataOffset - $offsets[$prevName];
        }

        $prevName           = $compoundFile->readString();
        $offsets[$prevName] = $dataOffset;
    }

    if ($prevName !== null) {
        // Last entry extends to the end of the compound file
        $sizes[$prevName] = $this->_directory->fileLength($compoundFileName) - $offsets[$prevName];
    }

    return array($offsets, $sizes);
}

/**
 * Load deletions file
 *
 * Dispatches on the delete generation number: -1 means there is no delete file,
 * 0 selects the pre-2.1 format, anything else the 2.1+ format.
 *
 * Returns bitset or an array depending on bitset extension availability
 *
 * @return mixed
 * @throws Zend_Search_Lucene_Exception
 */
private function _loadDelFile()
{
    if ($this->_delGen == -1) {
        // There is no delete file for this segment
        return null;
    }

    if ($this->_delGen == 0) {
        // It's a segment with pre-2.1 format delete file
        return $this->_loadPre21DelFile();
    }

    // It's 2.1+ format deletions file
    return $this->_load21DelFile();
}

/**
* Load pre-2.1 deletions file
*
* Returns bitset or an array depending on bitset extension availability.
* If the file is not readable, the delete generation is reset to -1 and null is returned.
*
* @return mixed
* @throws Zend_Search_Lucene_Exception
*/
private function _loadPre21DelFile()
{
    try {
        // '.del' files always stored in a separate file
        // Segment compound is not used
        $delFile = $this->_directory->getFileObject($this->_name . '.del');

        $byteCount = (int) ceil($delFile->readInt() / 8);
        $bitCount  = $delFile->readInt();

        $delBytes = ($bitCount == 0) ? '' : $delFile->readBytes($byteCount);

        if (extension_loaded('bitset')) {
            return $delBytes;
        }

        return $this->_unpackDeletions($delBytes);
    } catch (Zend_Search_Lucene_Exception $e) {
        if (strpos($e->getMessage(), 'is not readable') === false) {
            throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
        }

        // There is no deletion file
        $this->_delGen = -1;

        return null;
    }
}

/**
 * Expands a packed deletions bit vector into an array(docId => 1).
 *
 * Bit N of byte M (least significant bit first) marks document M*8 + N.
 * Iterates over the bytes actually present, so an empty vector yields an
 * empty array instead of reading undefined string offsets.
 *
 * @param string $delBytes
 * @return array
 */
private function _unpackDeletions($delBytes)
{
    $deletions = array();
    $length    = strlen($delBytes);

    for ($byteNum = 0; $byteNum < $length; $byteNum++) {
        $byte = ord($delBytes[$byteNum]);
        for ($bit = 0; $bit < 8; $bit++) {
            if ($byte & (1 << $bit)) {
                $deletions[$byteNum*8 + $bit] = 1;
            }
        }
    }

    return $deletions;
}

/**
 * Load 2.1+ format deletions file
 *
 * The file either starts with the 0xFFFFFFFF marker and stores the bit vector
 * sparsely as (VInt gap to the next non-zero byte, the byte itself) pairs up to
 * the end of the file, or it starts directly with the size of a dense bit vector.
 *
 * Returns bitset or an array depending on bitset extension availability
 * (null instead of an empty array).
 *
 * @return mixed
 */
private function _load21DelFile()
{
    $delFileName = $this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del';
    $delFile     = $this->_directory->getFileObject($delFileName);
    $useBitset   = extension_loaded('bitset');

    $format = $delFile->readInt();

    if ($format == (int)0xFFFFFFFF) {
        $deletions = $useBitset ? bitset_empty() : array();

        // Byte count and bit count are not needed here, but have to be consumed
        $delFile->readInt();
        $delFile->readInt();

        $delFileSize = $this->_directory->fileLength($delFileName);
        $byteNum     = 0;

        // Process every (gap, byte) pair; returning from inside the loop
        // would drop all deletions after the first non-zero byte
        while ($delFile->tell() < $delFileSize) {
            $dgap        = $delFile->readVInt();
            $nonZeroByte = $delFile->readByte();

            $byteNum += $dgap;

            for ($bit = 0; $bit < 8; $bit++) {
                if ($nonZeroByte & (1 << $bit)) {
                    if ($useBitset) {
                        bitset_incl($deletions, $byteNum*8 + $bit);
                    } else {
                        $deletions[$byteNum*8 + $bit] = 1;
                    }
                }
            }
        }

        if ($useBitset) {
            return $deletions;
        }

        return (count($deletions) > 0) ? $deletions : null;
    }

    // $format is actually the size of a dense bit vector
    $byteCount = (int) ceil($format / 8);
    $bitCount  = $delFile->readInt();

    $delBytes = ($bitCount == 0) ? '' : $delFile->readBytes($byteCount);

    if ($useBitset) {
        return $delBytes;
    }

    $deletions = $this->_unpackDeletions($delBytes);

    return (count($deletions) > 0) ? $deletions : null;
}

/**
 * Opens index file stored within compound index file
 *
 * '.fdx'/'.fdt' requests for a segment using a shared doc store are redirected
 * to the doc store segment and positioned at this segment's first document.
 * For a compound doc store (.cfx) both extensions are served by the same
 * .cfx file object.
 *
 * @param string $extension
 * @param boolean $shareHandler
 * @throws Zend_Search_Lucene_Exception
 * @return Zend_Search_Lucene_Storage_File
 */
public function openCompoundFile($extension, $shareHandler = true)
{
    if (($extension == '.fdx' || $extension == '.fdt') && $this->_usesSharedDocStore) {
        $fdxFName = $this->_sharedDocStoreOptions['segment'] . '.fdx';
        $fdtFName = $this->_sharedDocStoreOptions['segment'] . '.fdt';

        if (!$this->_sharedDocStoreOptions['isCompound']) {
            $fdxFile = $this->_directory->getFileObject($fdxFName, $shareHandler);
            // Skip the 8-byte index entries of the documents preceding this segment
            $fdxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);

            if ($extension == '.fdx') {
                // '.fdx' file is requested
                return $fdxFile;
            }

            // '.fdt' file is requested
            $fdtStartOffset = $fdxFile->readLong();

            $fdtFile = $this->_directory->getFileObject($fdtFName, $shareHandler);
            $fdtFile->seek($fdtStartOffset, SEEK_CUR);

            return $fdtFile;
        }

        if (!isset($this->_sharedDocStoreOptions['files'][$fdxFName])) {
            throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
                                   . $fdxFName . ' file.' );
        }
        if (!isset($this->_sharedDocStoreOptions['files'][$fdtFName])) {
            throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
                                   . $fdtFName . ' file.' );
        }

        // Open shared docstore segment file
        $cfxFile = $this->_directory->getFileObject($this->_sharedDocStoreOptions['segment'] . '.cfx', $shareHandler);
        // Seek to the start of '.fdx' file within compound file
        $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdxFName]);
        // Seek to the start of current segment documents section
        $cfxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);

        if ($extension == '.fdx') {
            // '.fdx' file is requested
            return $cfxFile;
        }

        // '.fdt' file is requested
        $fdtStartOffset = $cfxFile->readLong();

        // Seek to the start of '.fdt' file within compound file
        $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdtFName]);
        // Seek to the start of current segment documents section
        $cfxFile->seek($fdtStartOffset, SEEK_CUR);

        // The '.fdt' data lives inside the .cfx file object positioned above
        return $cfxFile;
    }

    $filename = $this->_name . $extension;

    if (!$this->_isCompound) {
        return $this->_directory->getFileObject($filename, $shareHandler);
    }

    if (!isset($this->_segFiles[$filename])) {
        throw new Zend_Search_Lucene_Exception('Segment compound file doesn\'t contain '
                               . $filename . ' file.' );
    }

    $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
    $file->seek($this->_segFiles[$filename]);
    return $file;
}

/**
 * Get compound file length
 *
 * Returns the length of the given segment file, whether it is stored as a
 * separate file or inside a compound (.cfs/.cfx) file.
 *
 * @param string $extension
 * @return integer
 * @throws Zend_Search_Lucene_Exception
 */
public function compoundFileLength($extension)
{
    if (($extension == '.fdx' || $extension == '.fdt') && $this->_usesSharedDocStore) {
        $filename = $this->_sharedDocStoreOptions['segment'] . $extension;

        if (!$this->_sharedDocStoreOptions['isCompound']) {
            return $this->_directory->fileLength($filename);
        }

        if (!isset($this->_sharedDocStoreOptions['fileSizes'][$filename])) {
            throw new Zend_Search_Lucene_Exception('Shared doc store compound file doesn\'t contain '
                                   . $filename . ' file.' );
        }

        return $this->_sharedDocStoreOptions['fileSizes'][$filename];
    }

    $filename = $this->_name . $extension;

    // Try to get common file first
    if ($this->_directory->fileExists($filename)) {
        return $this->_directory->fileLength($filename);
    }

    if (!isset($this->_segFileSizes[$filename])) {
        throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                               . $filename . ' file.' );
    }

    return $this->_segFileSizes[$filename];
}

/**
 * Returns field index or -1 if field is not found
 *
 * @param string $fieldName
 * @return integer
 */
public function getFieldNum($fieldName)
{
    foreach ($this->_fields as $field) {
        if ($field->name == $fieldName) {
            return $field->number;
        }
    }

    return -1;
}

/**
 * Returns field info for specified field
 *
 * @param integer $fieldNum
 * @return Zend_Search_Lucene_Index_FieldInfo
 */
public function getField($fieldNum)
{
    return $this->_fields[$fieldNum];
}

/**
 * Returns array of fields.
 * if $indexed parameter is true, then returns only indexed fields.
*
* @param boolean $indexed
* @return array
*/
public function getFields($indexed = false)
{
    $result = array();
    foreach ($this->_fields as $field) {
        if ((!$indexed) || $field->isIndexed) {
            $result[$field->name] = $field->name;
        }
    }
    return $result;
}

/**
 * Returns array of FieldInfo objects.
 *
 * @return array
 */
public function getFieldInfos()
{
    return $this->_fields;
}

/**
 * Returns actual deletions file generation number.
 *
 * @return integer
 */
public function getDelGen()
{
    return $this->_delGen;
}

/**
 * Returns the total number of documents in this segment (including deleted documents).
 *
 * @return integer
 */
public function count()
{
    return $this->_docCount;
}

/**
 * Returns number of deleted documents.
 *
 * @return integer
 */
private function _deletedCount()
{
    if ($this->_deleted === null) {
        return 0;
    }

    if (extension_loaded('bitset')) {
        return count(bitset_to_array($this->_deleted));
    }

    return count($this->_deleted);
}

/**
 * Returns the total number of non-deleted documents in this segment.
 *
 * @return integer
 */
public function numDocs()
{
    if ($this->hasDeletions()) {
        return $this->_docCount - $this->_deletedCount();
    }

    return $this->_docCount;
}

/**
 * Get field position in a fields dictionary
 *
 * @param integer $fieldNum
 * @return integer
 */
private function _getFieldPosition($fieldNum)
{
    // Treat values which are not in a translation table as a 'direct value'
    return isset($this->_fieldsDicPositions[$fieldNum]) ?
                       $this->_fieldsDicPositions[$fieldNum] : $fieldNum;
}

/**
 * Return segment name
 *
 * @return string
 */
public function getName()
{
    return $this->_name;
}


/**
 * TermInfo cache
 *
 * Size is 1024.
 * Numbers are used instead of class constants because of performance considerations
 *
 * @var array
 */
private $_termInfoCache = array();

/**
 * Drops the oldest cache entries until 768 are left.
 *
 * Entries are kept in last-used order (getTermInfo() re-appends an entry on
 * every hit), so iteration starts with the least recently used ones.
 */
private function _cleanUpTermInfoCache()
{
    foreach ($this->_termInfoCache as $key => $termInfo) {
        unset($this->_termInfoCache[$key]);

        // leave 768 last used term infos
        if (count($this->_termInfoCache) <= 768) {
            break;
        }
    }
}

/**
 * Stores a term info in the cache and trims the cache when it reaches its limit.
 *
 * Used for every insertion, so the cache cannot grow past 1024 entries.
 *
 * @param string $termKey
 * @param Zend_Search_Lucene_Index_TermInfo|null $termInfo
 */
private function _cacheTermInfo($termKey, $termInfo)
{
    $this->_termInfoCache[$termKey] = $termInfo;

    if (count($this->_termInfoCache) >= 1024) {
        $this->_cleanUpTermInfoCache();
    }
}

/**
 * Load terms dictionary index
 *
 * Uses the serialized copy (.sti file) when it exists and can be decoded;
 * otherwise parses the .tii file and writes a fresh .sti file.
 *
 * @throws Zend_Search_Lucene_Exception
 */
private function _loadDictionaryIndex()
{
    $stiFileName = $this->_name . '.sti';

    // Check, if index is already serialized
    if ($this->_directory->fileExists($stiFileName)) {
        // Load serialized dictionary index data
        $stiFile     = $this->_directory->getFileObject($stiFileName);
        $stiFileData = $stiFile->readBytes($this->_directory->fileLength($stiFileName));

        // NOTE(review): unserialize() is only safe while the index directory is trusted -
        // confirm that untrusted parties cannot write '.sti' files there.
        $unserializedData = @unserialize($stiFileData);

        // Accept only the array(dictionary, infos) pair written below; anything else is rebuilt
        if (is_array($unserializedData) && isset($unserializedData[0], $unserializedData[1])) {
            list($this->_termDictionary, $this->_termDictionaryInfos) = $unserializedData;
            return;
        }
    }

    // Load data from .tii file and generate .sti file

    // Prefetch dictionary index data
    $tiiFile     = $this->openCompoundFile('.tii');
    $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));

    // Load dictionary index data
    list($this->_termDictionary, $this->_termDictionaryInfos) =
                Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);

    $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
    $stiFile     = $this->_directory->createFile($stiFileName);
    $stiFile->writeBytes($stiFileData);
}

/**
 * Scans terms dictionary and returns term info
 *
 * Looks the term up in the in-memory dictionary index first (binary search);
 * if it is not an index term, scans the .tis file forward from the closest
 * preceding index term.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return Zend_Search_Lucene_Index_TermInfo|null null if the term is not present in the segment
 * @throws Zend_Search_Lucene_Exception
 */
public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
{
    $termKey = $term->key();
    if (isset($this->_termInfoCache[$termKey])) {
        $termInfo = $this->_termInfoCache[$termKey];

        // Move termInfo to the end of cache
        unset($this->_termInfoCache[$termKey]);
        $this->_termInfoCache[$termKey] = $termInfo;

        return $termInfo;
    }


    if ($this->_termDictionary === null) {
        $this->_loadDictionaryIndex();
    }

    $searchField = $this->getFieldNum($term->field);

    if ($searchField == -1) {
        return null;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // search for appropriate value in dictionary
    $lowIndex  = 0;
    $highIndex = count($this->_termDictionary) - 1;
    while ($highIndex >= $lowIndex) {
        $mid     = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
        $delta    = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($term->text, $midTerm[1] /* text */);
        }

        if ($delta < 0) {
            $highIndex = $mid - 1;
        } elseif ($delta > 0) {
            $lowIndex = $mid + 1;
        } else {
            // We got it!
            $a        = $this->_termDictionaryInfos[$mid];
            $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);

            // Put loaded termInfo into cache
            $this->_cacheTermInfo($termKey, $termInfo);

            return $termInfo;
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        return null;
    }

    // Closest index term preceding the requested one
    $prevPosition = $highIndex;
    $prevTerm     = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

    $tisFile   = $this->openCompoundFile('.tis');
    $tiVersion = $tisFile->readInt();
    if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */  &&
        $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $termCount     = $tisFile->readLong();
    $indexInterval = $tisFile->readInt();
    $skipInterval  = $tisFile->readInt();
    if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
        // maxSkipLevels is not used here, but has to be consumed
        $tisFile->readInt();
    }

    // The header has already been read, so seek relative to the current position
    $tisFile->seek($prevTermInfo[4] /* indexPointer */ - (($tiVersion == (int)0xFFFFFFFD)? 24 : 20) /* header size*/, SEEK_CUR);

    $termValue    = $prevTerm[1] /* text */;
    $termFieldNum = $prevTerm[0] /* field */;
    $docFreq      = $prevTermInfo[0] /* docFreq */;
    $freqPointer  = $prevTermInfo[1] /* freqPointer */;
    $proxPointer  = $prevTermInfo[2] /* proxPointer */;
    $skipOffset   = $prevTermInfo[3] /* skipOffset */;
    for ($count = $prevPosition*$indexInterval + 1;
         $count <= $termCount &&
         ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
          ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
           strcmp($termValue, $term->text) < 0) );
         $count++) {
        $termPrefixLength = $tisFile->readVInt();
        $termSuffix       = $tisFile->readString();
        $termFieldNum     = $tisFile->readVInt();
        $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;

        $docFreq      = $tisFile->readVInt();
        $freqPointer += $tisFile->readVInt();
        $proxPointer += $tisFile->readVInt();
        if ($docFreq >= $skipInterval) {
            $skipOffset = $tisFile->readVInt();
        } else {
            $skipOffset = 0;
        }
    }

    // strcmp() rather than ==, which treats numeric-looking terms such as "10" and "1e1" as equal
    if ($termFieldNum == $searchField && strcmp($termValue, $term->text) == 0) {
        $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
    } else {
        $termInfo = null;
    }

    // Put loaded termInfo into cache
    $this->_cacheTermInfo($termKey, $termInfo);

    return $termInfo;
}

/**
 * Returns IDs of all the documents containing term.
*
* If a documents filter is passed, only documents already accepted by the filter
* for this segment are returned, and the filter data for this segment is replaced
* with the set of returned documents.
*
* @param Zend_Search_Lucene_Index_Term $term
* @param integer $shift                  value added to every returned document ID
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
* @return array
* @throws Zend_Search_Lucene_Exception
*/
public function termDocs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    $termInfo = $this->getTermInfo($term);

    if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
        if ($docsFilter !== null  &&  $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }
        return array();
    }

    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer, SEEK_CUR);

    // Documents accepted so far for this segment; null means 'no restriction'
    $filter = null;

    if ($docsFilter !== null) {
        if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
        }

        if (isset($docsFilter->segmentFilters[$this->_name])) {
            // Filter already has some data for the current segment
            $filter = $docsFilter->segmentFilters[$this->_name];

            // Nothing can match an empty filter
            if (count($filter) == 0) {
                return array();
            }
        }
    }

    $updateFilter = ($docsFilter !== null);
    $filterData   = array();
    $result       = array();
    $docId        = 0;

    for ($count = 0; $count < $termInfo->docFreq; $count++) {
        $docDelta = $frqFile->readVInt();
        if ($docDelta % 2 == 1) {
            // Odd value: doc ID delta with an implicit frequency of 1
            $docId += ($docDelta - 1)/2;
        } else {
            $docId += $docDelta/2;
            // read (and skip) freq
            $frqFile->readVInt();
        }

        if ($filter === null || isset($filter[$docId])) {
            $result[] = $shift + $docId;
            if ($updateFilter) {
                $filterData[$docId] = 1; // 1 is just some constant value
            }
        }
    }

    if ($updateFilter) {
        $docsFilter->segmentFilters[$this->_name] = $filterData;
    }

    return $result;
}

/**
 * Returns term freqs array.
 * Result array structure: array(docId => freq, ...)
*
* If a documents filter is passed, only documents already accepted by the filter
* for this segment are returned, and the filter data for this segment is replaced
* with the set of returned documents.
*
* @param Zend_Search_Lucene_Index_Term $term
* @param integer $shift                  value added to every returned document ID
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
* @return array
* @throws Zend_Search_Lucene_Exception
*/
public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    $termInfo = $this->getTermInfo($term);

    if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
        if ($docsFilter !== null  &&  $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }
        return array();
    }

    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer, SEEK_CUR);

    // Documents accepted so far for this segment; null means 'no restriction'
    $filter = null;

    if ($docsFilter !== null) {
        if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
        }

        if (isset($docsFilter->segmentFilters[$this->_name])) {
            // Filter already has some data for the current segment
            $filter = $docsFilter->segmentFilters[$this->_name];

            // Nothing can match an empty filter
            if (count($filter) == 0) {
                return array();
            }
        }
    }

    $updateFilter = ($docsFilter !== null);
    $filterData   = array();
    $result       = array();
    $docId        = 0;

    for ($count = 0; $count < $termInfo->docFreq; $count++) {
        $docDelta = $frqFile->readVInt();
        if ($docDelta % 2 == 1) {
            // Odd value: doc ID delta with an implicit frequency of 1
            $docId += ($docDelta - 1)/2;
            $freq   = 1;
        } else {
            $docId += $docDelta/2;
            $freq   = $frqFile->readVInt();
        }

        if ($filter === null || isset($filter[$docId])) {
            $result[$shift + $docId] = $freq;
            if ($updateFilter) {
                $filterData[$docId] = 1; // 1 is just some constant value
            }
        }
    }

    if ($updateFilter) {
        $docsFilter->segmentFilters[$this->_name] = $filterData;
    }

    return $result;
}

/**
 * Returns term positions array.
 * Result array structure: array(docId => array(pos1, pos2, ...), ...)
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $shift
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return Zend_Search_Lucene_Index_TermInfo
 */
public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    $termInfo = $this->getTermInfo($term);

    if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
        if ($docsFilter !== null  &&  $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }
        return array();
    }

    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer,SEEK_CUR);

    $docId = 0;
    $freqs = array();


    if ($docsFilter !== null) {
        if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
        }

        if (isset($docsFilter->segmentFilters[$this->_name])) {
            // Filter already has some data for the current segment

            // Make short name for the filter (which doesn't need additional dereferencing)
            $filter = &$docsFilter->segmentFilters[$this->_name];

            // Check if filter is not empty
            if (count($filter) == 0) {
return array(); 1225 } 1226 1227 if ($this->_docCount/count($filter) < self::FULL_SCAN_VS_FETCH_BOUNDARY) { 1228 // Perform fetching 1229 // --------------------------------------------------------------- 1230 for ($count = 0; $count < $termInfo->docFreq; $count++) { 1231 $docDelta = $frqFile->readVInt(); 1232 if ($docDelta % 2 == 1) { 1233 $docId += ($docDelta-1)/2; 1234 $freqs[$docId] = 1; 1235 } else { 1236 $docId += $docDelta/2; 1237 $freqs[$docId] = $frqFile->readVInt(); 1238 } 1239 } 1240 1241 $updatedFilterData = array(); 1242 $result = array(); 1243 $prxFile = $this->openCompoundFile('.prx'); 1244 $prxFile->seek($termInfo->proxPointer, SEEK_CUR); 1245 foreach ($freqs as $docId => $freq) { 1246 $termPosition = 0; 1247 $positions = array(); 1248 1249 // we have to read .prx file to get right position for next doc 1250 // even filter doesn't match current document 1251 for ($count = 0; $count < $freq; $count++ ) { 1252 $termPosition += $prxFile->readVInt(); 1253 $positions[] = $termPosition; 1254 } 1255 1256 // Include into updated filter and into result only if doc is matched by filter 1257 if (isset($filter[$docId])) { 1258 $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here 1259 $result[$shift + $docId] = $positions; 1260 } 1261 } 1262 1263 $docsFilter->segmentFilters[$this->_name] = $updatedFilterData; 1264 // --------------------------------------------------------------- 1265 } else { 1266 // Perform full scan 1267 for ($count = 0; $count < $termInfo->docFreq; $count++) { 1268 $docDelta = $frqFile->readVInt(); 1269 if ($docDelta % 2 == 1) { 1270 $docId += ($docDelta-1)/2; 1271 $freqs[$docId] = 1; 1272 } else { 1273 $docId += $docDelta/2; 1274 $freqs[$docId] = $frqFile->readVInt(); 1275 } 1276 } 1277 1278 $updatedFilterData = array(); 1279 $result = array(); 1280 $prxFile = $this->openCompoundFile('.prx'); 1281 $prxFile->seek($termInfo->proxPointer, SEEK_CUR); 1282 foreach ($freqs as $docId 
=> $freq) { 1283 $termPosition = 0; 1284 $positions = array(); 1285 1286 // we have to read .prx file to get right position for next doc 1287 // even filter doesn't match current document 1288 for ($count = 0; $count < $freq; $count++ ) { 1289 $termPosition += $prxFile->readVInt(); 1290 $positions[] = $termPosition; 1291 } 1292 1293 // Include into updated filter and into result only if doc is matched by filter 1294 if (isset($filter[$docId])) { 1295 $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here 1296 $result[$shift + $docId] = $positions; 1297 } 1298 } 1299 1300 $docsFilter->segmentFilters[$this->_name] = $updatedFilterData; 1301 } 1302 } else { 1303 // Filter doesn't has data for current segment 1304 for ($count = 0; $count < $termInfo->docFreq; $count++) { 1305 $docDelta = $frqFile->readVInt(); 1306 if ($docDelta % 2 == 1) { 1307 $docId += ($docDelta-1)/2; 1308 $freqs[$docId] = 1; 1309 } else { 1310 $docId += $docDelta/2; 1311 $freqs[$docId] = $frqFile->readVInt(); 1312 } 1313 } 1314 1315 $filterData = array(); 1316 $result = array(); 1317 $prxFile = $this->openCompoundFile('.prx'); 1318 $prxFile->seek($termInfo->proxPointer, SEEK_CUR); 1319 foreach ($freqs as $docId => $freq) { 1320 $filterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here 1321 1322 $termPosition = 0; 1323 $positions = array(); 1324 1325 for ($count = 0; $count < $freq; $count++ ) { 1326 $termPosition += $prxFile->readVInt(); 1327 $positions[] = $termPosition; 1328 } 1329 1330 $result[$shift + $docId] = $positions; 1331 } 1332 1333 $docsFilter->segmentFilters[$this->_name] = $filterData; 1334 } 1335 } else { 1336 for ($count = 0; $count < $termInfo->docFreq; $count++) { 1337 $docDelta = $frqFile->readVInt(); 1338 if ($docDelta % 2 == 1) { 1339 $docId += ($docDelta-1)/2; 1340 $freqs[$docId] = 1; 1341 } else { 1342 $docId += $docDelta/2; 1343 $freqs[$docId] = 
$frqFile->readVInt(); 1344 } 1345 } 1346 1347 $result = array(); 1348 $prxFile = $this->openCompoundFile('.prx'); 1349 $prxFile->seek($termInfo->proxPointer, SEEK_CUR); 1350 foreach ($freqs as $docId => $freq) { 1351 $termPosition = 0; 1352 $positions = array(); 1353 1354 for ($count = 0; $count < $freq; $count++ ) { 1355 $termPosition += $prxFile->readVInt(); 1356 $positions[] = $termPosition; 1357 } 1358 1359 $result[$shift + $docId] = $positions; 1360 } 1361 } 1362 1363 return $result; 1364 } 1365 1366 /** 1367 * Load normalizatin factors from an index file 1368 * 1369 * @param integer $fieldNum 1370 * @throws Zend_Search_Lucene_Exception 1371 */ 1372 private function _loadNorm($fieldNum) 1373 { 1374 if ($this->_hasSingleNormFile) { 1375 $normfFile = $this->openCompoundFile('.nrm'); 1376 1377 $header = $normfFile->readBytes(3); 1378 $headerFormatVersion = $normfFile->readByte(); 1379 1380 if ($header != 'NRM' || $headerFormatVersion != (int)0xFF) { 1381 // require_once 'Zend/Search/Lucene/Exception.php'; 1382 throw new Zend_Search_Lucene_Exception('Wrong norms file format.'); 1383 } 1384 1385 foreach ($this->_fields as $fNum => $fieldInfo) { 1386 if ($fieldInfo->isIndexed) { 1387 $this->_norms[$fNum] = $normfFile->readBytes($this->_docCount); 1388 } 1389 } 1390 } else { 1391 $fFile = $this->openCompoundFile('.f' . 
$fieldNum); 1392 $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount); 1393 } 1394 } 1395 1396 /** 1397 * Returns normalization factor for specified documents 1398 * 1399 * @param integer $id 1400 * @param string $fieldName 1401 * @return float 1402 */ 1403 public function norm($id, $fieldName) 1404 { 1405 $fieldNum = $this->getFieldNum($fieldName); 1406 1407 if ( !($this->_fields[$fieldNum]->isIndexed) ) { 1408 return null; 1409 } 1410 1411 if (!isset($this->_norms[$fieldNum])) { 1412 $this->_loadNorm($fieldNum); 1413 } 1414 1415 return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) ); 1416 } 1417 1418 /** 1419 * Returns norm vector, encoded in a byte string 1420 * 1421 * @param string $fieldName 1422 * @return string 1423 */ 1424 public function normVector($fieldName) 1425 { 1426 $fieldNum = $this->getFieldNum($fieldName); 1427 1428 if ($fieldNum == -1 || !($this->_fields[$fieldNum]->isIndexed)) { 1429 $similarity = Zend_Search_Lucene_Search_Similarity::getDefault(); 1430 1431 return str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )), 1432 $this->_docCount); 1433 } 1434 1435 if (!isset($this->_norms[$fieldNum])) { 1436 $this->_loadNorm($fieldNum); 1437 } 1438 1439 return $this->_norms[$fieldNum]; 1440 } 1441 1442 1443 /** 1444 * Returns true if any documents have been deleted from this index segment. 1445 * 1446 * @return boolean 1447 */ 1448 public function hasDeletions() 1449 { 1450 return $this->_deleted !== null; 1451 } 1452 1453 1454 /** 1455 * Returns true if segment has single norms file. 1456 * 1457 * @return boolean 1458 */ 1459 public function hasSingleNormFile() 1460 { 1461 return $this->_hasSingleNormFile ? true : false; 1462 } 1463 1464 /** 1465 * Returns true if segment is stored using compound segment file. 
*
     * @return boolean
     */
    public function isCompound()
    {
        return $this->_isCompound;
    }

    /**
     * Deletes a document from the index segment.
     * $id is an internal document id
     *
     * The deletion is only recorded in memory and marked as dirty here.
     *
     * @param integer $id
     */
    public function delete($id)
    {
        $this->_deletedDirty = true;

        if (extension_loaded('bitset')) {
            if ($this->_deleted === null) {
                $this->_deleted = bitset_empty($id);
            }
            bitset_incl($this->_deleted, $id);
        } else {
            if ($this->_deleted === null) {
                $this->_deleted = array();
            }

            $this->_deleted[$id] = 1;
        }
    }

    /**
     * Checks, that document is deleted
     *
     * @param integer $id
     * @return boolean
     */
    public function isDeleted($id)
    {
        if ($this->_deleted === null) {
            return false;
        }

        if (extension_loaded('bitset')) {
            return bitset_in($this->_deleted, $id);
        } else {
            return isset($this->_deleted[$id]);
        }
    }

    /**
     * Detect latest delete generation
     *
     * Is actually used from writeChanges() method or from the constructor if it's invoked from
     * Index writer. In both cases index write lock is already obtained, so we shouldn't care
     * about it
     *
     * Returns -1 if there is no deletions file for the segment, 0 for a '<segment_name>.del'
     * file and the decoded base-36 generation for '<segment_name>_NNN.del' files.
     *
     * @return integer
     */
    private function _detectLatestDelGen()
    {
        // The segment name is quoted so it is always matched literally
        $genFilePattern = '/^' . preg_quote($this->_name, '/') . '_([a-zA-Z0-9]+)\.del$/i';

        $delFileList = array();
        foreach ($this->_directory->fileList() as $file) {
            if ($file == $this->_name . '.del') {
                // Matches <segment_name>.del file name
                $delFileList[] = 0;
            } else if (preg_match($genFilePattern, $file, $matches)) {
                // Matches <segment_name>_NNN.del file names
                $delFileList[] = (int)base_convert($matches[1], 36, 10);
            }
        }

        if (count($delFileList) == 0) {
            // There is no deletions file for current segment in the directory
            // Report it with the special generation number -1
            return -1;
        } else {
            // There are some deletions files for current segment in the directory
            // Use the highest generation number
            return max($delFileList);
        }
    }

    /**
     * Write changes if it's necessary.
     *
     * This method must be invoked only from the Writer _updateSegments() method,
     * so index Write lock has to be already obtained.
     *
     * If the current process has not deleted anything, the deletions are only reloaded when
     * a newer deletions file exists. Otherwise the in-memory deletions are merged with the
     * newest deletions file (if any) and written as the next deletions file generation.
     *
     * @internal
     * @throws Zend_Search_Lucene_Exception
     */
    public function writeChanges()
    {
        // Get new generation number
        $latestDelGen = $this->_detectLatestDelGen();

        if (!$this->_deletedDirty) {
            // There was no deletions by current process

            if ($latestDelGen == $this->_delGen) {
                // Delete file hasn't been updated by any concurrent process
                return;
            } else if ($latestDelGen > $this->_delGen) {
                // Delete file has been updated by some concurrent process
                // Reload deletions file
                $this->_delGen  = $latestDelGen;
                $this->_deleted = $this->_loadDelFile();

                return;
            } else {
                // require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Delete file processing workflow is corrupted for the segment \'' . $this->_name . '\'.');
            }
        }

        if ($latestDelGen > $this->_delGen) {
            // Merge current deletions with latest deletions file
            $this->_delGen = $latestDelGen;

            $latestDelete = $this->_loadDelFile();

            if (extension_loaded('bitset')) {
                $this->_deleted = bitset_union($this->_deleted, $latestDelete);
            } else {
                $this->_deleted += $latestDelete;
            }
        }

        if (extension_loaded('bitset')) {
            $delBytes = $this->_deleted;
            $bitCount = count(bitset_to_array($delBytes));
        } else {
            // Pack deleted document ids into a bit vector, lowest bit of each byte first
            $byteCount = (int)floor($this->_docCount/8)+1;
            $delBytes  = str_repeat(chr(0), $byteCount);
            for ($count = 0; $count < $byteCount; $count++) {
                $byte = 0;
                for ($bit = 0; $bit < 8; $bit++) {
                    if (isset($this->_deleted[$count*8 + $bit])) {
                        $byte |= (1<<$bit);
                    }
                }
                $delBytes[$count] = chr($byte);
            }
            $bitCount = count($this->_deleted);
        }

        if ($this->_delGen == -1) {
            // Set delete file generation number to 1
            $this->_delGen = 1;
        } else {
            // Increase delete file generation number by 1
            $this->_delGen++;
        }

        // File layout: documents count, deleted documents count, bit vector
        $delFile = $this->_directory->createFile($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del');
        $delFile->writeInt($this->_docCount);
        $delFile->writeInt($bitCount);
        $delFile->writeBytes($delBytes);

        $this->_deletedDirty = false;
    }


    /**
     * Term Dictionary File object for stream like terms reading
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tisFile = null;

    /**
     * Actual offset of the .tis file data
     *
     * @var integer
     */
    private $_tisFileOffset;

    /**
     * Frequencies File object for stream like terms reading
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_frqFile = null;

    /**
     * Actual offset of the .frq file data
     *
     * @var integer
     */
    private $_frqFileOffset;

    /**
     * Positions File object for stream like terms reading
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_prxFile = null;

    /**
     * Actual offset of the .prx file in the compound file
     *
     * @var integer
     */
    private $_prxFileOffset;


    /**
     * Actual number of terms in term stream
     *
     * @var integer
     */
    private $_termCount = 0;

    /**
     * Overall number of terms in term stream
     *
     * @var integer
     */
    private $_termNum = 0;

    /**
     * Segment index interval
     *
     * @var integer
     */
    private $_indexInterval;

    /**
     * Segment skip interval
     *
     * @var integer
     */
    private $_skipInterval;

    /**
     * Last TermInfo in a terms stream
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_lastTermInfo = null;

    /**
     * Last Term in a terms stream
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_lastTerm = null;

    /**
     * Map of the document IDs
     * Used to get new docID after removing deleted documents.
* It's not very effective from memory usage point of view,
     * but much faster than other methods
     *
     * @var array|null
     */
    private $_docMap = null;

    /**
     * An array of all term positions in the documents.
     * Array structure: array( docId => array( pos1, pos2, ...), ...)
     *
     * Is set to null if term positions loading has to be skipped
     *
     * @var array|null
     */
    private $_lastTermPositions;


    /**
     * Terms scan mode
     *
     * Values:
     *
     * self::SM_TERMS_ONLY - terms are scanned, no additional info is retrieved
     * self::SM_FULL_INFO  - terms are scanned, frequency and position info is retrieved
     * self::SM_MERGE_INFO - terms are scanned, frequency and position info is retrieved
     *                       document numbers are compacted (shifted if segment has deleted documents)
     *
     * @var integer
     */
    private $_termsScanMode;

    /** Scan modes */
    const SM_TERMS_ONLY = 0;    // terms are scanned, no additional info is retrieved
    const SM_FULL_INFO  = 1;    // terms are scanned, frequency and position info is retrieved
    const SM_MERGE_INFO = 2;    // terms are scanned, frequency and position info is retrieved
                                // document numbers are compacted (shifted if segment contains deleted documents)

    /**
     * Reset terms stream
     *
     * $startId - id for the first document
     * $mode    - terms scan mode (one of the self::SM_* constants)
     *
     * Returns start document id for the next segment
     *
     * @param integer $startId
     * @param integer $mode
     * @throws Zend_Search_Lucene_Exception
     * @return integer
     */
    public function resetTermsStream(/** $startId = 0, $mode = self::SM_TERMS_ONLY */)
    {
        /**
         * SegmentInfo->resetTermsStream() method actually takes two optional parameters:
         *   $startId (default value is 0)
         *   $mode (default value is self::SM_TERMS_ONLY)
         */
        $argList = func_get_args();
        if (count($argList) > 2) {
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong number of arguments');
        } else if (count($argList) == 2) {
            $startId = $argList[0];
            $mode    = $argList[1];
        } else if (count($argList) == 1) {
            $startId = $argList[0];
            $mode    = self::SM_TERMS_ONLY;
        } else {
            $startId = 0;
            $mode    = self::SM_TERMS_ONLY;
        }

        // Any previously opened dictionary file object is replaced
        $this->_tisFile       = $this->openCompoundFile('.tis', false);
        $this->_tisFileOffset = $this->_tisFile->tell();

        $tiVersion = $this->_tisFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */  &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
        }

        $this->_termCount     =
              $this->_termNum = $this->_tisFile->readLong(); // Read terms count
        $this->_indexInterval = $this->_tisFile->readInt();  // Read Index interval
        $this->_skipInterval  = $this->_tisFile->readInt();  // Read skip interval
        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            // Max skip levels value is not used, but it has to be consumed to reach the first term
            $this->_tisFile->readInt();
        }

        $this->_frqFile = null;
        $this->_prxFile = null;
        $this->_docMap  = array();

        // Start values, the first nextTerm() call computes the first term relative to them
        $this->_lastTerm          = new Zend_Search_Lucene_Index_Term('', -1);
        $this->_lastTermInfo      = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
        $this->_lastTermPositions = null;

        $this->_termsScanMode = $mode;

        switch ($mode) {
            case self::SM_TERMS_ONLY:
                // Do nothing
                break;

            case self::SM_FULL_INFO:
                // break intentionally omitted
            case self::SM_MERGE_INFO:
                $this->_frqFile       = $this->openCompoundFile('.frq', false);
                $this->_frqFileOffset = $this->_frqFile->tell();

                $this->_prxFile       = $this->openCompoundFile('.prx', false);
                $this->_prxFileOffset = $this->_prxFile->tell();

                // Deleted documents get no entry; in merge mode the remaining ids are compacted
                for ($count = 0; $count < $this->_docCount; $count++) {
                    if (!$this->isDeleted($count)) {
                        $this->_docMap[$count] = $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $count);
                    }
                }
                break;

            default:
                // require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.');
        }

        // Calculate next segment start id (since $this->_docMap structure may be cleaned by $this->nextTerm() call)
        $nextSegmentStartId = $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $this->_docCount);
        $this->nextTerm();

        return $nextSegmentStartId;
    }


    /**
     * Loads positions of the current term ($this->_lastTermInfo) into $this->_lastTermPositions.
     *
     * Reads the stream-mode .frq and .prx files; documents without a $this->_docMap entry are skipped.
     */
    private function _loadLastTermPositions()
    {
        $this->_lastTermPositions = array();

        $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
        $freqs = array();
        $docId = 0;
        for ($count = 0; $count < $this->_lastTermInfo->docFreq; $count++) {
            $docDelta = $this->_frqFile->readVInt();
            if ($docDelta % 2 == 1) {
                // Odd value: frequency is 1 and is not stored separately
                $docId += ($docDelta - 1) / 2;
                $freqs[$docId] = 1;
            } else {
                // Even value: frequency follows as the next VInt
                $docId += $docDelta / 2;
                $freqs[$docId] = $this->_frqFile->readVInt();
            }
        }

        $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
        foreach ($freqs as $docId => $freq) {
            $termPosition = 0;
            $positions    = array();

            // Positions are always read, so the file pointer is right for the next document
            for ($count = 0; $count < $freq; $count++) {
                $termPosition += $this->_prxFile->readVInt(); // positions are stored as deltas
                $positions[]   = $termPosition;
            }

            if (isset($this->_docMap[$docId])) {
                $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
            }
        }
    }


    /**
     * Skip terms stream up to the specified term prefix.
     *
     * Prefix contains fully specified field info and portion of searched term
     *
     * After the call the current term is the first term which is equal to or greater than
     * the prefix, or null if there is no such term in the segment.
     *
     * @param Zend_Search_Lucene_Index_Term $prefix
     * @throws Zend_Search_Lucene_Exception
     */
    public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
    {
        if ($this->_termDictionary === null) {
            $this->_loadDictionaryIndex();
        }

        $searchField = $this->getFieldNum($prefix->field);

        if ($searchField == -1) {
            /**
             * Field is not presented in this segment
             * Go to the end of dictionary
             */
            $this->_tisFile = null;
            $this->_frqFile = null;
            $this->_prxFile = null;

            $this->_lastTerm          = null;
            $this->_lastTermInfo      = null;
            $this->_lastTermPositions = null;

            return;
        }
        $searchDicField = $this->_getFieldPosition($searchField);

        // Binary search for the closest dictionary index entry
        $lowIndex  = 0;
        $highIndex = count($this->_termDictionary)-1;
        while ($highIndex >= $lowIndex) {
            $mid     = ($highIndex + $lowIndex) >> 1;
            $midTerm = $this->_termDictionary[$mid];

            $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
            $delta = $searchDicField - $fieldNum;
            if ($delta == 0) {
                $delta = strcmp($prefix->text, $midTerm[1] /* text */);
            }

            if ($delta < 0) {
                $highIndex = $mid-1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid+1;
            } else {
                // We have reached term we are looking for.
                // Remember its position, otherwise $highIndex may still point to a greater
                // entry and the stream would start behind the requested term.
                $highIndex = $mid;
                break;
            }
        }

        if ($highIndex == -1) {
            // Term is out of the dictionary range
            $this->_tisFile = null;
            $this->_frqFile = null;
            $this->_prxFile = null;

            $this->_lastTerm          = null;
            $this->_lastTermInfo      = null;
            $this->_lastTermPositions = null;

            return;
        }

        // $highIndex is now the exact match or the last index entry preceding the prefix
        $prevPosition = $highIndex;
        $prevTerm     = $this->_termDictionary[$prevPosition];
        $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

        if ($this->_tisFile === null) {
            // The end of terms stream is reached and terms dictionary file is closed
            // Perform mini-reset operation
            $this->_tisFile = $this->openCompoundFile('.tis', false);

            if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
                $this->_frqFile = $this->openCompoundFile('.frq', false);
                $this->_prxFile = $this->openCompoundFile('.prx', false);
            }
        }
        // Element 4 of the index entry info is used as an offset within the .tis data
        $this->_tisFile->seek($this->_tisFileOffset + $prevTermInfo[4], SEEK_SET);

        $this->_lastTerm     = new Zend_Search_Lucene_Index_Term($prevTerm[1] /* text */,
                                                                 ($prevTerm[0] == -1) ? '' : $this->_fields[$prevTerm[0] /* field */]->name);
        $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($prevTermInfo[0] /* docFreq */,
                                                                     $prevTermInfo[1] /* freqPointer */,
                                                                     $prevTermInfo[2] /* proxPointer */,
                                                                     $prevTermInfo[3] /* skipOffset */);
        $this->_termCount  =  $this->_termNum - $prevPosition*$this->_indexInterval;

        if ($highIndex == 0) {
            // skip start entry
            $this->nextTerm();
        } else if ($prefix->field == $this->_lastTerm->field  &&  $prefix->text  == $this->_lastTerm->text) {
            // We got exact match in the dictionary index

            if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
                $this->_loadLastTermPositions();
            }

            return;
        }

        // Search term matching specified prefix
        while ($this->_lastTerm !== null) {
            if ( strcmp($this->_lastTerm->field, $prefix->field) > 0  ||
                 ($prefix->field == $this->_lastTerm->field  &&  strcmp($this->_lastTerm->text, $prefix->text) >= 0) ) {
                // Current term matches or is greater than the pattern
                return;
            }

            $this->nextTerm();
        }
    }


    /**
     * Scans terms dictionary and returns next term
     *
     * Null is returned (and the stream resources are released) when the stream is
     * closed or there are no more terms.
     *
     * @return Zend_Search_Lucene_Index_Term|null
     */
    public function nextTerm()
    {
        if ($this->_tisFile === null  ||  $this->_termCount == 0) {
            $this->_lastTerm          = null;
            $this->_lastTermInfo      = null;
            $this->_lastTermPositions = null;
            $this->_docMap            = null;

            // may be necessary for "empty" segment
            $this->_tisFile = null;
            $this->_frqFile = null;
            $this->_prxFile = null;

            return null;
        }

        // A term is stored as the length of the prefix shared with the previous term plus a suffix
        $termPrefixLength = $this->_tisFile->readVInt();
        $termSuffix       = $this->_tisFile->readString();
        $termFieldNum     = $this->_tisFile->readVInt();
        $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $termPrefixLength) . $termSuffix;

        $this->_lastTerm = new Zend_Search_Lucene_Index_Term($termValue, $this->_fields[$termFieldNum]->name);

        // File pointers are stored as deltas relative to the previous term
        $docFreq     = $this->_tisFile->readVInt();
        $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
        $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
        if ($docFreq >= $this->_skipInterval) {
            $skipOffset = $this->_tisFile->readVInt();
        } else {
            $skipOffset = 0;
        }

        $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);


        if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
            $this->_loadLastTermPositions();
        }

        $this->_termCount--;
        if ($this->_termCount == 0) {
            // The last term has been read, release the files
            $this->_tisFile = null;
            $this->_frqFile = null;
            $this->_prxFile = null;
        }

        return $this->_lastTerm;
    }

    /**
     * Close terms stream
     *
     * Should be used for resources clean up if stream is not read up to the end
     */
    public function closeTermsStream()
    {
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;

        $this->_docMap            = null;
    }


    /**
     * Returns term in current position
     *
     * @return Zend_Search_Lucene_Index_Term|null
     */
    public function currentTerm()
    {
        return $this->_lastTerm;
    }


    /**
     * Returns an array of all term positions in the documents.
     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
     *
     * Null is returned if positions are not loaded (terms only scan mode or closed stream).
     *
     * @return array|null
     */
    public function currentTermPositions()
    {
        return $this->_lastTermPositions;
    }
}