File indexing completed on 2025-03-02 05:29:43
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id$
 */


/** Zend_Search_Lucene_Index_FieldInfo */
// require_once 'Zend/Search/Lucene/Index/FieldInfo.php';

/** Zend_Search_Lucene_Index_Term */
// require_once 'Zend/Search/Lucene/Index/Term.php';

/** Zend_Search_Lucene_Index_TermInfo */
// require_once 'Zend/Search/Lucene/Index/TermInfo.php';

/**
 * Base class for segment writers.
 *
 * Collects the field registry of a segment and provides the shared routines
 * that write the individual segment files through the storage directory:
 * stored fields (.fdx/.fdt), field infos and norms (.fnm/.nrm), the term
 * dictionary and its index (.tis/.tii), frequencies (.frq) and positions
 * (.prx). All written file names are tracked so that they can be packed into
 * a single compound (.cfs) file.
 *
 * Concrete writers implement close().
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
abstract class Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Expert: The fraction of terms in the "dictionary" which should be stored
     * in RAM.  Smaller values use more memory, but make searching slightly
     * faster, while larger values use less memory and make searching slightly
     * slower.  Searching is typically not dominated by dictionary lookup, so
     * tweaking this is rarely useful.
     *
     * In this class every $indexInterval-th added term is also written to
     * the term dictionary index (.tii) file.
     *
     * @var integer
     */
    public static $indexInterval = 128;

    /**
     * Expert: The fraction of TermDocs entries stored in skip tables.
     * Larger values result in smaller indexes, greater acceleration, but fewer
     * accelerable cases, while smaller values result in bigger indexes,
     * less acceleration and more
     * accelerable cases. More detailed experiments would be useful here.
     *
     * 0x7FFFFFFF indicates that we don't use skip data
     *
     * Note: not used in current implementation
     *
     * @var integer
     */
    public static $skipInterval = 0x7FFFFFFF;

    /**
     * Expert: The maximum number of skip levels. Smaller values result in
     * slightly smaller indexes, but slower skipping in big posting lists.
     *
     * 0 indicates that we don't use skip data
     *
     * Note: not used in current implementation
     *
     * @var integer
     */
    public static $maxSkipLevels = 0;

    /**
     * Number of docs in a segment
     *
     * @var integer
     */
    protected $_docCount = 0;

    /**
     * Segment name
     *
     * @var string
     */
    protected $_name;

    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    protected $_directory;

    /**
     * List of the index files (file names).
     * Used for automatic compound file generation
     *
     * @var array
     */
    protected $_files = array();

    /**
     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment,
     * keyed by field name
     *
     * @var array
     */
    protected $_fields = array();

    /**
     * Normalization factors.
     * An array fieldName => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * @var array
     */
    protected $_norms = array();


    /**
     * '.fdx' file - Stored Fields, the field index.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdxFile = null;

    /**
     * '.fdt' file - Stored Fields, the field data.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdtFile = null;


    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        $this->_directory = $directory;
        $this->_name      = $name;
    }


    /**
     * Add field to the segment
     *
     * A new field gets the next free field number. For an already known
     * field the "indexed" and "store term vector" flags are merged (logical
     * OR) into the existing entry.
     *
     * Returns actual field number
     *
     * @param Zend_Search_Lucene_Field $field
     * @return integer
     */
    public function addField(Zend_Search_Lucene_Field $field)
    {
        return $this->_registerField($field->name,
                                     $field->isIndexed,
                                     $field->storeTermVector);
    }

    /**
     * Add fieldInfo to the segment
     *
     * Only name and flags of the given object are used; the field number is
     * assigned by this segment. For an already known field the flags are
     * merged (logical OR) into the existing entry.
     *
     * Returns actual field number
     *
     * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
     * @return integer
     */
    public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
    {
        return $this->_registerField($fieldInfo->name,
                                     $fieldInfo->isIndexed,
                                     $fieldInfo->storeTermVector);
    }

    /**
     * Register a field by name or merge flags into an existing registration.
     *
     * Shared implementation of addField() and addFieldInfo().
     *
     * @param string  $name
     * @param boolean $isIndexed
     * @param boolean $storeTermVector
     * @return integer field number within this segment
     */
    private function _registerField($name, $isIndexed, $storeTermVector)
    {
        if (!isset($this->_fields[$name])) {
            // Field numbers are assigned sequentially in registration order
            $fieldNumber = count($this->_fields);
            $this->_fields[$name] =
                    new Zend_Search_Lucene_Index_FieldInfo($name,
                                                           $isIndexed,
                                                           $fieldNumber,
                                                           $storeTermVector);

            return $fieldNumber;
        }

        $knownField = $this->_fields[$name];

        // Logical OR keeps the flags boolean (a bitwise "|=" would turn them into integers)
        $knownField->isIndexed       = $knownField->isIndexed       || $isIndexed;
        $knownField->storeTermVector = $knownField->storeTermVector || $storeTermVector;

        return $knownField->number;
    }

    /**
     * Returns array of FieldInfo objects.
     *
     * @return array
     */
    public function getFieldInfos()
    {
        return $this->_fields;
    }

    /**
     * Add stored fields information
     *
     * Writes one document entry: the current '.fdt' position goes to the
     * '.fdx' file, then field count and the fields themselves go to the
     * '.fdt' file. Both files are created on the first call.
     * Increments the segment document counter.
     *
     * Each field has to be registered (see addField()) before, since its
     * number is looked up in the segment field list.
     *
     * @param array $storedFields array of Zend_Search_Lucene_Field objects
     */
    public function addStoredFields($storedFields)
    {
        if (!isset($this->_fdxFile)) {
            $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
            $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');

            $this->_files[] = $this->_name . '.fdx';
            $this->_files[] = $this->_name . '.fdt';
        }

        // Field index entry points to the start of the document data
        $this->_fdxFile->writeLong($this->_fdtFile->tell());
        $this->_fdtFile->writeVInt(count($storedFields));
        foreach ($storedFields as $field) {
            $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
            $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
                         ($field->isBinary ?    0x02 : 0x00) |
                         0x00; /* 0x04 - third bit, compressed (ZLIB) */
            $this->_fdtFile->writeByte($fieldBits);
            if ($field->isBinary) {
                // Binary values are stored as byte length + raw bytes
                $this->_fdtFile->writeVInt(strlen($field->value));
                $this->_fdtFile->writeBytes($field->value);
            } else {
                $this->_fdtFile->writeString($field->getUtf8Value());
            }
        }

        $this->_docCount++;
    }

    /**
     * Returns the total number of documents in this segment.
     *
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }

    /**
     * Return segment name
     *
     * @return string
     */
    public function getName()
    {
        return $this->_name;
    }

    /**
     * Dump Field Info (.fnm) segment file
     *
     * Also writes the norms (.nrm) file: 'NRM' header, format byte and then
     * the norm vector of each indexed field in field order. The norm vectors
     * are taken from $this->_norms, which is not filled by this class
     * (presumably populated by the concrete writer).
     */
    protected function _dumpFNM()
    {
        $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
        $fnmFile->writeVInt(count($this->_fields));

        $nrmFile = $this->_directory->createFile($this->_name . '.nrm');
        // Write header
        $nrmFile->writeBytes('NRM');
        // Write format specifier
        $nrmFile->writeByte((int)0xFF);

        foreach ($this->_fields as $field) {
            $fnmFile->writeString($field->name);
            $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) |
                                ($field->storeTermVector ? 0x02 : 0x00)
// not supported yet            0x04 /* term positions are stored with the term vectors */ |
// not supported yet            0x08 /* term offsets are stored with the term vectors */   |
                               );

            if ($field->isIndexed) {
                // pre-2.1 index mode (not used now)
                // $normFileName = $this->_name . '.f' . $field->number;
                // $fFile = $this->_directory->createFile($normFileName);
                // $fFile->writeBytes($this->_norms[$field->name]);
                // $this->_files[] = $normFileName;

                $nrmFile->writeBytes($this->_norms[$field->name]);
            }
        }

        $this->_files[] = $this->_name . '.fnm';
        $this->_files[] = $this->_name . '.nrm';
    }



    /**
     * Term Dictionary file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tisFile = null;

    /**
     * Term Dictionary index file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tiiFile = null;

    /**
     * Frequencies file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_frqFile = null;

    /**
     * Positions file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_prxFile = null;

    /**
     * Number of written terms
     *
     * @var integer
     */
    private $_termCount;


    /**
     * Last saved term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevTerm;

    /**
     * Last saved term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevTermInfo;

    /**
     * Last saved index term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevIndexTerm;

    /**
     * Last saved index term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevIndexTermInfo;

    /**
     * Last term dictionary file position
     *
     * @var integer
     */
    private $_lastIndexPosition;

    /**
     * Create dictionary, frequency and positions files and write necessary headers
     *
     * Has to be called before the first addTerm() call, since it creates the
     * files addTerm() writes to and resets the dictionary writing state.
     */
    public function initializeDictionaryFiles()
    {
        $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
        $this->_tisFile->writeInt((int)0xFFFFFFFD);
        $this->_tisFile->writeLong(0 /* dummy data for terms count */);
        $this->_tisFile->writeInt(self::$indexInterval);
        $this->_tisFile->writeInt(self::$skipInterval);
        $this->_tisFile->writeInt(self::$maxSkipLevels);

        $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
        $this->_tiiFile->writeInt((int)0xFFFFFFFD);
        $this->_tiiFile->writeLong(0 /* dummy data for terms count */);
        $this->_tiiFile->writeInt(self::$indexInterval);
        $this->_tiiFile->writeInt(self::$skipInterval);
        $this->_tiiFile->writeInt(self::$maxSkipLevels);

        /** Dump dictionary header (special empty term at the start of the index) */
        $this->_tiiFile->writeVInt(0);                  // prefix length
        $this->_tiiFile->writeString('');               // suffix
        // Field number is written as five raw bytes FF FF FF FF 0F
        // (presumably the VInt encoding of -1 - confirm against writeVInt())
        $this->_tiiFile->writeInt((int)0xFFFFFFFF);     // field number
        $this->_tiiFile->writeByte((int)0x0F);
        $this->_tiiFile->writeVInt(0);                  // DocFreq
        $this->_tiiFile->writeVInt(0);                  // FreqDelta
        $this->_tiiFile->writeVInt(0);                  // ProxDelta
        // 24 matches the '.tis' header written above (int + long + 3 ints),
        // assuming writeInt()/writeLong() emit 4/8 bytes
        $this->_tiiFile->writeVInt(24);                 // IndexDelta

        $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
        $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');

        $this->_files[] = $this->_name . '.tis';
        $this->_files[] = $this->_name . '.tii';
        $this->_files[] = $this->_name . '.frq';
        $this->_files[] = $this->_name . '.prx';

        $this->_prevTerm          = null;
        $this->_prevTermInfo      = null;
        $this->_prevIndexTerm     = null;
        $this->_prevIndexTermInfo = null;
        $this->_lastIndexPosition = 24;
        $this->_termCount         = 0;

    }

    /**
     * Add term
     *
     * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
     *
     * Writes document/frequency data to the '.frq' file, position deltas to
     * the '.prx' file and the term entry to the '.tis' file. Every
     * $indexInterval-th term is additionally written to the '.tii' file.
     *
     * Document ids and positions are written as deltas to the previous
     * value, starting from 0.
     *
     * @param Zend_Search_Lucene_Index_Term $termEntry
     * @param array $termDocs
     */
    public function addTerm($termEntry, $termDocs)
    {
        $freqPointer = $this->_frqFile->tell();
        $proxPointer = $this->_prxFile->tell();

        $prevDoc = 0;
        foreach ($termDocs as $docId => $termPositions) {
            // Doc delta is shifted left by one bit; the low bit tells that
            // no separate frequency value follows
            $docDelta = ($docId - $prevDoc)*2;
            $prevDoc = $docId;
            if (count($termPositions) > 1) {
                $this->_frqFile->writeVInt($docDelta);
                $this->_frqFile->writeVInt(count($termPositions));
            } else {
                $this->_frqFile->writeVInt($docDelta + 1);
            }

            $prevPosition = 0;
            foreach ($termPositions as $position) {
                $this->_prxFile->writeVInt($position - $prevPosition);
                $prevPosition = $position;
            }
        }

        if (count($termDocs) >= self::$skipInterval) {
            /**
             * @todo Write Skip Data to a freq file.
             * It's not used now, but make index more optimal
             */
            $skipOffset = $this->_frqFile->tell() - $freqPointer;
        } else {
            $skipOffset = 0;
        }

        // Dictionary entries refer to the field by its number within the segment
        $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
                                                  $this->_fields[$termEntry->field]->number);
        $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($termDocs),
                                                          $freqPointer, $proxPointer, $skipOffset);

        $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

        if (($this->_termCount + 1) % self::$indexInterval == 0) {
            $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

            // Index entry additionally stores the '.tis' position as a delta
            $indexPosition = $this->_tisFile->tell();
            $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
            $this->_lastIndexPosition = $indexPosition;

        }
        $this->_termCount++;
    }

    /**
     * Close dictionary
     *
     * Replaces the dummy term counts written by initializeDictionaryFiles()
     * (at offset 4 of the '.tis' and '.tii' files) with the actual values.
     */
    public function closeDictionaryFiles()
    {
        $this->_tisFile->seek(4);
        $this->_tisFile->writeLong($this->_termCount);

        $this->_tiiFile->seek(4);
        // + 1 is used to count an additional special index entry (empty term at the start of the list)
        $this->_tiiFile->writeLong(($this->_termCount - $this->_termCount % self::$indexInterval)/self::$indexInterval + 1);
    }


    /**
     * Dump Term Dictionary segment file entry.
     * Used to write entry to .tis or .tii files
     *
     * The term text is written as the number of leading characters shared
     * with the previous term of the same field plus the remaining suffix.
     * Pointers are written as deltas to the previous term info.
     *
     * $prevTerm and $prevTermInfo are passed by reference and are replaced
     * with $term and $termInfo.
     *
     * @param Zend_Search_Lucene_Storage_File $dicFile
     * @param Zend_Search_Lucene_Index_Term $prevTerm
     * @param Zend_Search_Lucene_Index_Term $term
     * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
     */
    protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
    {
        if (isset($prevTerm) && $prevTerm->field == $term->field) {
            // Length of the common byte prefix
            $matchedBytes = 0;
            $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
            while ($matchedBytes < $maxBytes  &&
                   $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
                $matchedBytes++;
            }

            // Calculate actual matched UTF-8 pattern
            $prefixBytes = 0;
            $prefixChars = 0;
            while ($prefixBytes < $matchedBytes) {
                // Character length is derived from the lead byte (1 to 4 bytes)
                $charBytes = 1;
                if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
                    $charBytes++;
                    if (ord($term->text[$prefixBytes]) & 0x20 ) {
                        $charBytes++;
                        if (ord($term->text[$prefixBytes]) & 0x10 ) {
                            $charBytes++;
                        }
                    }
                }

                if ($prefixBytes + $charBytes > $matchedBytes) {
                    // char crosses matched bytes boundary
                    // skip char
                    break;
                }

                $prefixChars++;
                $prefixBytes += $charBytes;
            }

            // Write prefix length
            $dicFile->writeVInt($prefixChars);
            // Write suffix
            $dicFile->writeString(substr($term->text, $prefixBytes));
        } else {
            // Write prefix length
            $dicFile->writeVInt(0);
            // Write suffix
            $dicFile->writeString($term->text);
        }
        // Write field number
        $dicFile->writeVInt($term->field);
        // DocFreq (the count of documents which contain the term)
        $dicFile->writeVInt($termInfo->docFreq);

        $prevTerm = $term;

        if (!isset($prevTermInfo)) {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer);
        } else {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
        }
        // Write SkipOffset - addTerm() sets it to a non-zero value only when $termInfo->docFreq >= self::$skipInterval
        if ($termInfo->skipOffset != 0) {
            $dicFile->writeVInt($termInfo->skipOffset);
        }

        $prevTermInfo = $termInfo;
    }


    /**
     * Generate compound index file
     *
     * Writes the file table (data offset placeholder + file name per file),
     * then appends the content of each tracked segment file, patches its
     * data offset in the table and deletes the source file.
     *
     * @throws RuntimeException if a source file delivers no more data before
     *                          its reported length has been copied
     */
    protected function _generateCFS()
    {
        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
        $cfsFile->writeVInt(count($this->_files));

        $dataOffsetPointers = array();
        foreach ($this->_files as $fileName) {
            $dataOffsetPointers[$fileName] = $cfsFile->tell();
            $cfsFile->writeLong(0); // write dummy data
            $cfsFile->writeString($fileName);
        }

        foreach ($this->_files as $fileName) {
            // Get actual data offset
            $dataOffset = $cfsFile->tell();
            // Seek to the data offset pointer
            $cfsFile->seek($dataOffsetPointers[$fileName]);
            // Write actual data offset value
            $cfsFile->writeLong($dataOffset);
            // Seek back to the end of file
            $cfsFile->seek($dataOffset);

            $dataFile = $this->_directory->getFileObject($fileName);

            $byteCount = $this->_directory->fileLength($fileName);
            while ($byteCount > 0) {
                $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));

                $bytesRead = strlen((string)$data);
                if ($bytesRead === 0) {
                    // An empty read would never decrease $byteCount and the loop would spin forever
                    throw new RuntimeException(
                        "Unexpected end of data in '" . $fileName
                        . "' while generating compound file '" . $this->_name . ".cfs'."
                    );
                }

                $byteCount -= $bytesRead;
                $cfsFile->writeBytes($data);
            }

            $this->_directory->deleteFile($fileName);
        }
    }


    /**
     * Close segment, write it to disk and return segment info
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo
     */
    abstract public function close();
}
