File indexing completed on 2025-03-02 05:29:43

0001 <?php
0002 /**
0003  * Zend Framework
0004  *
0005  * LICENSE
0006  *
0007  * This source file is subject to the new BSD license that is bundled
0008  * with this package in the file LICENSE.txt.
0009  * It is also available through the world-wide-web at this URL:
0010  * http://framework.zend.com/license/new-bsd
0011  * If you did not receive a copy of the license and are unable to
0012  * obtain it through the world-wide-web, please send an email
0013  * to license@zend.com so we can send you a copy immediately.
0014  *
0015  * @category   Zend
0016  * @package    Zend_Search_Lucene
0017  * @subpackage Index
0018  * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
0019  * @license    http://framework.zend.com/license/new-bsd     New BSD License
0020  * @version    $Id$
0021  */
0022 
0023 
0024 /** Zend_Search_Lucene_Index_FieldInfo */
0025 // require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
0026 
0027 /** Zend_Search_Lucene_Index_Term */
0028 // require_once 'Zend/Search/Lucene/Index/Term.php';
0029 
0030 /** Zend_Search_Lucene_Index_TermInfo */
0031 // require_once 'Zend/Search/Lucene/Index/TermInfo.php';
0032 
/**
 * Base class for objects that write one index segment.
 *
 * It collects field definitions, writes stored fields (.fdx/.fdt), field
 * infos and norms (.fnm/.nrm), the term dictionary and its index (.tis/.tii),
 * frequencies (.frq) and positions (.prx), and can pack all files produced so
 * far into a single compound (.cfs) file. Concrete writers implement close().
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
abstract class Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Expert: term dictionary index interval.
     *
     * Every $indexInterval-th term passed to addTerm() is additionally written
     * to the term dictionary index (.tii) file. Smaller values use more memory,
     * but make searching slightly faster, while larger values use less memory
     * and make searching slightly slower. Searching is typically not dominated
     * by dictionary lookup, so tweaking this is rarely useful.
     *
     * @var integer
     */
    public static $indexInterval = 128;

    /**
     * Expert: The fraction of TermDocs entries stored in skip tables.
     * Larger values result in smaller indexes, greater acceleration, but fewer
     * accelerable cases, while smaller values result in bigger indexes,
     * less acceleration and more
     * accelerable cases. More detailed experiments would be useful here.
     *
     * 0x7FFFFFFF indicates that we don't use skip data
     *
     * Note: not used in current implementation
     *
     * @var integer
     */
    public static $skipInterval = 0x7FFFFFFF;

    /**
     * Expert: The maximum number of skip levels. Smaller values result in
     * slightly smaller indexes, but slower skipping in big posting lists.
     *
     * 0 indicates that we don't use skip data
     *
     * Note: not used in current implementation
     *
     * @var integer
     */
    public static $maxSkipLevels = 0;

    /**
     * Number of docs in a segment
     *
     * @var integer
     */
    protected $_docCount = 0;

    /**
     * Segment name
     *
     * @var string
     */
    protected $_name;

    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    protected $_directory;

    /**
     * List of the index file names created so far.
     * Used for automatic compound file generation
     *
     * @var array
     */
    protected $_files = array();

    /**
     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment,
     * keyed by field name
     *
     * @var array
     */
    protected $_fields = array();

    /**
     * Normalization factors.
     * An array fieldName => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * @var array
     */
    protected $_norms = array();


    /**
     * '.fdx'  file - Stored Fields, the field index.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdxFile = null;

    /**
     * '.fdt'  file - Stored Fields, the field data.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdtFile = null;


    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        $this->_directory = $directory;
        $this->_name      = $name;
    }


    /**
     * Add field to the segment
     *
     * Returns actual field number
     *
     * @param Zend_Search_Lucene_Field $field
     * @return integer
     */
    public function addField(Zend_Search_Lucene_Field $field)
    {
        return $this->_registerField($field->name, $field->isIndexed, $field->storeTermVector);
    }

    /**
     * Add fieldInfo to the segment
     *
     * Only name and flags of the given object are used; the field number is
     * assigned by this segment.
     *
     * Returns actual field number
     *
     * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
     * @return integer
     */
    public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
    {
        return $this->_registerField($fieldInfo->name, $fieldInfo->isIndexed, $fieldInfo->storeTermVector);
    }

    /**
     * Register a field by name or merge flags into an already known field.
     *
     * A new field gets the next free number (the current field count). For a
     * known field the flags are combined with logical OR, so they stay boolean
     * instead of degrading to integers as a bitwise OR would do.
     *
     * @param string  $name
     * @param boolean $isIndexed
     * @param boolean $storeTermVector
     * @return integer field number within this segment
     */
    private function _registerField($name, $isIndexed, $storeTermVector)
    {
        if (!isset($this->_fields[$name])) {
            $fieldNumber = count($this->_fields);
            $this->_fields[$name] = new Zend_Search_Lucene_Index_FieldInfo($name,
                                                                           $isIndexed,
                                                                           $fieldNumber,
                                                                           $storeTermVector);

            return $fieldNumber;
        }

        $knownField = $this->_fields[$name];
        $knownField->isIndexed       = ($knownField->isIndexed       || $isIndexed);
        $knownField->storeTermVector = ($knownField->storeTermVector || $storeTermVector);

        return $knownField->number;
    }

    /**
     * Returns array of FieldInfo objects.
     *
     * @return array
     */
    public function getFieldInfos()
    {
        return $this->_fields;
    }

    /**
     * Add stored fields information
     *
     * Writes one document entry: the current .fdt position goes to the .fdx
     * file, the field data goes to the .fdt file. Both files are created on
     * the first call. Every field must already be registered in the segment.
     *
     * @param array $storedFields array of Zend_Search_Lucene_Field objects
     */
    public function addStoredFields($storedFields)
    {
        if (!isset($this->_fdxFile)) {
            $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
            $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');

            $this->_files[] = $this->_name . '.fdx';
            $this->_files[] = $this->_name . '.fdt';
        }

        $this->_fdxFile->writeLong($this->_fdtFile->tell());
        $this->_fdtFile->writeVInt(count($storedFields));
        foreach ($storedFields as $field) {
            $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
            $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
                         ($field->isBinary ?    0x02 : 0x00) |
                         0x00; /* 0x04 - third bit, compressed (ZLIB) */
            $this->_fdtFile->writeByte($fieldBits);
            if ($field->isBinary) {
                $this->_fdtFile->writeVInt(strlen($field->value));
                $this->_fdtFile->writeBytes($field->value);
            } else {
                $this->_fdtFile->writeString($field->getUtf8Value());
            }
        }

        $this->_docCount++;
    }

    /**
     * Returns the total number of documents in this segment.
     *
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }

    /**
     * Return segment name
     *
     * @return string
     */
    public function getName()
    {
        return $this->_name;
    }

    /**
     * Dump Field Info (.fnm) segment file together with the norms (.nrm) file
     *
     * Norm vectors are written for indexed fields only, in field order.
     */
    protected function _dumpFNM()
    {
        $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
        $fnmFile->writeVInt(count($this->_fields));

        $nrmFile = $this->_directory->createFile($this->_name . '.nrm');
        // Write header
        $nrmFile->writeBytes('NRM');
        // Write format specifier
        $nrmFile->writeByte((int)0xFF);

        foreach ($this->_fields as $field) {
            $fnmFile->writeString($field->name);
            $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) |
                                ($field->storeTermVector ? 0x02 : 0x00)
// not supported yet            0x04 /* term positions are stored with the term vectors */ |
// not supported yet            0x08 /* term offsets are stored with the term vectors */   |
                               );

            if ($field->isIndexed) {
                // pre-2.1 index mode (not used now)
                // $normFileName = $this->_name . '.f' . $field->number;
                // $fFile = $this->_directory->createFile($normFileName);
                // $fFile->writeBytes($this->_norms[$field->name]);
                // $this->_files[] = $normFileName;

                $nrmFile->writeBytes($this->_norms[$field->name]);
            }
        }

        $this->_files[] = $this->_name . '.fnm';
        $this->_files[] = $this->_name . '.nrm';
    }



    /**
     * Term Dictionary file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tisFile = null;

    /**
     * Term Dictionary index file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tiiFile = null;

    /**
     * Frequencies file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_frqFile = null;

    /**
     * Positions file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_prxFile = null;

    /**
     * Number of written terms
     *
     * @var integer
     */
    private $_termCount;


    /**
     * Last saved term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevTerm;

    /**
     * Last saved term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevTermInfo;

    /**
     * Last saved index term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevIndexTerm;

    /**
     * Last saved index term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevIndexTermInfo;

    /**
     * Last term dictionary file position
     *
     * @var integer
     */
    private $_lastIndexPosition;

    /**
     * Create dictionary, frequency and positions files and write necessary headers
     *
     * Must be called before the first addTerm() call; it also resets the
     * "previous term" state and the term counter.
     */
    public function initializeDictionaryFiles()
    {
        $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
        $this->_tisFile->writeInt((int)0xFFFFFFFD);
        $this->_tisFile->writeLong(0 /* dummy data for terms count */);
        $this->_tisFile->writeInt(self::$indexInterval);
        $this->_tisFile->writeInt(self::$skipInterval);
        $this->_tisFile->writeInt(self::$maxSkipLevels);

        $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
        $this->_tiiFile->writeInt((int)0xFFFFFFFD);
        $this->_tiiFile->writeLong(0 /* dummy data for terms count */);
        $this->_tiiFile->writeInt(self::$indexInterval);
        $this->_tiiFile->writeInt(self::$skipInterval);
        $this->_tiiFile->writeInt(self::$maxSkipLevels);

        /** Dump dictionary header */
        $this->_tiiFile->writeVInt(0);                    // prefix length
        $this->_tiiFile->writeString('');                 // suffix
        $this->_tiiFile->writeInt((int)0xFFFFFFFF);       // field number
        $this->_tiiFile->writeByte((int)0x0F);
        $this->_tiiFile->writeVInt(0);                    // DocFreq
        $this->_tiiFile->writeVInt(0);                    // FreqDelta
        $this->_tiiFile->writeVInt(0);                    // ProxDelta
        $this->_tiiFile->writeVInt(24);                   // IndexDelta (size of the .tis header written above: 4 + 8 + 3*4 bytes)

        $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
        $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');

        $this->_files[] = $this->_name . '.tis';
        $this->_files[] = $this->_name . '.tii';
        $this->_files[] = $this->_name . '.frq';
        $this->_files[] = $this->_name . '.prx';

        $this->_prevTerm          = null;
        $this->_prevTermInfo      = null;
        $this->_prevIndexTerm     = null;
        $this->_prevIndexTermInfo = null;
        $this->_lastIndexPosition = 24;
        $this->_termCount         = 0;

    }

    /**
     * Add term
     *
     * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
     *
     * Writes the postings to the .frq/.prx files and the dictionary entry to
     * the .tis file; every $indexInterval-th term is also written to the .tii
     * file. Document ids and positions are stored as deltas to the previous
     * value.
     *
     * @param Zend_Search_Lucene_Index_Term $termEntry
     * @param array $termDocs
     */
    public function addTerm($termEntry, $termDocs)
    {
        $freqPointer = $this->_frqFile->tell();
        $proxPointer = $this->_prxFile->tell();

        $prevDoc = 0;
        foreach ($termDocs as $docId => $termPositions) {
            $docDelta = ($docId - $prevDoc)*2;
            $prevDoc = $docId;

            $positionCount = count($termPositions);
            if ($positionCount > 1) {
                // Even value: an explicit frequency follows
                $this->_frqFile->writeVInt($docDelta);
                $this->_frqFile->writeVInt($positionCount);
            } else {
                // Odd value: no explicit frequency is written
                $this->_frqFile->writeVInt($docDelta + 1);
            }

            $prevPosition = 0;
            foreach ($termPositions as $position) {
                $this->_prxFile->writeVInt($position - $prevPosition);
                $prevPosition = $position;
            }
        }

        if (count($termDocs) >= self::$skipInterval) {
            /**
             * @todo Write Skip Data to a freq file.
             * It's not used now, but make index more optimal
             */
            $skipOffset = $this->_frqFile->tell() - $freqPointer;
        } else {
            $skipOffset = 0;
        }

        $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
                                                  $this->_fields[$termEntry->field]->number);
        $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($termDocs),
                                                          $freqPointer, $proxPointer, $skipOffset);

        $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

        if (($this->_termCount + 1) % self::$indexInterval == 0) {
            $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

            $indexPosition = $this->_tisFile->tell();
            $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
            $this->_lastIndexPosition = $indexPosition;

        }
        $this->_termCount++;
    }

    /**
     * Close dictionary
     *
     * Replaces the dummy term counts written by initializeDictionaryFiles()
     * (at offset 4 of the .tis and .tii files) with the real values.
     */
    public function closeDictionaryFiles()
    {
        $this->_tisFile->seek(4);
        $this->_tisFile->writeLong($this->_termCount);

        $this->_tiiFile->seek(4);
        // + 1 is used to count an additional special index entry (empty term at the start of the list)
        $this->_tiiFile->writeLong(($this->_termCount - $this->_termCount % self::$indexInterval)/self::$indexInterval + 1);
    }


    /**
     * Dump Term Dictionary segment file entry.
     * Used to write entry to .tis or .tii files
     *
     * The term text is stored as a shared prefix length (counted in UTF-8
     * characters) plus the remaining suffix when the previous term belongs to
     * the same field. $prevTerm and $prevTermInfo are passed by reference and
     * are updated to $term / $termInfo.
     *
     * @param Zend_Search_Lucene_Storage_File $dicFile
     * @param Zend_Search_Lucene_Index_Term $prevTerm
     * @param Zend_Search_Lucene_Index_Term $term
     * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
     */
    protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
    {
        if (isset($prevTerm) && $prevTerm->field == $term->field) {
            $matchedBytes = 0;
            $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
            while ($matchedBytes < $maxBytes  &&
                   $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
                $matchedBytes++;
            }

            // Calculate actual matched UTF-8 pattern
            $prefixBytes = 0;
            $prefixChars = 0;
            while ($prefixBytes < $matchedBytes) {
                // Derive the sequence length (1..4 bytes) from the lead byte
                $leadByte  = ord($term->text[$prefixBytes]);
                $charBytes = 1;
                if (($leadByte & 0xC0) == 0xC0) {
                    $charBytes++;
                    if ($leadByte & 0x20) {
                        $charBytes++;
                        if ($leadByte & 0x10) {
                            $charBytes++;
                        }
                    }
                }

                if ($prefixBytes + $charBytes > $matchedBytes) {
                    // char crosses matched bytes boundary
                    // skip char
                    break;
                }

                $prefixChars++;
                $prefixBytes += $charBytes;
            }

            // Write prefix length
            $dicFile->writeVInt($prefixChars);
            // Write suffix
            $dicFile->writeString(substr($term->text, $prefixBytes));
        } else {
            // Write prefix length
            $dicFile->writeVInt(0);
            // Write suffix
            $dicFile->writeString($term->text);
        }
        // Write field number
        $dicFile->writeVInt($term->field);
        // DocFreq (the count of documents which contain the term)
        $dicFile->writeVInt($termInfo->docFreq);

        $prevTerm = $term;

        if (!isset($prevTermInfo)) {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer);
        } else {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
        }
        // Write SkipOffset - addTerm() makes it non-zero when $termInfo->docFreq >= self::$skipInterval
        if ($termInfo->skipOffset != 0) {
            $dicFile->writeVInt($termInfo->skipOffset);
        }

        $prevTermInfo = $termInfo;
    }


    /**
     * Generate compound index file
     *
     * Writes a directory of (data offset, file name) entries followed by the
     * content of every file listed in $_files, then deletes the source files.
     *
     * @throws RuntimeException if a source file yields no data before its
     *                          reported length has been copied
     */
    protected function _generateCFS()
    {
        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
        $cfsFile->writeVInt(count($this->_files));

        $dataOffsetPointers = array();
        foreach ($this->_files as $fileName) {
            $dataOffsetPointers[$fileName] = $cfsFile->tell();
            $cfsFile->writeLong(0); // write dummy data
            $cfsFile->writeString($fileName);
        }

        foreach ($this->_files as $fileName) {
            // Get actual data offset
            $dataOffset = $cfsFile->tell();
            // Seek to the data offset pointer
            $cfsFile->seek($dataOffsetPointers[$fileName]);
            // Write actual data offset value
            $cfsFile->writeLong($dataOffset);
            // Seek back to the end of file
            $cfsFile->seek($dataOffset);

            $dataFile = $this->_directory->getFileObject($fileName);

            $byteCount = $this->_directory->fileLength($fileName);
            while ($byteCount > 0) {
                $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));

                $readLength = is_string($data) ? strlen($data) : 0;
                if ($readLength == 0) {
                    // An empty read would leave $byteCount unchanged and loop forever
                    throw new RuntimeException(
                        "Unexpected end of data while copying '" . $fileName . "' into the compound file."
                    );
                }

                $byteCount -= $readLength;
                $cfsFile->writeBytes($data);
            }

            $this->_directory->deleteFile($fileName);
        }
    }


    /**
     * Close segment, write it to disk and return segment info
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo
     */
    abstract public function close();
}
0634