File indexing completed on 2025-03-02 05:29:43

0001 <?php
0002 /**
0003  * Zend Framework
0004  *
0005  * LICENSE
0006  *
0007  * This source file is subject to the new BSD license that is bundled
0008  * with this package in the file LICENSE.txt.
0009  * It is also available through the world-wide-web at this URL:
0010  * http://framework.zend.com/license/new-bsd
0011  * If you did not receive a copy of the license and are unable to
0012  * obtain it through the world-wide-web, please send an email
0013  * to license@zend.com so we can send you a copy immediately.
0014  *
0015  * @category   Zend
0016  * @package    Zend_Search_Lucene
0017  * @subpackage Index
0018  * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
0019  * @license    http://framework.zend.com/license/new-bsd     New BSD License
0020  * @version    $Id$
0021  */
0022 
0023 /** Zend_Search_Lucene_Index_SegmentWriter */
0024 // require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
0025 
0026 /**
0027  * @category   Zend
0028  * @package    Zend_Search_Lucene
0029  * @subpackage Index
0030  * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
0031  * @license    http://framework.zend.com/license/new-bsd     New BSD License
0032  */
class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Term dictionary.
     *
     * Maps a term key (as returned by Zend_Search_Lucene_Index_Term::key())
     * to the corresponding Zend_Search_Lucene_Index_Term object.
     *
     * @var array
     */
    protected $_termDictionary;

    /**
     * Postings collected for every term.
     *
     * Structure: term key => array(document number => array of positions).
     * The keys are the same keys used by $_termDictionary.
     *
     * @var array
     */
    protected $_termDocs;

    /**
     * Object constructor.
     *
     * Delegates to the parent segment writer and starts with an empty
     * term dictionary and empty postings.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        parent::__construct($directory, $name);

        $this->_termDocs       = array();
        $this->_termDictionary = array();
    }


    /**
     * Adds a document to this segment.
     *
     * For each field of the document:
     *  - indexed fields contribute terms and positions to the in-memory postings
     *    and a per-document norm byte;
     *  - indexed fields which produce no terms (empty value) are registered
     *    as non-indexed, non-tokenized copies;
     *  - stored fields are collected and passed to addStoredFields().
     *
     * Afterwards one norm byte is appended for every indexed field known to the
     * segment, so that each norms string stays aligned with the document numbers.
     *
     * Note: the exception is thrown while fields are being processed, so terms of
     * fields handled before the offending one have already been recorded.
     *
     * @param Zend_Search_Lucene_Document $document
     * @throws Zend_Search_Lucene_Exception if any field requests term vector storage
     */
    public function addDocument(Zend_Search_Lucene_Document $document)
    {
        $storedFields = array();
        $docNorms     = array();
        $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();

        foreach ($document->getFieldNames() as $fieldName) {
            $field = $document->getField($fieldName);

            if ($field->storeTermVector) {
                /**
                 * @todo term vector storing support
                 */
                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
            }

            if ($field->isIndexed) {
                // Number of terms the field contributed to this document
                // (token count for tokenized fields, 0 or 1 otherwise).
                $termCount = $field->isTokenized
                           ? $this->_indexTokenizedField($field)
                           : $this->_indexUntokenizedField($field);

                if ($termCount == 0) {
                    // Field contains empty value. Treat it as non-indexed and non-tokenized.
                    // Work on a copy so the caller's document is left untouched.
                    $field = clone $field;
                    $field->isIndexed = $field->isTokenized = false;
                } else {
                    $docNorms[$field->name] = chr($similarity->encodeNorm(
                        $similarity->lengthNorm($field->name, $termCount)
                        * $document->boost
                        * $field->boost
                    ));
                }
            }

            if ($field->isStored) {
                $storedFields[] = $field;
            }

            $this->addField($field);
        }

        foreach ($this->_fields as $fieldName => $field) {
            if (!$field->isIndexed) {
                continue;
            }

            if (!isset($this->_norms[$fieldName])) {
                // First norm for this field in the segment: back-fill one default
                // norm byte for each of the $this->_docCount earlier positions.
                $this->_norms[$fieldName] = str_repeat(
                    chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))),
                    $this->_docCount
                );
            }

            if (isset($docNorms[$fieldName])) {
                $this->_norms[$fieldName] .= $docNorms[$fieldName];
            } else {
                // The current document did not index this field: use the default norm.
                $this->_norms[$fieldName] .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0)));
            }
        }

        // NOTE(review): $this->_docCount is used above as the current document number;
        // presumably the parent's addStoredFields() advances it - confirm in the parent class.
        $this->addStoredFields($storedFields);
    }


    /**
     * Runs the default analyzer over a tokenized field and records every token
     * as a term occurrence of the current document.
     *
     * Positions are accumulated from the tokens' position increments,
     * starting from 0 before the first increment is applied.
     *
     * @param Zend_Search_Lucene_Field $field
     * @return integer number of tokens produced by the analyzer
     */
    private function _indexTokenizedField($field)
    {
        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        $analyzer->setInput($field->value, $field->encoding);

        $position   = 0;
        $tokenCount = 0;
        while (($token = $analyzer->nextToken()) !== null) {
            $tokenCount++;

            $term      = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
            $position += $token->getPositionIncrement();

            $this->_recordTermPosition($term, $position);
        }

        return $tokenCount;
    }


    /**
     * Records the whole UTF-8 value of an untokenized field as a single term
     * at position 0 of the current document.
     *
     * @param Zend_Search_Lucene_Field $field
     * @return integer 1 if a term was recorded, 0 if the field value is empty
     */
    private function _indexUntokenizedField($field)
    {
        $fieldUtf8Value = $field->getUtf8Value();
        if ($fieldUtf8Value == '') {
            return 0;
        }

        $this->_recordTermPosition(new Zend_Search_Lucene_Index_Term($fieldUtf8Value, $field->name), 0);

        return 1;
    }


    /**
     * Appends one position of a term to the postings of the current document,
     * registering the term in the dictionary on first use.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param integer $position
     */
    private function _recordTermPosition(Zend_Search_Lucene_Index_Term $term, $position)
    {
        $termKey = $term->key();

        if (!isset($this->_termDictionary[$termKey])) {
            // New term
            $this->_termDictionary[$termKey] = $term;
            $this->_termDocs[$termKey]       = array();
        }

        if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
            // First occurrence of the term in the current document
            $this->_termDocs[$termKey][$this->_docCount] = array();
        }

        $this->_termDocs[$termKey][$this->_docCount][] = $position;
    }


    /**
     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
     *
     * Terms are written in ascending string order of their keys, each together
     * with the postings collected for it.
     */
    protected function _dumpDictionary()
    {
        ksort($this->_termDictionary, SORT_STRING);

        $this->initializeDictionaryFiles();

        foreach ($this->_termDictionary as $termId => $term) {
            $this->addTerm($term, $this->_termDocs[$termId]);
        }

        $this->closeDictionaryFiles();
    }


    /**
     * Close segment, write it to disk and return segment info
     *
     * Nothing is written and null is returned if no documents were added.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo|null
     */
    public function close()
    {
        if ($this->_docCount == 0) {
            return null;
        }

        $this->_dumpFNM();
        $this->_dumpDictionary();

        $this->_generateCFS();

        return new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
                                                        $this->_name,
                                                        $this->_docCount,
                                                        -1,
                                                        null,
                                                        true,
                                                        true);
    }

}
0230