File indexing completed on 2025-03-02 05:29:43
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id$
 */

/** Zend_Search_Lucene_Index_SegmentWriter */
// require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';

/**
 * Segment writer that builds a segment in memory from documents added one
 * at a time, and writes the term dictionary out when the segment is closed.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Term Dictionary
     * Array of Zend_Search_Lucene_Index_Term objects, keyed by Term::key().
     * The postings for each term are kept in $_termDocs under the same key.
     *
     * @var array
     */
    protected $_termDictionary;

    /**
     * Documents, which contain the term
     *
     * Layout: term key => array(document number => array of term positions).
     *
     * @var array
     */
    protected $_termDocs;

    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        parent::__construct($directory, $name);

        $this->_termDocs       = array();
        $this->_termDictionary = array();
    }


    /**
     * Adds a document to this segment.
     *
     * Every indexed field contributes terms to the in-memory dictionary and
     * one norm byte for this document. Indexed fields that produce no terms
     * (empty value or no tokens) are registered as non-indexed copies.
     *
     * @param Zend_Search_Lucene_Document $document
     * @throws Zend_Search_Lucene_Exception if a field requests term vector storing
     */
    public function addDocument(Zend_Search_Lucene_Document $document)
    {
        /** Zend_Search_Lucene_Search_Similarity */
        // require_once 'Zend/Search/Lucene/Search/Similarity.php';

        $storedFields = array();
        $docNorms     = array();
        $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();

        foreach ($document->getFieldNames() as $fieldName) {
            $field = $document->getField($fieldName);

            if ($field->storeTermVector) {
                /**
                 * @todo term vector storing support
                 */
                // require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
            }

            if ($field->isIndexed) {
                if ($field->isTokenized) {
                    /** Zend_Search_Lucene_Analysis_Analyzer */
                    // require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

                    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
                    $analyzer->setInput($field->value, $field->encoding);

                    $position     = 0;
                    $tokenCounter = 0;
                    while (($token = $analyzer->nextToken()) !== null) {
                        $tokenCounter++;

                        $term      = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                        $position += $token->getPositionIncrement();

                        $this->_registerTermPosition($term, $position);
                    }

                    if ($tokenCounter === 0) {
                        // Field contains empty value. Treat it as non-indexed and non-tokenized.
                        // Work on a clone so the caller's field object is left unmodified.
                        $field = clone $field;
                        $field->isIndexed = $field->isTokenized = false;
                    } else {
                        $docNorms[$field->name] = chr($similarity->encodeNorm(
                            $similarity->lengthNorm($field->name, $tokenCounter)
                            * $document->boost
                            * $field->boost
                        ));
                    }
                } else if (($fieldUtf8Value = $field->getUtf8Value()) == '') {
                    // Field contains empty value. Treat it as non-indexed and non-tokenized.
                    // NOTE(review): the comparison is deliberately left loose, so null/false
                    // are treated as empty as well; confirm before tightening to ===.
                    $field = clone $field;
                    $field->isIndexed = $field->isTokenized = false;
                } else {
                    // Un-tokenized field: the whole value is a single term at position 0
                    $term = new Zend_Search_Lucene_Index_Term($fieldUtf8Value, $field->name);

                    $this->_registerTermPosition($term, 0);

                    $docNorms[$field->name] = chr($similarity->encodeNorm(
                        $similarity->lengthNorm($field->name, 1)
                        * $document->boost
                        * $field->boost
                    ));
                }
            }

            if ($field->isStored) {
                $storedFields[] = $field;
            }

            $this->addField($field);
        }

        // Append one norm byte per indexed field known to the segment, so every
        // norms string grows by exactly one byte for this document.
        foreach ($this->_fields as $fieldName => $field) {
            if (!$field->isIndexed) {
                continue;
            }

            if (!isset($this->_norms[$fieldName])) {
                // Field first seen as indexed now: back-fill the documents added earlier
                $this->_norms[$fieldName] = str_repeat(
                    $this->_emptyFieldNorm($similarity, $fieldName),
                    $this->_docCount
                );
            }

            if (isset($docNorms[$fieldName])) {
                $this->_norms[$fieldName] .= $docNorms[$fieldName];
            } else {
                $this->_norms[$fieldName] .= $this->_emptyFieldNorm($similarity, $fieldName);
            }
        }

        $this->addStoredFields($storedFields);
    }


    /**
     * Records one occurrence of a term in the document currently being added.
     *
     * Registers the term in the dictionary on first sight, opens a postings
     * entry for the current document number if there is none yet, and appends
     * the position to it.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param integer $position
     */
    private function _registerTermPosition(Zend_Search_Lucene_Index_Term $term, $position)
    {
        $termKey = $term->key();

        if (!isset($this->_termDictionary[$termKey])) {
            // New term
            $this->_termDictionary[$termKey] = $term;
            $this->_termDocs[$termKey]       = array();
        }

        if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
            // First occurrence of the term in the current document
            $this->_termDocs[$termKey][$this->_docCount] = array();
        }

        $this->_termDocs[$termKey][$this->_docCount][] = $position;
    }


    /**
     * Returns the single norm byte used for a document in which the given
     * field contributes no terms (length norm computed for zero tokens).
     *
     * @param Zend_Search_Lucene_Search_Similarity $similarity
     * @param string $fieldName
     * @return string
     */
    private function _emptyFieldNorm($similarity, $fieldName)
    {
        return chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0)));
    }


    /**
     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
     */
    protected function _dumpDictionary()
    {
        // Terms are written in string order of their keys
        ksort($this->_termDictionary, SORT_STRING);

        $this->initializeDictionaryFiles();

        foreach ($this->_termDictionary as $termId => $term) {
            $this->addTerm($term, $this->_termDocs[$termId]);
        }

        $this->closeDictionaryFiles();
    }


    /**
     * Close segment, write it to disk and return segment info
     *
     * Nothing is written when no documents were added; null is returned
     * in that case.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo|null
     */
    public function close()
    {
        if ($this->_docCount == 0) {
            return null;
        }

        $this->_dumpFNM();
        $this->_dumpDictionary();

        $this->_generateCFS();

        /** Zend_Search_Lucene_Index_SegmentInfo */
        // require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';

        return new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
                                                        $this->_name,
                                                        $this->_docCount,
                                                        -1,
                                                        null,
                                                        true,
                                                        true);
    }

}