File indexing completed on 2025-01-19 05:21:25
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id$
 */

/**
 * Dictionary loader
 *
 * It's a dummy class which exists only to encapsulate poorly structured code.
 * Manual "method inlining" is performed to speed up the dictionary index loading
 * operation, which is a major bottleneck for search performance.
 *
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_DictionaryLoader
{
    /**
     * Dictionary index loader.
     *
     * It takes a string which is actually <segment_name>.tii index file data and
     * returns two arrays - term and termInfo lists.
     *
     * The result is array($termDictionary, $termInfos) where
     *  - each $termDictionary entry is array($termFieldNum, $termValue);
     *  - each $termInfos entry is
     *    array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer).
     * The first dictionary entry is the special index entry mark; on 64-bit
     * platforms its field number is normalized to -1.
     *
     * See Zend_Search_Lucene_Index_SegmentInfo class for details
     *
     * @param string $data
     * @return array
     * @throws Zend_Search_Lucene_Exception  if the data is not a supported, complete .tii image
     */
    public static function load($data)
    {
        $termDictionary = array();
        $termInfos      = array();
        $pos            = 0;
        $dataLength     = strlen($data);

        // Fixed-size header: version (4) + term count (8) + index interval (4) + skip interval (4).
        // Reject shorter input up front instead of reading undefined string offsets.
        if ($dataLength < 20) {
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // $tiVersion = $tiiFile->readInt();
        $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8  | ord($data[3]);
        $pos += 4;
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // $indexTermCount = $tiiFile->readLong();
        if (PHP_INT_SIZE > 4) {
            $indexTermCount = ord($data[$pos])   << 56 |
                              ord($data[$pos+1]) << 48 |
                              ord($data[$pos+2]) << 40 |
                              ord($data[$pos+3]) << 32 |
                              ord($data[$pos+4]) << 24 |
                              ord($data[$pos+5]) << 16 |
                              ord($data[$pos+6]) << 8  |
                              ord($data[$pos+7]);
        } else {
            // 32-bit integers can only hold the low 31 bits of the 64-bit value
            if ((ord($data[$pos])            != 0) ||
                (ord($data[$pos+1])          != 0) ||
                (ord($data[$pos+2])          != 0) ||
                (ord($data[$pos+3])          != 0) ||
                ((ord($data[$pos+4]) & 0x80) != 0)) {
                // require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
            }

            $indexTermCount = ord($data[$pos+4]) << 24 |
                              ord($data[$pos+5]) << 16 |
                              ord($data[$pos+6]) << 8  |
                              ord($data[$pos+7]);
        }
        $pos += 8;

        // $tiiFile->readInt(); // IndexInterval (not used here)
        $pos += 4;

        // $skipInterval = $tiiFile->readInt();
        $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8  | ord($data[$pos+3]);
        $pos += 4;
        if ($indexTermCount < 1) {
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
        }

        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            /* Skip MaxSkipLevels value */
            $pos += 4;
        }

        $prevTerm     = '';
        $freqPointer  = 0;
        $proxPointer  = 0;
        $indexPointer = 0;
        for ($count = 0; $count < $indexTermCount; $count++) {
            // The header promised more entries than the buffer holds: stop instead of
            // looping over undefined offsets for a (possibly huge) corrupted term count.
            if ($pos >= $dataLength) {
                // require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
            }

            // $termPrefixLength = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termPrefixLength = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termPrefixLength |= ($nbyte & 0x7F) << $shift;
            }

            // $termSuffix = $tiiFile->readString();
            $nbyte = ord($data[$pos++]);
            $len = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $len |= ($nbyte & 0x7F) << $shift;
            }
            if ($len == 0) {
                $termSuffix = '';
            } else {
                // $len counts characters, each of which takes at least one byte
                if ($pos + $len > $dataLength) {
                    // require_once 'Zend/Search/Lucene/Exception.php';
                    throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
                }

                // Start with one byte per character and widen the window by the number of
                // continuation bytes every time a multi-byte lead byte is met.
                $termSuffix = substr($data, $pos, $len);
                $pos += $len;
                for ($count1 = 0; $count1 < $len; $count1++) {
                    $lead = ord($termSuffix[$count1]);
                    if (($lead & 0xC0) == 0xC0) {
                        $addBytes = 1;
                        if ($lead & 0x20) {
                            $addBytes++;

                            // Never used for Java Lucene created index.
                            // Java2 doesn't encode strings in four bytes
                            if ($lead & 0x10) {
                                $addBytes++;
                            }
                        }
                        $termSuffix .= substr($data, $pos, $addBytes);
                        $pos += $addBytes;
                        $len += $addBytes;

                        // Check for null character. Java2 encodes null character
                        // in two bytes (0xC0 0x80).
                        if ($lead == 0xC0 &&
                            isset($termSuffix[$count1 + 1]) &&
                            $termSuffix[$count1 + 1] === "\x80") {
                            // Collapse the pair into a real NUL byte. Assigning the integer 0
                            // to a string offset would store the digit "0" instead.
                            $termSuffix = substr($termSuffix, 0, $count1)
                                        . "\x00"
                                        . substr($termSuffix, $count1 + 2);
                            // The string is one byte shorter now; the loop increment alone
                            // moves past the NUL, so $count1 must not be advanced here.
                            $len--;
                        } else {
                            $count1 += $addBytes;
                        }
                    }
                }
            }

            // $termValue = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
            // $pb counts prefix bytes, $pc counts prefix characters
            $pb = 0; $pc = 0;
            $prevTermLength = strlen($prevTerm);
            while ($pb < $prevTermLength  &&  $pc < $termPrefixLength) {
                $charBytes = 1;
                $lead = ord($prevTerm[$pb]);
                if (($lead & 0xC0) == 0xC0) {
                    $charBytes++;
                    if ($lead & 0x20) {
                        $charBytes++;
                        if ($lead & 0x10) {
                            $charBytes++;
                        }
                    }
                }

                // The guard must be checked against the previous term, not the whole buffer
                if ($pb + $charBytes > $prevTermLength) {
                    // wrong character
                    break;
                }

                $pc++;
                $pb += $charBytes;
            }
            $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

            // $termFieldNum = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termFieldNum = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termFieldNum |= ($nbyte & 0x7F) << $shift;
            }

            // $docFreq = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $docFreq = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $docFreq |= ($nbyte & 0x7F) << $shift;
            }

            // $freqPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $freqPointer += $vint;

            // $proxPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $proxPointer += $vint;

            // The skip delta is stored only for terms with docFreq >= skipInterval
            if ($docFreq >= $skipInterval) {
                // $skipDelta = $tiiFile->readVInt();
                $nbyte = ord($data[$pos++]);
                $vint = $nbyte & 0x7F;
                for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                    $nbyte = ord($data[$pos++]);
                    $vint |= ($nbyte & 0x7F) << $shift;
                }
                $skipDelta = $vint;
            } else {
                $skipDelta = 0;
            }

            // $indexPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $indexPointer += $vint;


            // $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
            $termDictionary[] = array($termFieldNum, $termValue);

            $termInfos[] =
                 // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
                 array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

            $prevTerm = $termValue;
        }

        // The last entry ran past the end of the buffer, i.e. the data is truncated
        if ($pos > $dataLength) {
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // Check special index entry mark
        if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        if (PHP_INT_SIZE > 4) {
            // Treat 64-bit 0xFFFFFFFF as -1
            $termDictionary[0][0] = -1;
        }

        return array($termDictionary, $termInfos);
    }
}