File indexing completed on 2025-01-19 05:21:25

0001 <?php
0002 /**
0003  * Zend Framework
0004  *
0005  * LICENSE
0006  *
0007  * This source file is subject to the new BSD license that is bundled
0008  * with this package in the file LICENSE.txt.
0009  * It is also available through the world-wide-web at this URL:
0010  * http://framework.zend.com/license/new-bsd
0011  * If you did not receive a copy of the license and are unable to
0012  * obtain it through the world-wide-web, please send an email
0013  * to license@zend.com so we can send you a copy immediately.
0014  *
0015  * @category   Zend
0016  * @package    Zend_Search_Lucene
0017  * @subpackage Index
0018  * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
0019  * @license    http://framework.zend.com/license/new-bsd     New BSD License
0020  * @version    $Id$
0021  */
0022 
0023 /**
0024  * Dictionary loader
0025  *
0026  * It's a dummy class which is created to encapsulate poorly structured code.
0027  * Manual "method inlining" is performed to speed up the dictionary index loading
0028  * operation, which is a major bottleneck for search performance.
0029  *
0030  *
0031  * @category   Zend
0032  * @package    Zend_Search_Lucene
0033  * @subpackage Index
0034  * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
0035  * @license    http://framework.zend.com/license/new-bsd     New BSD License
0036  */
class Zend_Search_Lucene_Index_DictionaryLoader
{
    /**
     * Dictionary index loader.
     *
     * Parses the raw contents of a <segment_name>.tii index file and returns
     * two parallel lists: the index terms and their term infos.
     *
     * The integer, string and prefix decoding is deliberately written out
     * inline (no helper calls) because this loop is performance critical;
     * every inlined section is preceded by a comment naming the call it
     * stands in for.
     *
     * Returned structure:
     *   array(
     *       0 => list of array($termFieldNum, $termValue),
     *       1 => list of array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer)
     *   )
     * The field number of the first (special) entry is normalised to -1.
     *
     * See Zend_Search_Lucene_Index_SegmentInfo class for details
     *
     * @param string $data  Binary contents of the .tii file
     * @return array
     * @throws Zend_Search_Lucene_Exception  If the format marker is unknown, the term
     *                                       count is less than one, the term count does
     *                                       not fit a 32-bit integer (32-bit PHP only),
     *                                       or the first entry is not the special mark
     */
    public static function load($data)
    {
        $termDictionary = array();
        $termInfos      = array();
        $pos = 0;

        // Inlined: $tiVersion = $tiiFile->readInt();   (4 bytes, big-endian)
        $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8  | ord($data[3]);
        $pos += 4;
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // Inlined: $indexTermCount = $tiiFile->readLong();   (8 bytes, big-endian)
        if (PHP_INT_SIZE > 4) {
            $indexTermCount = ord($data[$pos]) << 56  |
                              ord($data[$pos+1]) << 48  |
                              ord($data[$pos+2]) << 40  |
                              ord($data[$pos+3]) << 32  |
                              ord($data[$pos+4]) << 24  |
                              ord($data[$pos+5]) << 16  |
                              ord($data[$pos+6]) << 8   |
                              ord($data[$pos+7]);
        } else {
            // 32-bit integers: the upper four bytes and the sign bit of the
            // lower four must be clear, otherwise the value cannot be held.
            if ((ord($data[$pos])            != 0) ||
                (ord($data[$pos+1])          != 0) ||
                (ord($data[$pos+2])          != 0) ||
                (ord($data[$pos+3])          != 0) ||
                ((ord($data[$pos+4]) & 0x80) != 0)) {
                throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
            }

            $indexTermCount = ord($data[$pos+4]) << 24  |
                              ord($data[$pos+5]) << 16  |
                              ord($data[$pos+6]) << 8   |
                              ord($data[$pos+7]);
        }
        $pos += 8;

        // Inlined: $tiiFile->readInt();   (IndexInterval, value not needed here)
        $pos += 4;

        // Inlined: $skipInterval = $tiiFile->readInt();
        $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8  | ord($data[$pos+3]);
        $pos += 4;
        if ($indexTermCount < 1) {
            throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
        }

        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            /* Skip MaxSkipLevels value */
            $pos += 4;
        }

        // Terms are prefix-compressed against the previous term, and the three
        // pointers are stored as deltas, so these accumulate across iterations.
        $prevTerm     = '';
        $freqPointer  =  0;
        $proxPointer  =  0;
        $indexPointer =  0;
        for ($count = 0; $count < $indexTermCount; $count++) {
            // Inlined: $termPrefixLength = $tiiFile->readVInt();
            // VInt: 7 payload bits per byte, low-order group first; a set high
            // bit means another byte follows.
            $nbyte = ord($data[$pos++]);
            $termPrefixLength = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termPrefixLength |= ($nbyte & 0x7F) << $shift;
            }

            // Inlined: $termSuffix = $tiiFile->readString();
            // The stored length counts characters; it is used as an initial
            // byte count and grown whenever a multi-byte lead byte is found.
            $nbyte = ord($data[$pos++]);
            $len = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $len |= ($nbyte & 0x7F) << $shift;
            }
            if ($len == 0) {
                $termSuffix = '';
            } else {
                $termSuffix = substr($data, $pos, $len);
                $pos += $len;
                for ($count1 = 0; $count1 < $len; $count1++ ) {
                    $leadByte = ord($termSuffix[$count1]);
                    if (($leadByte & 0xC0) == 0xC0) {
                        // Lead byte of a multi-byte character: pull in the
                        // continuation bytes the initial read did not cover.
                        $addBytes = 1;
                        if ($leadByte & 0x20) {
                            $addBytes++;

                            // Never used for Java Lucene created index.
                            // Java2 doesn't encode strings in four bytes
                            if ($leadByte & 0x10) {
                                $addBytes++;
                            }
                        }
                        $termSuffix .= substr($data, $pos, $addBytes);
                        $pos += $addBytes;
                        $len += $addBytes;

                        // Check for null character. Java2 encodes null character
                        // in two bytes (0xC0 0x80).
                        if ($leadByte == 0xC0 &&
                            ord($termSuffix[$count1+1]) == 0x80) {
                            // Collapse the pair into one real NUL byte. A string
                            // literal is required: assigning integer 0 to a string
                            // offset would store the digit '0'.
                            $termSuffix = substr($termSuffix, 0, $count1)
                                        . "\0"
                                        . substr($termSuffix, $count1 + 2);
                            // The string is now one byte shorter and the decoded
                            // character occupies a single byte, so shrink $len and
                            // do not skip ahead; the next byte must still be examined.
                            $len--;
                        } else {
                            // Step over the continuation bytes of this character.
                            $count1 += $addBytes;
                        }
                    }
                }
            }

            // Inlined: $termValue = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
            // Walk $prevTerm character by character ($pc) to find the byte
            // length ($pb) of its first $termPrefixLength characters.
            $prevTermLength = strlen($prevTerm);
            $pb = 0; $pc = 0;
            while ($pb < $prevTermLength  &&  $pc < $termPrefixLength) {
                $charBytes = 1;
                $leadByte  = ord($prevTerm[$pb]);
                if (($leadByte & 0xC0) == 0xC0) {
                    $charBytes++;
                    if ($leadByte & 0x20) {
                        $charBytes++;
                        if ($leadByte & 0x10) {
                            $charBytes++;
                        }
                    }
                }

                // The bound is the string being walked ($prevTerm), not $data.
                if ($pb + $charBytes > $prevTermLength) {
                    // wrong (truncated) character
                    break;
                }

                $pc++;
                $pb += $charBytes;
            }
            $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

            // Inlined: $termFieldNum = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termFieldNum = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termFieldNum |= ($nbyte & 0x7F) << $shift;
            }

            // Inlined: $docFreq = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $docFreq = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $docFreq |= ($nbyte & 0x7F) << $shift;
            }

            // Inlined: $freqPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $freqPointer += $vint;

            // Inlined: $proxPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $proxPointer += $vint;

            // The skip delta is only present in the file when the document
            // frequency reaches the skip interval.
            if( $docFreq >= $skipInterval ) {
                // Inlined: $skipDelta = $tiiFile->readVInt();
                $nbyte = ord($data[$pos++]);
                $vint = $nbyte & 0x7F;
                for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                    $nbyte = ord($data[$pos++]);
                    $vint |= ($nbyte & 0x7F) << $shift;
                }
                $skipDelta = $vint;
            } else {
                $skipDelta = 0;
            }

            // Inlined: $indexPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $indexPointer += $vint;


            // Plain arrays stand in for
            // new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum)
            $termDictionary[] = array($termFieldNum, $termValue);

            // ... and for
            // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer)
            $termInfos[] = array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

            $prevTerm = $termValue;
        }

        // Check special index entry mark
        if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        if (PHP_INT_SIZE > 4) {
            // Treat 64-bit 0xFFFFFFFF as -1
            $termDictionary[0][0] = -1;
        }

        return array($termDictionary, $termInfos);
    }
}
0268