File indexing completed on 2025-01-19 05:21:25

0001 <?php
0002 /**
0003  * Zend Framework
0004  *
0005  * LICENSE
0006  *
0007  * This source file is subject to the new BSD license that is bundled
0008  * with this package in the file LICENSE.txt.
0009  * It is also available through the world-wide-web at this URL:
0010  * http://framework.zend.com/license/new-bsd
0011  * If you did not receive a copy of the license and are unable to
0012  * obtain it through the world-wide-web, please send an email
0013  * to license@zend.com so we can send you a copy immediately.
0014  *
0015  * @category   Zend
0016  * @package    Zend_Search_Lucene
0017  * @subpackage Analysis
0018  * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
0019  * @license    http://framework.zend.com/license/new-bsd     New BSD License
0020  * @version    $Id$
0021  */
0022 
0023 
0024 /** Zend_Search_Lucene_Analysis_Analyzer_Common */
0025 // require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
0026 
0027 
/**
 * UTF-8 analyzer whose tokens are maximal runs of Unicode letters and
 * numbers (PCRE classes \p{L} and \p{N}); every other character acts as a
 * separator.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Offset, counted in characters, of the first not-yet-scanned character
     * of the UTF-8 input
     *
     * @var integer
     */
    private $_position;

    /**
     * Offset, counted in bytes, of the first not-yet-scanned byte of the
     * UTF-8 input
     *
     * @var integer
     */
    private $_bytePosition;

    /**
     * Object constructor
     *
     * Probes PCRE with a one-letter Unicode pattern and refuses to build the
     * analyzer when that probe does not match.
     *
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct()
    {
        // Warnings are suppressed on purpose: the return value is the signal.
        $unicodeProbe = @preg_match('/\pL/u', 'a');

        if ($unicodeProbe !== 1) {
            // PCRE unicode support is turned off
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Utf8Num analyzer needs PCRE unicode support to be enabled.');
        }
    }

    /**
     * Reset token stream
     *
     * Rewinds both offsets to the start of the input and, when the declared
     * encoding is neither "utf8" nor "utf-8" (case-insensitive), transcodes
     * the input to UTF-8 and records the new encoding.
     */
    public function reset()
    {
        $this->_bytePosition = 0;
        $this->_position     = 0;

        $isUtf8 = strcasecmp($this->_encoding, 'utf8') === 0
               || strcasecmp($this->_encoding, 'utf-8') === 0;
        if ($isUtf8) {
            return;
        }

        // convert input into UTF-8
        $this->_input    = iconv($this->_encoding, 'UTF-8', $this->_input);
        $this->_encoding = 'UTF-8';
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * The regex search is driven by the byte offset, while the token
     * boundaries handed to the token object are character offsets; both
     * offsets are advanced past each matched word.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        for (;;) {
            $found = preg_match(
                '/[\p{L}\p{N}]+/u',
                $this->_input,
                $hit,
                PREG_OFFSET_CAPTURE,
                $this->_bytePosition
            );
            if (!$found) {
                // Covers both "no more matches" (0) and "error occurred" (false)
                return null;
            }

            // $hit[0] is array(matched text, byte offset of the match)
            list($word, $wordByteOffset) = $hit[0];

            // Separator bytes lying between the previous scan point and the word
            $skipped = substr(
                $this->_input,
                $this->_bytePosition,
                $wordByteOffset - $this->_bytePosition
            );

            // Character offsets of the word's start and end in the input
            $tokenStart = $this->_position + iconv_strlen($skipped, 'UTF-8');
            $tokenEnd   = $tokenStart + iconv_strlen($word, 'UTF-8');

            // Advance both cursors to just past the word
            $this->_bytePosition = $wordByteOffset + strlen($word);
            $this->_position     = $tokenEnd;

            $candidate = $this->normalize(
                new Zend_Search_Lucene_Analysis_Token($word, $tokenStart, $tokenEnd)
            );
            if ($candidate !== null) {
                return $candidate;
            }
            // normalize() returned null, i.e. the token was skipped: keep scanning
        }
    }
}
0126