File indexing completed on 2025-01-19 05:21:25
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id$
 */


/** Zend_Search_Lucene_Analysis_Analyzer_Common */
// require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';


/**
 * Analyzer that splits a UTF-8 stream into tokens made of runs of Unicode
 * letters (PCRE class \p{L}). Everything that is not a letter acts as a
 * separator.
 *
 * The analyzer tracks two cursors over the input: one counted in characters
 * (used for the token offsets) and one counted in bytes (used as the PCRE
 * search offset).
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */

class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Character offset of the scan cursor within the UTF-8 input
     *
     * @var integer
     */
    private $_position;

    /**
     * Byte offset of the scan cursor within the UTF-8 input
     *
     * @var integer
     */
    private $_bytePosition;

    /**
     * Object constructor
     *
     * Probes PCRE with a Unicode-property pattern and refuses to construct
     * the analyzer when the probe does not match.
     *
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct()
    {
        // Warnings are silenced on purpose: without Unicode support the
        // probe itself is what fails, and that case is reported below.
        $unicodeSupported = (@preg_match('/\pL/u', 'a') == 1);

        if (!$unicodeSupported) {
            // PCRE unicode support is turned off
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Utf8 analyzer needs PCRE unicode support to be enabled.');
        }
    }

    /**
     * Reset token stream
     *
     * Rewinds both cursors to the start of the input and, unless the
     * declared encoding is already "utf8" / "utf-8" (case-insensitive),
     * transcodes the input to UTF-8 and records the new encoding.
     *
     * NOTE(review): $_input and $_encoding are not declared in this class;
     * presumably they are inherited from the parent analyzer - confirm.
     */
    public function reset()
    {
        $this->_bytePosition = 0;
        $this->_position     = 0;

        $alreadyUtf8 = strcasecmp($this->_encoding, 'utf8') == 0
                    || strcasecmp($this->_encoding, 'utf-8') == 0;

        if ($alreadyUtf8) {
            return;
        }

        // convert input into UTF-8
        $this->_input    = iconv($this->_encoding, 'UTF-8', $this->_input);
        $this->_encoding = 'UTF-8';
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * Searches for the next run of letters starting at the byte cursor,
     * advances both cursors past it and passes the resulting token through
     * normalize(). When normalize() returns null the token counts as skipped
     * and the search continues with the following run.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        while (true) {
            $found = preg_match(
                '/[\p{L}]+/u',
                $this->_input,
                $hit,
                PREG_OFFSET_CAPTURE,
                $this->_bytePosition
            );

            // A falsy result means either "no further match" (0) or
            // "PCRE error" (false); both terminate the stream.
            if (!$found) {
                return null;
            }

            // matched string and its byte offset in the input stream
            $word          = $hit[0][0];
            $wordByteStart = $hit[0][1];

            // bytes lying between the cursor and the start of the word
            $skipped = substr(
                $this->_input,
                $this->_bytePosition,
                $wordByteStart - $this->_bytePosition
            );

            // character offsets of the word: previous character cursor plus
            // the number of characters skipped, then plus the word length
            $charStart = $this->_position + iconv_strlen($skipped, 'UTF-8');
            $charEnd   = $charStart + iconv_strlen($word, 'UTF-8');

            // move both cursors just past the matched word
            $this->_bytePosition = $wordByteStart + strlen($word);
            $this->_position     = $charEnd;

            $token = $this->normalize(
                new Zend_Search_Lucene_Analysis_Token($word, $charStart, $charEnd)
            );

            if ($token !== null) {
                return $token;
            }
            // token was skipped by normalize(); try the next run of letters
        }
    }
}