File indexing completed on 2025-01-19 05:21:25
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id$
 */


/** User land classes and interfaces turned on by Zend/Search/Analyzer.php file inclusion. */
/** @todo Section should be removed with ZF 2.0 release as obsolete */
// Every require_once below is commented out, so this guard currently has an
// empty body; it only records which analyzer classes the section refers to.
if (!defined('ZEND_SEARCH_LUCENE_COMMON_ANALYZER_PROCESSED')) {
    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
    // require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */
    // require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
    // require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
    // require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
    // require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
    // require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
    // require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
    // require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
}


/**
 * An Analyzer is used to analyze text.
 * It thus represents a policy for extracting index terms from text.
 *
 * Note:
 * The Lucene Java implementation is stream-oriented, which lets it work
 * effectively with huge documents (more than 20Mb).
 * The engine itself, however, is not oriented towards such documents.
 * Thus the Zend_Search_Lucene analysis API works with data strings and sets (arrays).
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */

abstract class Zend_Search_Lucene_Analysis_Analyzer
{
    /**
     * Process-wide default analyzer; stays unset until setDefault() or
     * getDefault() assigns it.
     *
     * @var Zend_Search_Lucene_Analysis_Analyzer
     */
    private static $_defaultImpl;

    /**
     * Text most recently handed to setInput(); null until then.
     *
     * @var string|null
     */
    protected $_input = null;

    /**
     * Encoding most recently handed to setInput(); empty string by default.
     *
     * @var string
     */
    protected $_encoding = '';

    /**
     * Tokenize text into terms.
     *
     * Loads the text through setInput() and then drains nextToken() until it
     * reports the end of the stream with null.
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding).
     *
     * @param string $data     Text to tokenize
     * @param string $encoding Encoding of $data; stored as given
     * @return array Zend_Search_Lucene_Analysis_Token objects in stream order
     */
    public function tokenize($data, $encoding = '')
    {
        $this->setInput($data, $encoding);

        $collected = array();
        while (true) {
            $token = $this->nextToken();
            // Only a strict null ends the stream; any other value is a token.
            if ($token === null) {
                break;
            }
            $collected[] = $token;
        }

        return $collected;
    }


    /**
     * Tokenization stream API: set the input.
     *
     * Stores the text and its encoding, then calls reset() so the concrete
     * analyzer restarts its token stream on the new input.
     *
     * @param string $data     Text to tokenize
     * @param string $encoding Encoding of $data; stored as given
     * @return void
     */
    public function setInput($data, $encoding = '')
    {
        // State is assigned before reset() so that reset() sees the new input.
        $this->_input    = $data;
        $this->_encoding = $encoding;

        $this->reset();
    }

    /**
     * Reset the token stream.
     *
     * Called by setInput() after the input and encoding have been stored.
     */
    abstract public function reset();

    /**
     * Tokenization stream API: get the next token.
     *
     * Returns null at the end of the stream.
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding).
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    abstract public function nextToken();


    /**
     * Set the default Analyzer implementation used by indexing code.
     *
     * @param Zend_Search_Lucene_Analysis_Analyzer $analyzer Analyzer that getDefault() will return
     * @return void
     */
    public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
    {
        self::$_defaultImpl = $analyzer;
    }


    /**
     * Return the default Analyzer implementation used by indexing code.
     *
     * When no analyzer has been stored yet, a
     * Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive
     * instance is created, remembered and returned.
     *
     * @return Zend_Search_Lucene_Analysis_Analyzer
     */
    public static function getDefault()
    {
        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
        // require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';

        if (self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) {
            return self::$_defaultImpl;
        }

        self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();

        return self::$_defaultImpl;
    }
}