File indexing completed on 2025-01-19 05:21:25
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id$
 */


/** Zend_Search_Lucene_Document */
// require_once 'Zend/Search/Lucene/Document.php';


/**
 * HTML document.
 *
 * Parses an HTML string or file into a DOM tree, registers 'title', 'body'
 * and one field per named <meta> tag on the document, collects the links
 * found in the markup, and can highlight words inside the parsed body.
 *
 * Instances are created through the static factories loadHTML() and
 * loadHTMLFile(); the constructor is private.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
{
    /**
     * List of document links (href values of <a> and <area> elements)
     *
     * @var array
     */
    private $_links = array();

    /**
     * List of document header links (href values of /html/head/link elements)
     *
     * @var array
     */
    private $_headerLinks = array();

    /**
     * Stored DOM representation
     *
     * @var DOMDocument
     */
    private $_doc;

    /**
     * Exclude nofollow links flag
     *
     * If true then links with rel='nofollow' attribute are not included into
     * document links.
     *
     * @var boolean
     */
    private static $_excludeNoFollowLinks = false;

    /**
     * List of inline tags
     *
     * Text found directly inside one of these elements is appended to the
     * body text without a trailing separator space.
     *
     * @var array
     */
    private $_inlineTags = array('a', 'abbr', 'acronym', 'dfn', 'em', 'strong', 'code',
                                 'samp', 'kbd', 'var', 'b', 'i', 'big', 'small', 'strike',
                                 'tt', 'u', 'font', 'span', 'bdo', 'cite', 'del', 'ins',
                                 'q', 'sub', 'sup');

    /**
     * Object constructor
     *
     * @param string  $data            HTML string (may be an HTML fragment) or, if $isFile is true, a file name
     * @param boolean $isFile          Treat $data as a path and read the HTML from that file
     * @param boolean $storeContent    Store the body text in the index (Text field) instead of only indexing it (UnStored)
     * @param string  $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     */
    private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
    {
        $this->_doc = new DOMDocument();
        $this->_doc->substituteEntities = true;

        if ($isFile) {
            $htmlData = file_get_contents($data);
            if ($htmlData === false) {
                // file_get_contents() has already raised a warning. Keep the
                // historical best-effort behaviour: index an empty document.
                $htmlData = '';
            }
        } else {
            $htmlData = $data;
        }

        // DOMDocument::loadHTML() throws ValueError on an empty source in PHP 8
        // (and '@' does not suppress that), so skip the first parsing attempt
        // for empty input and let the fragment branch below build the document.
        if ($htmlData !== '' && $htmlData !== null) {
            @$this->_doc->loadHTML($htmlData);
        }

        if ($this->_doc->encoding === null) {
            // Document encoding is not recognized

            /** @todo improve HTML vs HTML fragment recognition */
            if (preg_match('/<html[^>]*>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
                // It's an HTML document
                // Add additional HEAD section and recognize document
                $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);

                @$this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset))
                                     . '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>'
                                     . iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset)));

                // Remove additional HEAD section
                $xpath = new DOMXPath($this->_doc);
                $head  = $xpath->query('/html/head')->item(0);
                if ($head !== null) {
                    // Guarded: the lookup yields null if parsing produced no head element
                    $head->parentNode->removeChild($head);
                }
            } else {
                // It's an HTML fragment
                @$this->_doc->loadHTML('<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
                                     . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
                                     . '</body></html>');
            }
        }
        /** @todo Add correction of wrong HTML encoding recognition processing
         * The case is:
         * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
         * even $this->_doc->encoding demonstrates another recognized encoding
         */

        $xpath = new DOMXPath($this->_doc);

        $docTitle = '';
        foreach ($xpath->query('/html/head/title') as $titleNode) {
            // title should always have only one entry, but we process all nodeset entries
            $docTitle .= $titleNode->nodeValue . ' ';
        }
        $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, 'UTF-8'));

        foreach ($xpath->query('/html/head/meta[@name]') as $metaNode) {
            $this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
                                                           $metaNode->getAttribute('content'),
                                                           'UTF-8'));
        }

        $docBody = '';
        foreach ($xpath->query('/html/body') as $bodyNode) {
            // body should always have only one entry, but we process all nodeset entries
            $this->_retrieveNodeText($bodyNode, $docBody);
        }
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, 'UTF-8'));
        }

        // Anchors first, then image-map areas, so the resulting order matches
        // the order in which the two element kinds were historically scanned.
        foreach (array('a', 'area') as $tagName) {
            foreach ($this->_doc->getElementsByTagName($tagName) as $linkNode) {
                $href = $linkNode->getAttribute('href');
                if ($href === '') {
                    continue;
                }
                if (self::$_excludeNoFollowLinks && $this->_isNoFollow($linkNode->getAttribute('rel'))) {
                    continue;
                }
                $this->_links[] = $href;
            }
        }
        $this->_links = array_unique($this->_links);

        foreach ($xpath->query('/html/head/link') as $linkNode) {
            $href = $linkNode->getAttribute('href');
            if ($href !== '') {
                $this->_headerLinks[] = $href;
            }
        }
        $this->_headerLinks = array_unique($this->_headerLinks);
    }

    /**
     * Check whether a rel attribute value contains the 'nofollow' link type.
     *
     * The rel attribute is a whitespace-separated list of tokens, so values
     * such as 'external nofollow' are matched as well as a bare 'nofollow'.
     * Matching is case-insensitive.
     *
     * @param string $rel Raw rel attribute value
     * @return boolean
     */
    private function _isNoFollow($rel)
    {
        return preg_match('/(?:^|\s)nofollow(?:\s|$)/i', $rel) === 1;
    }

    /**
     * Set exclude nofollow links flag
     *
     * The flag is static: it applies to every document constructed afterwards.
     *
     * @param boolean $newValue
     */
    public static function setExcludeNoFollowLinks($newValue)
    {
        self::$_excludeNoFollowLinks = $newValue;
    }

    /**
     * Get exclude nofollow links flag
     *
     * @return boolean
     */
    public static function getExcludeNoFollowLinks()
    {
        return self::$_excludeNoFollowLinks;
    }

    /**
     * Get node text
     *
     * Recursively appends the text content of $node to $text. <script>
     * elements are skipped entirely; nodes that are neither text nor element
     * nodes (comments, CDATA sections, ...) contribute nothing.
     *
     * @param DOMNode $node
     * @param string  &$text Accumulator the extracted text is appended to
     */
    private function _retrieveNodeText(DOMNode $node, &$text)
    {
        if ($node->nodeType == XML_TEXT_NODE) {
            $text .= $node->nodeValue;
            // Text inside block-level elements is followed by a separator space
            // so that words of adjacent blocks do not run together.
            if (!in_array($node->parentNode->tagName, $this->_inlineTags, true)) {
                $text .= ' ';
            }
        } else if ($node->nodeType == XML_ELEMENT_NODE && $node->nodeName != 'script') {
            foreach ($node->childNodes as $childNode) {
                $this->_retrieveNodeText($childNode, $text);
            }
        }
    }

    /**
     * Get document HREF links
     *
     * @return array
     */
    public function getLinks()
    {
        return $this->_links;
    }

    /**
     * Get document header links
     *
     * @return array
     */
    public function getHeaderLinks()
    {
        return $this->_headerLinks;
    }

    /**
     * Load HTML document from a string
     *
     * @param string  $data
     * @param boolean $storeContent
     * @param string  $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @return Zend_Search_Lucene_Document_Html
     */
    public static function loadHTML($data, $storeContent = false, $defaultEncoding = '')
    {
        return new Zend_Search_Lucene_Document_Html($data, false, $storeContent, $defaultEncoding);
    }

    /**
     * Load HTML document from a file
     *
     * @param string  $file
     * @param boolean $storeContent
     * @param string  $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @return Zend_Search_Lucene_Document_Html
     */
    public static function loadHTMLFile($file, $storeContent = false, $defaultEncoding = '')
    {
        return new Zend_Search_Lucene_Document_Html($file, true, $storeContent, $defaultEncoding);
    }


    /**
     * Highlight text in text node
     *
     * Tokenizes the node value with the default analyzer and replaces every
     * token whose term text is a key of $wordsToHighlight with the markup
     * produced by $callback.
     *
     * @param DOMText  $node
     * @param array    $wordsToHighlight Map keyed by term text
     * @param callback $callback         Callback method, used to transform (highlighting) text.
     * @param array    $params           Array of additional callback parameters (first non-optional parameter is a text to transform)
     * @throws Zend_Search_Lucene_Exception
     */
    protected function _highlightTextNode(DOMText $node, $wordsToHighlight, $callback, $params)
    {
        /** Zend_Search_Lucene_Analysis_Analyzer */
        // require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        $analyzer->setInput($node->nodeValue, 'UTF-8');

        $matchedTokens = array();

        while (($token = $analyzer->nextToken()) !== null) {
            if (isset($wordsToHighlight[$token->getTermText()])) {
                $matchedTokens[] = $token;
            }
        }

        if (count($matchedTokens) == 0) {
            return;
        }

        // Process matches from the end of the text backwards so that splitting
        // the node does not invalidate the offsets of the remaining tokens.
        $matchedTokens = array_reverse($matchedTokens);

        foreach ($matchedTokens as $token) {
            // Cut text after matched token
            $node->splitText($token->getEndOffset());

            // Cut matched node
            $matchedWordNode = $node->splitText($token->getStartOffset());

            // Retrieve HTML string representation for highlighted word
            $fullCallbackparamsList = $params;
            array_unshift($fullCallbackparamsList, $matchedWordNode->nodeValue);
            $highlightedWordNodeSetHtml = call_user_func_array($callback, $fullCallbackparamsList);

            // Transform HTML string to a DOM representation and automatically transform retrieved string
            // into valid XHTML (It's automatically done by loadHTML() method)
            $highlightedWordNodeSetDomDocument = new DOMDocument('1.0', 'UTF-8');
            $success = @$highlightedWordNodeSetDomDocument->
                                loadHTML('<html><head><meta http-equiv="Content-type" content="text/html; charset=UTF-8"/></head><body>'
                                       . $highlightedWordNodeSetHtml
                                       . '</body></html>');
            if (!$success) {
                // require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception("Error occured while loading highlighted text fragment: '$highlightedWordNodeSetHtml'.");
            }
            $highlightedWordNodeSetXpath = new DOMXPath($highlightedWordNodeSetDomDocument);
            $highlightedWordNodeSet      = $highlightedWordNodeSetXpath->query('/html/body')->item(0)->childNodes;

            for ($count = 0; $count < $highlightedWordNodeSet->length; $count++) {
                $nodeToImport = $highlightedWordNodeSet->item($count);
                $node->parentNode->insertBefore($this->_doc->importNode($nodeToImport, true /* deep copy */),
                                                $matchedWordNode);
            }

            $node->parentNode->removeChild($matchedWordNode);
        }
    }


    /**
     * Highlight words in content of the specified node
     *
     * Descends into every child element except <script>, then highlights the
     * direct text children of $contextNode.
     *
     * @param DOMNode  $contextNode
     * @param array    $wordsToHighlight Map keyed by term text
     * @param callback $callback         Callback method, used to transform (highlighting) text.
     * @param array    $params           Array of additional callback parameters (first non-optional parameter is a text to transform)
     */
    protected function _highlightNodeRecursive(DOMNode $contextNode, $wordsToHighlight, $callback, $params)
    {
        $textNodes = array();

        if (!$contextNode->hasChildNodes()) {
            return;
        }

        foreach ($contextNode->childNodes as $childNode) {
            if ($childNode->nodeType == XML_TEXT_NODE) {
                // process node later to leave childNodes structure untouched
                $textNodes[] = $childNode;
            } else {
                // Process node if it's not a script node
                if ($childNode->nodeName != 'script') {
                    $this->_highlightNodeRecursive($childNode, $wordsToHighlight, $callback, $params);
                }
            }
        }

        foreach ($textNodes as $textNode) {
            $this->_highlightTextNode($textNode, $wordsToHighlight, $callback, $params);
        }
    }

    /**
     * Standard callback method used to highlight words.
     *
     * Both arguments are inserted into the markup as-is (no escaping).
     *
     * @param  string $stringToHighlight
     * @param  string $colour Value used for the CSS background-color
     * @return string
     * @internal
     */
    public function applyColour($stringToHighlight, $colour)
    {
        return '<b style="color:black;background-color:' . $colour . '">' . $stringToHighlight . '</b>';
    }

    /**
     * Highlight text with specified color
     *
     * @param string|array $words
     * @param string $colour
     * @return string HTML of the whole document after highlighting
     */
    public function highlight($words, $colour = '#66ffff')
    {
        return $this->highlightExtended($words, array($this, 'applyColour'), array($colour));
    }



    /**
     * Highlight text using specified View helper or callback function.
     *
     * The stored DOM is modified in place; the returned string is the HTML of
     * the whole document after highlighting.
     *
     * @param string|array $words  Words to highlight. Words could be organized using the array or string.
     * @param callback $callback   Callback method, used to transform (highlighting) text.
     * @param array    $params     Array of additional callback parameters passed through into it
     *                             (first non-optional parameter is an HTML fragment for highlighting)
     * @return string
     * @throws Zend_Search_Lucene_Exception
     */
    public function highlightExtended($words, $callback, $params = array())
    {
        /** Zend_Search_Lucene_Analysis_Analyzer */
        // require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

        if (!is_array($words)) {
            $words = array($words);
        }

        $wordsToHighlightList = array();
        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        foreach ($words as $wordString) {
            $wordsToHighlightList[] = $analyzer->tokenize($wordString);
        }

        // array_merge() without arguments warns and yields null before PHP 7.4,
        // so an empty word list is handled explicitly.
        $wordsToHighlight = (count($wordsToHighlightList) > 0)
                          ? call_user_func_array('array_merge', $wordsToHighlightList)
                          : array();

        if (count($wordsToHighlight) == 0) {
            return $this->_doc->saveHTML();
        }

        $wordsToHighlightFlipped = array();
        foreach ($wordsToHighlight as $id => $token) {
            $wordsToHighlightFlipped[$token->getTermText()] = $id;
        }

        if (!is_callable($callback)) {
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('$viewHelper parameter must be a View Helper name, View Helper object or callback.');
        }

        $xpath = new DOMXPath($this->_doc);

        $matchedNodes = $xpath->query("/html/body");
        foreach ($matchedNodes as $matchedNode) {
            $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
        }

        // Return the highlighted document, consistent with the documented
        // return type and with the early-return branch above.
        return $this->_doc->saveHTML();
    }


    /**
     * Get HTML
     *
     * @return string
     */
    public function getHTML()
    {
        return $this->_doc->saveHTML();
    }

    /**
     * Get HTML body
     *
     * Serializes the children of the body element (the <body> tag itself is
     * not included). Returns an empty string if the document has no body.
     *
     * @return string
     */
    public function getHtmlBody()
    {
        $xpath = new DOMXPath($this->_doc);
        $body  = $xpath->query('/html/body')->item(0);

        if ($body === null) {
            return '';
        }

        $outputFragments = array();
        foreach ($body->childNodes as $childNode) {
            $outputFragments[] = $this->_doc->saveXML($childNode);
        }

        return implode('', $outputFragments);
    }
}