Lucene/Document/Html.php

0001 <?php
0002 /**
0003  * Zend Framework
0004  *
0005  * LICENSE
0006  *
0007  * This source file is subject to the new BSD license that is bundled
0008  * with this package in the file LICENSE.txt.
0009  * It is also available through the world-wide-web at this URL:
0010  * http://framework.zend.com/license/new-bsd
0011  * If you did not receive a copy of the license and are unable to
0012  * obtain it through the world-wide-web, please send an email
0013  * to license@zend.com so we can send you a copy immediately.
0014  *
0015  * @category   Zend
0016  * @package    Zend_Search_Lucene
0017  * @subpackage Document
0018  * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
0019  * @license    http://framework.zend.com/license/new-bsd     New BSD License
0020  * @version    $Id$
0021  */
0022
0023
0024 /** Zend_Search_Lucene_Document */
0025 // require_once 'Zend/Search/Lucene/Document.php';
0026
0027
0028 /**
0029  * HTML document.
0030  *
0031  * @category   Zend
0032  * @package    Zend_Search_Lucene
0033  * @subpackage Document
0034  * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
0035  * @license    http://framework.zend.com/license/new-bsd     New BSD License
0036  */
class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
{
    /**
     * List of document links (href values of <a> and <area> elements)
     *
     * @var array
     */
    private $_links = array();

    /**
     * List of document header links (href values of <link> elements in <head>)
     *
     * @var array
     */
    private $_headerLinks = array();

    /**
     * Stored DOM representation
     *
     * @var DOMDocument
     */
    private $_doc;

    /**
     * Exclude nofollow links flag
     *
     * If true then links with a rel attribute containing the 'nofollow' token
     * are not included into document links.
     *
     * @var boolean
     */
    private static $_excludeNoFollowLinks = false;

    /**
     * List of inline tags
     *
     * Text directly contained in these tags is not followed by a separating
     * space when the body text is collected.
     *
     * @var array
     */
    private $_inlineTags = array('a', 'abbr', 'acronym', 'dfn', 'em', 'strong', 'code',
                                'samp', 'kbd', 'var', 'b', 'i', 'big', 'small', 'strike',
                                'tt', 'u', 'font', 'span', 'bdo', 'cite', 'del', 'ins',
                                'q', 'sub', 'sup');

    /**
     * Object constructor
     *
     * Parses the HTML, then adds 'title', one field per named <meta> tag and
     * 'body' fields to the document, and collects body and header links.
     *
     * @param string  $data            HTML string (may be an HTML fragment) or a file name if $isFile is true
     * @param boolean $isFile          Treat $data as a path of the file to read
     * @param boolean $storeContent    Store body text in the index (Text field) instead of only indexing it (UnStored field)
     * @param string  $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     */
    private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
    {
        $this->_doc = new DOMDocument();
        $this->_doc->substituteEntities = true;

        if ($isFile) {
            $htmlData = file_get_contents($data);
            if ($htmlData === false) {
                // file_get_contents() has already raised a warning.
                // Keep the historical best-effort behaviour: index an empty document.
                $htmlData = '';
            }
        } else {
            $htmlData = (string) $data;
        }

        // An empty source can't be parsed (it is a warning on older PHP versions and
        // an exception on newer ones), so skip the first attempt in that case.
        // The document encoding stays unrecognized and the fragment branch below is used.
        if ($htmlData !== '') {
            @$this->_doc->loadHTML($htmlData);
        }

        if ($this->_doc->encoding === null) {
            // Document encoding is not recognized

            /** @todo improve HTML vs HTML fragment recognition */
            if (preg_match('/<html[^>]*>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
                // It's an HTML document
                // Add additional HEAD section and recognize document
                $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);

                @$this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset))
                                     . '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>'
                                     . iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset)));

                // Remove additional HEAD section
                $xpath = new DOMXPath($this->_doc);
                $head  = $xpath->query('/html/head')->item(0);
                if ($head !== null) {
                    // The query may match nothing (e.g. the '<html' match was not a real tag)
                    $head->parentNode->removeChild($head);
                }
            } else {
                // It's an HTML fragment
                @$this->_doc->loadHTML('<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
                                     . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
                                     . '</body></html>');
            }
        }
        /** @todo Add correction of wrong HTML encoding recognition processing
         * The case is:
         * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
         * even $this->_doc->encoding demonstrates another recognized encoding
         */

        $xpath = new DOMXPath($this->_doc);

        $docTitle = '';
        foreach ($xpath->query('/html/head/title') as $titleNode) {
            // title should always have only one entry, but we process all nodeset entries
            $docTitle .= $titleNode->nodeValue . ' ';
        }
        $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, 'UTF-8'));

        foreach ($xpath->query('/html/head/meta[@name]') as $metaNode) {
            $this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
                                                           $metaNode->getAttribute('content'),
                                                           'UTF-8'));
        }

        $docBody = '';
        foreach ($xpath->query('/html/body') as $bodyNode) {
            // body should always have only one entry, but we process all nodeset entries
            $this->_retrieveNodeText($bodyNode, $docBody);
        }
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, 'UTF-8'));
        }

        // Collect links: all <a> elements first, then all <area> elements
        foreach (array('a', 'area') as $linkTagName) {
            foreach ($this->_doc->getElementsByTagName($linkTagName) as $linkNode) {
                $href = $linkNode->getAttribute('href');
                if ($href === '') {
                    continue;
                }
                if (self::$_excludeNoFollowLinks  &&  self::_isNoFollow($linkNode->getAttribute('rel'))) {
                    continue;
                }
                $this->_links[] = $href;
            }
        }
        $this->_links = array_unique($this->_links);

        foreach ($xpath->query('/html/head/link') as $linkNode) {
            $href = $linkNode->getAttribute('href');
            if ($href !== '') {
                $this->_headerLinks[] = $href;
            }
        }
        $this->_headerLinks = array_unique($this->_headerLinks);
    }

    /**
     * Check if a rel attribute value contains the 'nofollow' link type
     *
     * The rel attribute is a whitespace separated list of case-insensitive tokens,
     * so values like "nofollow noopener" have to be recognized as well.
     *
     * @param string $rel  rel attribute value
     * @return boolean
     */
    private static function _isNoFollow($rel)
    {
        $relTokens = preg_split('/\s+/', strtolower($rel), -1, PREG_SPLIT_NO_EMPTY);

        return in_array('nofollow', $relTokens, true);
    }

    /**
     * Set exclude nofollow links flag
     *
     * The flag is static, it's shared by all documents created afterwards.
     *
     * @param boolean $newValue
     */
    public static function setExcludeNoFollowLinks($newValue)
    {
        self::$_excludeNoFollowLinks = $newValue;
    }

    /**
     * Get exclude nofollow links flag
     *
     * @return boolean
     */
    public static function getExcludeNoFollowLinks()
    {
        return self::$_excludeNoFollowLinks;
    }

    /**
     * Get node text
     *
     * Appends text of the node and all its descendants to $text.
     * Content of script elements is skipped. Text which is not placed directly
     * within an inline tag is followed by a space to keep words separated.
     *
     * @param DOMNode $node
     * @param string &$text
     */
    private function _retrieveNodeText(DOMNode $node, &$text)
    {
        if ($node->nodeType == XML_TEXT_NODE) {
            $text .= $node->nodeValue;
            if (!in_array($node->parentNode->tagName, $this->_inlineTags, true)) {
                $text .= ' ';
            }
        } else if ($node->nodeType == XML_ELEMENT_NODE  &&  $node->nodeName != 'script') {
            foreach ($node->childNodes as $childNode) {
                $this->_retrieveNodeText($childNode, $text);
            }
        }
    }

    /**
     * Get document HREF links
     *
     * @return array
     */
    public function getLinks()
    {
        return $this->_links;
    }

    /**
     * Get document header links
     *
     * @return array
     */
    public function getHeaderLinks()
    {
        return $this->_headerLinks;
    }

    /**
     * Load HTML document from a string
     *
     * @param string  $data
     * @param boolean $storeContent
     * @param string  $defaultEncoding   HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @return Zend_Search_Lucene_Document_Html
     */
    public static function loadHTML($data, $storeContent = false, $defaultEncoding = '')
    {
        return new Zend_Search_Lucene_Document_Html($data, false, $storeContent, $defaultEncoding);
    }

    /**
     * Load HTML document from a file
     *
     * @param string  $file
     * @param boolean $storeContent
     * @param string  $defaultEncoding   HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @return Zend_Search_Lucene_Document_Html
     */
    public static function loadHTMLFile($file, $storeContent = false, $defaultEncoding = '')
    {
        return new Zend_Search_Lucene_Document_Html($file, true, $storeContent, $defaultEncoding);
    }


    /**
     * Highlight text in text node
     *
     * Each matched word is cut out of the text node and replaced with the nodes
     * produced by parsing the callback result as HTML.
     *
     * @param DOMText $node
     * @param array   $wordsToHighlight  Array with words to highlight used as keys
     * @param callback $callback   Callback method, used to transform (highlighting) text.
     * @param array    $params     Array of additional callback parameters (first non-optional parameter is a text to transform)
     * @throws Zend_Search_Lucene_Exception
     */
    protected function _highlightTextNode(DOMText $node, $wordsToHighlight, $callback, $params)
    {
        /** Zend_Search_Lucene_Analysis_Analyzer */
        // require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        $analyzer->setInput($node->nodeValue, 'UTF-8');

        $matchedTokens = array();

        while (($token = $analyzer->nextToken()) !== null) {
            if (isset($wordsToHighlight[$token->getTermText()])) {
                $matchedTokens[] = $token;
            }
        }

        if (count($matchedTokens) == 0) {
            return;
        }

        // Process matches from the last one to the first one, so offsets of
        // not yet processed tokens stay valid while the node is being split
        $matchedTokens = array_reverse($matchedTokens);

        foreach ($matchedTokens as $token) {
            // Cut text after matched token
            $node->splitText($token->getEndOffset());

            // Cut matched node
            $matchedWordNode = $node->splitText($token->getStartOffset());

            // Retrieve HTML string representation for highlighted word
            $fullCallbackparamsList = $params;
            array_unshift($fullCallbackparamsList, $matchedWordNode->nodeValue);
            $highlightedWordNodeSetHtml = call_user_func_array($callback, $fullCallbackparamsList);

            // Transform HTML string to a DOM representation and automatically transform retrieved string
            // into valid XHTML (It's automatically done by loadHTML() method)
            $highlightedWordNodeSetDomDocument = new DOMDocument('1.0', 'UTF-8');
            $success = @$highlightedWordNodeSetDomDocument->
                                loadHTML('<html><head><meta http-equiv="Content-type" content="text/html; charset=UTF-8"/></head><body>'
                                       . $highlightedWordNodeSetHtml
                                       . '</body></html>');
            if (!$success) {
                // require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception("Error occured while loading highlighted text fragment: '$highlightedWordNodeSetHtml'.");
            }
            $highlightedWordNodeSetXpath = new DOMXPath($highlightedWordNodeSetDomDocument);
            $highlightedWordNodeSet      = $highlightedWordNodeSetXpath->query('/html/body')->item(0)->childNodes;

            // Copy produced nodes into the main document right before the matched word...
            for ($count = 0; $count < $highlightedWordNodeSet->length; $count++) {
                $nodeToImport = $highlightedWordNodeSet->item($count);
                $node->parentNode->insertBefore($this->_doc->importNode($nodeToImport, true /* deep copy */),
                                                $matchedWordNode);
            }

            // ...and drop the original (not highlighted) word
            $node->parentNode->removeChild($matchedWordNode);
        }
    }


    /**
     * Highlight words in content of the specified node
     *
     * Content of script elements is skipped.
     *
     * @param DOMNode $contextNode
     * @param array $wordsToHighlight
     * @param callback $callback   Callback method, used to transform (highlighting) text.
     * @param array    $params     Array of additional callback parameters (first non-optional parameter is a text to transform)
     */
    protected function _highlightNodeRecursive(DOMNode $contextNode, $wordsToHighlight, $callback, $params)
    {
        $textNodes = array();

        if (!$contextNode->hasChildNodes()) {
            return;
        }

        foreach ($contextNode->childNodes as $childNode) {
            if ($childNode->nodeType == XML_TEXT_NODE) {
                // process node later to leave childNodes structure untouched
                $textNodes[] = $childNode;
            } else {
                // Process node if it's not a script node
                if ($childNode->nodeName != 'script') {
                    $this->_highlightNodeRecursive($childNode, $wordsToHighlight, $callback, $params);
                }
            }
        }

        foreach ($textNodes as $textNode) {
            $this->_highlightTextNode($textNode, $wordsToHighlight, $callback, $params);
        }
    }

    /**
     * Standard callback method used to highlight words.
     *
     * Both parameters are treated as plain text and are escaped before they are
     * placed into the generated HTML fragment.
     *
     * @param  string  $stringToHighlight  Text of the matched word
     * @param  string  $colour             Background colour (CSS colour value)
     * @return string  HTML fragment
     * @internal
     */
    public function applyColour($stringToHighlight, $colour)
    {
        return '<b style="color:black;background-color:' . htmlspecialchars($colour, ENT_QUOTES, 'UTF-8') . '">'
             . htmlspecialchars($stringToHighlight, ENT_QUOTES, 'UTF-8')
             . '</b>';
    }

    /**
     * Highlight text with specified color
     *
     * @param string|array $words
     * @param string $colour
     * @return string  HTML of the whole document after highlighting
     */
    public function highlight($words, $colour = '#66ffff')
    {
        return $this->highlightExtended($words, array($this, 'applyColour'), array($colour));
    }



    /**
     * Highlight text using specified View helper or callback function.
     *
     * The stored DOM is modified in place; the resulting HTML is also available
     * through getHTML() and getHtmlBody() afterwards.
     *
     * @param string|array $words  Words to highlight. Words could be organized using the array or string.
     * @param callback $callback   Callback method, used to transform (highlighting) text.
     * @param array    $params     Array of additional callback parameters passed through into it
     *                             (first non-optional parameter is an HTML fragment for highlighting)
     * @return string  HTML of the whole document after highlighting
     * @throws Zend_Search_Lucene_Exception
     */
    public function highlightExtended($words, $callback, $params = array())
    {
        /** Zend_Search_Lucene_Analysis_Analyzer */
        // require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

        if (!is_array($words)) {
            $words = array($words);
        }

        // Collect tokens of all word strings into one sequentially indexed list.
        // It's done with a plain loop, so an empty $words array is processed correctly
        $wordsToHighlight = array();
        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        foreach ($words as $wordString) {
            foreach ($analyzer->tokenize($wordString) as $token) {
                $wordsToHighlight[] = $token;
            }
        }

        if (count($wordsToHighlight) == 0) {
            return $this->_doc->saveHTML();
        }

        // Use term texts as keys to make lookups fast
        $wordsToHighlightFlipped = array();
        foreach ($wordsToHighlight as $id => $token) {
            $wordsToHighlightFlipped[$token->getTermText()] = $id;
        }

        if (!is_callable($callback)) {
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('$viewHelper parameter must be a View Helper name, View Helper object or callback.');
        }

        $xpath = new DOMXPath($this->_doc);

        $matchedNodes = $xpath->query("/html/body");
        foreach ($matchedNodes as $matchedNode) {
            $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
        }

        return $this->_doc->saveHTML();
    }


    /**
     * Get HTML
     *
     * @return string
     */
    public function getHTML()
    {
        return $this->_doc->saveHTML();
    }

    /**
     * Get HTML body
     *
     * Returns serialized content of the body element (without the body tag itself)
     * or an empty string if the document has no body.
     *
     * @return string
     */
    public function getHtmlBody()
    {
        $xpath = new DOMXPath($this->_doc);
        $body  = $xpath->query('/html/body')->item(0);

        if ($body === null) {
            return '';
        }

        $outputFragments = array();
        foreach ($body->childNodes as $bodyChildNode) {
            $outputFragments[] = $this->_doc->saveXML($bodyChildNode);
        }

        return implode('', $outputFragments);
    }
}
0481