File indexing completed on 2025-01-19 05:21:25
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id$
 */

/** Zend_Search_Lucene_Document_OpenXml */
// require_once 'Zend/Search/Lucene/Document/OpenXml.php';

/** Zend_Xml_Security */
// require_once 'Zend/Xml/Security.php';

/**
 * Docx document.
 *
 * Builds a search document from a .docx (OpenXML) package: the text of the
 * main document part becomes the 'body' field, the file name becomes the
 * 'filename' field, and every entry returned by the inherited
 * extractMetaData() becomes a field of its own.
 *
 * Instances are created through {@link loadDocxFile()}; the constructor is
 * private.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Document_Docx extends Zend_Search_Lucene_Document_OpenXml {
    /**
     * Xml Schema - WordprocessingML
     *
     * @var string
     */
    const SCHEMA_WORDPROCESSINGML = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';

    /**
     * Object constructor
     *
     * Opens the package, extracts body text and meta data, closes the package
     * again and then registers the document fields. The package is closed on
     * the failure path as well before the exception is re-thrown.
     *
     * @param string  $fileName     Path of the .docx file to read
     * @param boolean $storeContent When true the body is added as a Text field,
     *                              otherwise as an UnStored field
     * @throws Zend_Search_Lucene_Exception If the Zip extension is not loaded or
     *                                      the package cannot be opened or read
     */
    private function __construct($fileName, $storeContent) {
        if (!class_exists('ZipArchive', false)) {
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('MS Office documents processing functionality requires Zip extension to be loaded');
        }

        // Open OpenXML package. ZipArchive::open() signals failure through its
        // return value (an error code instead of true), so it has to be checked
        // before any entry is read from the archive.
        $package = new ZipArchive();
        if ($package->open($fileName) !== true) {
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .docx file.');
        }

        try {
            // Read document text and core properties
            $documentBody   = $this->_extractBodyText($package);
            $coreProperties = $this->extractMetaData($package);
        } catch (Exception $e) {
            // Do not leave the archive open when extraction fails
            $package->close();
            throw $e;
        }

        // Close file
        $package->close();

        // Store filename
        $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));

        // Store contents
        $bodyText = implode('', $documentBody);
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', $bodyText, 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', $bodyText, 'UTF-8'));
        }

        // Store meta data properties
        foreach ($coreProperties as $key => $value) {
            $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
        }

        // Store title (if not present in meta data)
        if (!isset($coreProperties['title'])) {
            $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
        }
    }

    /**
     * Locate the office document part through '_rels/.rels' and return its text.
     *
     * Only the first relationship whose Type equals SCHEMA_OFFICEDOCUMENT is
     * processed. When no such relationship exists an empty list is returned,
     * which results in an empty body.
     *
     * @param ZipArchive $package Opened OpenXML package
     * @return array List of text fragments in document order
     * @throws Zend_Search_Lucene_Exception If the relations part or the
     *                                      referenced document part is missing
     *                                      or cannot be parsed
     */
    private function _extractBodyText(ZipArchive $package) {
        // Read relations and search for officeDocument
        $relationsXml = $package->getFromName('_rels/.rels');
        if ($relationsXml === false) {
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .docx file.');
        }

        $relations = Zend_Xml_Security::scan($relationsXml);
        if (!is_object($relations)) {
            // The scanner did not hand back a traversable XML object
            // require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .docx file.');
        }

        foreach ($relations->Relationship as $rel) {
            if ((string) $rel['Type'] !== Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
                continue;
            }

            // Found office document! Read in contents...
            $target  = (string) $rel['Target'];
            $partXml = $package->getFromName(
                $this->absoluteZipPath(dirname($target) . '/' . basename($target))
            );
            if ($partXml === false) {
                // The relationship points to an entry that is not in the archive
                // require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .docx file.');
            }

            $contents = Zend_Xml_Security::scan($partXml);
            if (!is_object($contents)) {
                // require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .docx file.');
            }

            return $this->_collectParagraphText($contents);
        }

        return array();
    }

    /**
     * Collect the text of all body paragraphs of a WordprocessingML document.
     *
     * Every text element contributes its string value, every break element
     * contributes a single space, and a single space is appended after each
     * paragraph so that paragraphs are not bound together.
     *
     * @param SimpleXMLElement $contents Parsed main document part
     * @return array List of text fragments in document order
     */
    private function _collectParagraphText($contents) {
        $fragments = array();

        $contents->registerXPathNamespace('w', self::SCHEMA_WORDPROCESSINGML);
        $paragraphs = $contents->xpath('//w:body/w:p');
        if (!is_array($paragraphs)) {
            return $fragments;
        }

        foreach ($paragraphs as $paragraph) {
            // A registered prefix only applies to the element it was registered
            // on, so register it on each paragraph as well and select the
            // children by namespace (self::) instead of by literal prefix text.
            // This keeps the query working when the document binds the
            // WordprocessingML namespace to a prefix other than 'w'.
            $paragraph->registerXPathNamespace('w', self::SCHEMA_WORDPROCESSINGML);
            $runs = $paragraph->xpath('.//w:r/*[self::w:t or self::w:br]');

            if (!is_array($runs)) {
                // Paragraph doesn't contain any text or breaks
                continue;
            }

            foreach ($runs as $run) {
                if ($run->getName() === 'br') {
                    // Break element
                    $fragments[] = ' ';
                } else {
                    $fragments[] = (string) $run;
                }
            }

            // Add space after each paragraph. So they are not bound together.
            $fragments[] = ' ';
        }

        return $fragments;
    }

    /**
     * Load Docx document from a file
     *
     * @param string  $fileName     Path of the .docx file to read
     * @param boolean $storeContent Passed through to the constructor
     * @return Zend_Search_Lucene_Document_Docx
     * @throws Zend_Search_Lucene_Document_Exception If the file is not readable
     * @throws Zend_Search_Lucene_Exception If the Zip extension is not loaded or
     *                                      the package cannot be opened or read
     */
    public static function loadDocxFile($fileName, $storeContent = false) {
        if (!is_readable($fileName)) {
            // require_once 'Zend/Search/Lucene/Document/Exception.php';
            throw new Zend_Search_Lucene_Document_Exception('Provided file \'' . $fileName . '\' is not readable.');
        }

        return new self($fileName, $storeContent);
    }
}