File indexing completed on 2025-01-19 05:21:25

0001 <?php
0002 /**
0003  * Zend Framework
0004  *
0005  * LICENSE
0006  *
0007  * This source file is subject to the new BSD license that is bundled
0008  * with this package in the file LICENSE.txt.
0009  * It is also available through the world-wide-web at this URL:
0010  * http://framework.zend.com/license/new-bsd
0011  * If you did not receive a copy of the license and are unable to
0012  * obtain it through the world-wide-web, please send an email
0013  * to license@zend.com so we can send you a copy immediately.
0014  *
0015  * @category   Zend
0016  * @package    Zend_Search_Lucene
0017  * @subpackage Document
0018  * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
0019  * @license    http://framework.zend.com/license/new-bsd     New BSD License
0020  * @version    $Id$
0021  */
0022 
0023 
0024 /** Zend_Search_Lucene_Document_OpenXml */
0025 // require_once 'Zend/Search/Lucene/Document/OpenXml.php';
0026 
0027 /** Zend_Xml_Security */
0028 // require_once 'Zend/Xml/Security.php';
0029 
0030 /**
0031  * Xlsx document.
0032  *
0033  * @category   Zend
0034  * @package    Zend_Search_Lucene
0035  * @subpackage Document
0036  * @copyright  Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
0037  * @license    http://framework.zend.com/license/new-bsd     New BSD License
0038  */
class Zend_Search_Lucene_Document_Xlsx extends Zend_Search_Lucene_Document_OpenXml
{
    /**
     * Xml Schema - SpreadsheetML
     *
     * Not referenced inside this class; kept so existing code using the
     * constant keeps working.
     *
     * @var string
     */
    const SCHEMA_SPREADSHEETML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';

    /**
     * Xml Schema - DrawingML
     *
     * Not referenced inside this class; kept so existing code using the
     * constant keeps working.
     *
     * @var string
     */
    const SCHEMA_DRAWINGML = 'http://schemas.openxmlformats.org/drawingml/2006/main';

    /**
     * Xml Schema - Shared Strings
     *
     * Relationship type used to locate the shared strings part among the
     * workbook relationships.
     *
     * @var string
     */
    const SCHEMA_SHAREDSTRINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings';

    /**
     * Xml Schema - Worksheet relation
     *
     * Relationship type used to locate worksheet parts among the workbook
     * relationships.
     *
     * @var string
     */
    const SCHEMA_WORKSHEETRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet';

    /**
     * Xml Schema - Slide notes relation
     *
     * Not referenced inside this class; kept so existing code using the
     * constant keeps working.
     *
     * @var string
     */
    const SCHEMA_SLIDENOTESRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide';

    /**
     * Object constructor
     *
     * Opens the .xlsx package, collects the text of every worksheet cell and
     * the package meta data, and registers them as document fields
     * ('filename', 'body', one field per meta data property, and 'title'
     * falling back to the file name).
     *
     * @param string  $fileName     Path of the .xlsx file to index
     * @param boolean $storeContent Store the body text in the index (Text field)
     *                              instead of only indexing it (UnStored field)
     * @throws Zend_Search_Lucene_Exception If the Zip extension is missing or
     *                                      the archive cannot be read
     */
    private function __construct($fileName, $storeContent)
    {
        if (!class_exists('ZipArchive', false)) {
            throw new Zend_Search_Lucene_Exception('MS Office documents processing functionality requires Zip extension to be loaded');
        }

        // Open OpenXML package. ZipArchive::open() returns true on success and
        // an error code otherwise, so anything but true means "not usable".
        $package = new ZipArchive();
        if ($package->open($fileName) !== true) {
            throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .xlsx file.');
        }

        // Make sure the archive handle is released even when parsing fails.
        try {
            $documentBody   = $this->_extractDocumentBody($package);
            $coreProperties = $this->extractMetaData($package);
        } catch (Exception $e) {
            $package->close();
            throw $e;
        }
        $package->close();

        // Store filename
        $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));

        // Store contents
        $body = implode(' ', $documentBody);
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', $body, 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', $body, 'UTF-8'));
        }

        // Store meta data properties
        foreach ($coreProperties as $key => $value) {
            $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
        }

        // Store title (if not present in meta data)
        if (!isset($coreProperties['title'])) {
            $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
        }
    }

    /**
     * Collect the values of all worksheet cells of an opened package
     *
     * Follows the package relationships to the office document, loads its
     * shared strings (when present) and worksheets, and returns one entry
     * per cell.
     *
     * @param ZipArchive $package Opened .xlsx archive
     * @return array Cell values in worksheet/row/cell order
     * @throws Zend_Search_Lucene_Exception If a mandatory relationship part is
     *                                      missing or cannot be parsed
     */
    private function _extractDocumentBody(ZipArchive $package)
    {
        // Read relations and search for officeDocument
        $relations = $this->_loadXmlPart($package, '_rels/.rels');
        if ($relations === false) {
            throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .xlsx file.');
        }

        $sharedStrings = array();
        $worksheets    = array();

        foreach ($relations->Relationship as $rel) {
            if ((string)$rel['Type'] !== Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
                continue;
            }

            // Found office document! Read relations for workbook...
            $workbookTarget = (string)$rel['Target'];
            $workbookDir    = dirname($workbookTarget);

            $workbookRelations = $this->_loadXmlPart(
                $package,
                $this->absoluteZipPath($workbookDir . '/_rels/' . basename($workbookTarget) . '.rels')
            );
            if ($workbookRelations === false) {
                throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .xlsx file.');
            }
            $workbookRelations->registerXPathNamespace('rel', Zend_Search_Lucene_Document_OpenXml::SCHEMA_RELATIONSHIP);

            $sharedStrings = $this->_loadSharedStrings($package, $workbookRelations, $workbookDir);

            // Loop relations for workbook and extract worksheets...
            foreach ($workbookRelations->Relationship as $workbookRelation) {
                if ((string)$workbookRelation['Type'] !== self::SCHEMA_WORKSHEETRELATION) {
                    continue;
                }

                $sheetTarget = (string)$workbookRelation['Target'];
                $worksheet   = $this->_loadXmlPart(
                    $package,
                    $this->absoluteZipPath($workbookDir . '/' . dirname($sheetTarget) . '/' . basename($sheetTarget))
                );

                // A worksheet that is missing or unparseable is skipped rather
                // than being dereferenced as an object later on.
                if ($worksheet !== false) {
                    $worksheets[str_replace('rId', '', (string)$workbookRelation['Id'])] = $worksheet;
                }
            }

            // Only the first office document relationship is processed
            break;
        }

        // Sort worksheets by their relationship id (with the 'rId' prefix removed)
        ksort($worksheets);

        // Extract contents from worksheets
        $documentBody = array();
        foreach ($worksheets as $worksheet) {
            foreach ($worksheet->sheetData->row as $row) {
                foreach ($row->c as $c) {
                    $documentBody[] = $this->_extractCellValue($c, $sharedStrings);
                }
            }
        }

        return $documentBody;
    }

    /**
     * Read one XML part from the package and parse it
     *
     * @param ZipArchive $package Opened .xlsx archive
     * @param string     $path    Name of the entry inside the archive
     * @return SimpleXMLElement|false Parsed part, or false when the entry is
     *                                missing, empty or not parseable
     */
    private function _loadXmlPart(ZipArchive $package, $path)
    {
        $xml = $package->getFromName($path);
        if ($xml === false || $xml === '') {
            // Never hand a non-document to the XML scanner
            return false;
        }

        $parsed = Zend_Xml_Security::scan($xml);

        return ($parsed instanceof SimpleXMLElement) ? $parsed : false;
    }

    /**
     * Load the shared strings table referenced by the workbook relationships
     *
     * @param ZipArchive       $package           Opened .xlsx archive
     * @param SimpleXMLElement $workbookRelations Workbook relationships with the
     *                                            'rel' XPath namespace registered
     * @param string           $workbookDir       Directory of the workbook part
     * @return array Shared strings, indexed by their position in the table;
     *               empty when the workbook has no shared strings part
     */
    private function _loadSharedStrings(ZipArchive $package, SimpleXMLElement $workbookRelations, $workbookDir)
    {
        $sharedStrings = array();

        $matches = $workbookRelations->xpath("rel:Relationship[@Type='" . self::SCHEMA_SHAREDSTRINGS . "']");
        if (empty($matches)) {
            // No shared strings relationship: nothing to load
            return $sharedStrings;
        }

        $sharedStringsPath = (string)$matches[0]['Target'];
        $xmlStrings = $this->_loadXmlPart(
            $package,
            $this->absoluteZipPath($workbookDir . '/' . $sharedStringsPath)
        );
        if ($xmlStrings === false || !isset($xmlStrings->si)) {
            return $sharedStrings;
        }

        // Every <si> must occupy exactly one slot, because cells refer to
        // shared strings by position. _parseRichText() yields '' for an entry
        // that has neither <t> nor <r>, which keeps later indexes aligned.
        foreach ($xmlStrings->si as $val) {
            $sharedStrings[] = $this->_parseRichText($val);
        }

        return $sharedStrings;
    }

    /**
     * Convert a worksheet cell element into the value added to the body
     *
     * @param SimpleXMLElement $c             Cell element (<c>)
     * @param array            $sharedStrings Shared strings table
     * @return mixed String for text/error cells, boolean for boolean cells,
     *               int/float for numeric content, string otherwise
     */
    private function _extractCellValue(SimpleXMLElement $c, array $sharedStrings)
    {
        // Determine data type
        $dataType = (string)$c['t'];
        $raw      = (string)$c->v;

        switch ($dataType) {
            case 's':
                // Value is a shared string index; unknown indexes yield ''
                if ($raw == '') {
                    return '';
                }
                $index = (int)$raw;
                return isset($sharedStrings[$index]) ? $sharedStrings[$index] : '';

            case 'b':
                // Value is boolean
                if ($raw == '0') {
                    return false;
                }
                if ($raw == '1') {
                    return true;
                }
                return (bool)$c->v;

            case 'inlineStr':
                // Value is rich text inline
                return $this->_parseRichText($c->is);

            case 'e':
                // Value is an error message
                return $raw;

            default:
                // Value is a string; numeric content is normalised to int when
                // it has no fractional part, to float otherwise
                if (is_numeric($raw)) {
                    if ($raw == (int)$raw) {
                        return (int)$raw;
                    }
                    if ($raw == (float)$raw) {
                        return (float)$raw;
                    }
                }
                return $raw;
        }
    }

    /**
     * Parse rich text XML
     *
     * Returns the text of a direct <t> child when present, otherwise the
     * concatenated <t> text of all <r> runs.
     *
     * @param SimpleXMLElement $is
     * @return string Extracted text, '' when no element is given or it
     *                contains neither <t> nor <r>
     */
    private function _parseRichText($is = null) {
        if (!($is instanceof SimpleXMLElement)) {
            return '';
        }

        if (isset($is->t)) {
            return (string)$is->t;
        }

        $text = '';
        foreach ($is->r as $run) {
            $text .= (string)$run->t;
        }

        return $text;
    }

    /**
     * Load Xlsx document from a file
     *
     * @param string  $fileName
     * @param boolean $storeContent
     * @return Zend_Search_Lucene_Document_Xlsx
     * @throws Zend_Search_Lucene_Exception
     */
    public static function loadXlsxFile($fileName, $storeContent = false)
    {
        return new Zend_Search_Lucene_Document_Xlsx($fileName, $storeContent);
    }
}