File indexing completed on 2024-05-19 03:56:45
0001 /* 0002 SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in> 0003 0004 SPDX-License-Identifier: LGPL-2.1-or-later 0005 */ 0006 0007 0008 #include "office2007extractor.h" 0009 #include <memory> 0010 0011 #include <KZip> 0012 0013 #include <QDebug> 0014 #include <QDomDocument> 0015 #include <QXmlStreamReader> 0016 0017 using namespace KFileMetaData; 0018 0019 Office2007Extractor::Office2007Extractor(QObject* parent) 0020 : ExtractorPlugin(parent) 0021 { 0022 0023 } 0024 0025 const QStringList supportedMimeTypes = { 0026 QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), 0027 QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.template"), 0028 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.presentation"), 0029 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slide"), 0030 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slideshow"), 0031 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.template"), 0032 QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), 0033 QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.template"), 0034 }; 0035 0036 QStringList Office2007Extractor::mimetypes() const 0037 { 0038 return supportedMimeTypes; 0039 } 0040 0041 void Office2007Extractor::extract(ExtractionResult* result) 0042 { 0043 KZip zip(result->inputUrl()); 0044 if (!zip.open(QIODevice::ReadOnly)) { 0045 qWarning() << "Document is not a valid ZIP archive"; 0046 return; 0047 } 0048 0049 const KArchiveDirectory* rootDir = zip.directory(); 0050 if (!rootDir) { 0051 qWarning() << "Invalid document structure (main directory is missing)"; 0052 return; 0053 } 0054 0055 const QStringList rootEntries = rootDir->entries(); 0056 if (!rootEntries.contains(QStringLiteral("docProps"))) { 0057 qWarning() << "Invalid document structure (docProps is missing)"; 0058 return; 0059 } 0060 0061 const KArchiveEntry* docPropEntry = rootDir->entry(QStringLiteral("docProps")); 0062 if (!docPropEntry->isDirectory()) { 0063 qWarning() << "Invalid document structure (docProps is not a directory)"; 0064 return; 0065 } 0066 0067 const KArchiveDirectory* docPropDirectory = dynamic_cast<const KArchiveDirectory*>(docPropEntry); 0068 0069 const bool extractMetaData = result->inputFlags() & ExtractionResult::ExtractMetaData; 0070 0071 const KArchiveFile* file = docPropDirectory->file(QStringLiteral("core.xml")); 0072 if (extractMetaData && file) { 0073 QDomDocument coreDoc(QStringLiteral("core")); 0074 coreDoc.setContent(file->data()); 0075 0076 QDomElement docElem = coreDoc.documentElement(); 0077 0078 QDomElement elem = docElem.firstChildElement(QStringLiteral("dc:description")); 0079 if (!elem.isNull()) { 0080 QString str = elem.text(); 0081 if (!str.isEmpty()) { 0082 result->add(Property::Description, str); 0083 } 0084 } 0085 0086 elem = docElem.firstChildElement(QStringLiteral("dc:subject")); 0087 if (!elem.isNull()) { 0088 QString str = elem.text(); 0089 if (!str.isEmpty()) { 0090 result->add(Property::Subject, str); 0091 } 0092 } 0093 0094 elem = docElem.firstChildElement(QStringLiteral("dc:title")); 0095 if (!elem.isNull()) { 0096 QString str = elem.text(); 0097 if (!str.isEmpty()) { 0098 result->add(Property::Title, str); 0099 } 0100 } 0101 0102 elem = docElem.firstChildElement(QStringLiteral("dc:creator")); 0103 if (!elem.isNull()) { 0104 QString str = elem.text(); 0105 if (!str.isEmpty()) { 0106 result->add(Property::Author, str); 0107 } 0108 } 0109 0110 elem = docElem.firstChildElement(QStringLiteral("dc:language")); 0111 if (!elem.isNull()) { 0112 QString str = elem.text(); 0113 if (!str.isEmpty()) { 0114 result->add(Property::Language, str); 0115 } 0116 } 0117 0118 elem = docElem.firstChildElement(QStringLiteral("dcterms:created")); 0119 if (!elem.isNull()) { 0120 QString str = elem.text(); 0121 QDateTime dt = dateTimeFromString(str); 0122 if (!dt.isNull()) { 0123 result->add(Property::CreationDate, dt); 0124 } 0125 } 0126 0127 elem = docElem.firstChildElement(QStringLiteral("cp:keywords")); 0128 if (!elem.isNull()) { 0129 QString str = elem.text(); 0130 if (!str.isEmpty()) { 0131 result->add(Property::Keywords, str); 0132 } 0133 } 0134 } 0135 0136 file = docPropDirectory->file(QStringLiteral("app.xml")); 0137 if (extractMetaData && file) { 0138 QDomDocument appDoc(QStringLiteral("app")); 0139 appDoc.setContent(file->data()); 0140 0141 QDomElement docElem = appDoc.documentElement(); 0142 0143 const QString mimeType = result->inputMimetype(); 0144 if (mimeType == QLatin1String("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) { 0145 QDomElement elem = docElem.firstChildElement(QStringLiteral("Pages")); 0146 if (!elem.isNull()) { 0147 bool ok = false; 0148 int pageCount = elem.text().toInt(&ok); 0149 if (ok) { 0150 result->add(Property::PageCount, pageCount); 0151 } 0152 } 0153 0154 elem = docElem.firstChildElement(QStringLiteral("Words")); 0155 if (!elem.isNull()) { 0156 bool ok = false; 0157 int wordCount = elem.text().toInt(&ok); 0158 if (ok) { 0159 result->add(Property::WordCount, wordCount); 0160 } 0161 } 0162 0163 elem = docElem.firstChildElement(QStringLiteral("Lines")); 0164 if (!elem.isNull()) { 0165 bool ok = false; 0166 int lineCount = elem.text().toInt(&ok); 0167 if (ok) { 0168 result->add(Property::LineCount, lineCount); 0169 } 0170 } 0171 } 0172 0173 QDomElement elem = docElem.firstChildElement(QStringLiteral("Application")); 0174 if (!elem.isNull()) { 0175 QString app = elem.text(); 0176 if (!app.isEmpty()) { 0177 result->add(Property::Generator, app); 0178 } 0179 } 0180 } 0181 0182 // 0183 // Plain Text 0184 // 0185 bool extractPlainText = (result->inputFlags() & ExtractionResult::ExtractPlainText); 0186 0187 if (rootEntries.contains(QStringLiteral("word"))) { 0188 result->addType(Type::Document); 0189 0190 if (!extractPlainText) { 0191 return; 0192 } 0193 0194 const KArchiveEntry* wordEntry = rootDir->entry(QStringLiteral("word")); 0195 if (!wordEntry->isDirectory()) { 0196 qWarning() << "Invalid document structure (word is not a directory)"; 0197 return; 0198 } 0199 0200 const KArchiveDirectory* wordDirectory = dynamic_cast<const KArchiveDirectory*>(wordEntry); 0201 const QStringList wordEntries = wordDirectory->entries(); 0202 0203 if (wordEntries.contains(QStringLiteral("document.xml"))) { 0204 const KArchiveFile* file = wordDirectory->file(QStringLiteral("document.xml")); 0205 0206 if (file) { 0207 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()}; 0208 extractTextWithTag(contentIODevice.get(), QStringLiteral("w:t"), result); 0209 } 0210 } 0211 } 0212 0213 else if (rootEntries.contains(QStringLiteral("xl"))) { 0214 result->addType(Type::Document); 0215 result->addType(Type::Spreadsheet); 0216 0217 if (!extractPlainText) { 0218 return; 0219 } 0220 0221 const KArchiveEntry* xlEntry = rootDir->entry(QStringLiteral("xl")); 0222 if (!xlEntry->isDirectory()) { 0223 qWarning() << "Invalid document structure (xl is not a directory)"; 0224 return; 0225 } 0226 0227 const auto xlDirectory = dynamic_cast<const KArchiveDirectory*>(xlEntry); 0228 // TODO: Read the sheets from worksheets/*.xml, and dereference all cells 0229 // values in order 0230 const KArchiveFile* file = xlDirectory->file(QStringLiteral("sharedStrings.xml")); 0231 if (!file) { 0232 return; 0233 } 0234 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()}; 0235 extractTextWithTag(contentIODevice.get(), QStringLiteral("t"), result); 0236 } 0237 0238 else if (rootEntries.contains(QStringLiteral("ppt"))) { 0239 result->addType(Type::Document); 0240 result->addType(Type::Presentation); 0241 0242 if (!extractPlainText) { 0243 return; 0244 } 0245 0246 const KArchiveEntry* pptEntry = rootDir->entry(QStringLiteral("ppt")); 0247 if (!pptEntry->isDirectory()) { 0248 qWarning() << "Invalid document structure (ppt is not a directory)"; 0249 return; 0250 } 0251 0252 const auto pptDirectory = dynamic_cast<const KArchiveDirectory*>(pptEntry); 0253 const auto slidesEntry = pptDirectory->entry(QStringLiteral("slides")); 0254 if (!slidesEntry || !slidesEntry->isDirectory()) { 0255 return; 0256 } 0257 0258 const auto slidesDirectory = dynamic_cast<const KArchiveDirectory*>(slidesEntry); 0259 QStringList entries = slidesDirectory->entries(); 0260 // TODO: Read the actual order from presentation.xml, and follow the 0261 // references in ppt/_rels/presentation.xml.rel 0262 std::sort(entries.begin(), entries.end()); 0263 for (const QString & entryName : std::as_const(entries)) { 0264 const KArchiveFile* file = slidesDirectory->file(entryName); 0265 if (!file) { 0266 continue; 0267 } 0268 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()}; 0269 extractTextWithTag(contentIODevice.get(), QStringLiteral("a:t"), result); 0270 } 0271 } 0272 } 0273 0274 void Office2007Extractor::extractTextWithTag(QIODevice* device, const QString& tag, ExtractionResult* result) 0275 { 0276 QXmlStreamReader xml(device); 0277 0278 while (!xml.atEnd()) { 0279 xml.readNext(); 0280 if (xml.qualifiedName().startsWith(tag) && xml.isStartElement()) { 0281 QString str = xml.readElementText(QXmlStreamReader::IncludeChildElements); 0282 0283 if (!str.isEmpty()) { 0284 result->append(str); 0285 } 0286 } 0287 0288 if (xml.isEndDocument() || xml.hasError()) { 0289 break; 0290 } 0291 } 0292 } 0293 0294 #include "moc_office2007extractor.cpp"