File indexing completed on 2025-03-16 12:49:36
0001 /* 0002 SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in> 0003 0004 SPDX-License-Identifier: LGPL-2.1-or-later 0005 */ 0006 0007 0008 #include "office2007extractor.h" 0009 #include <memory> 0010 0011 #include <KZip> 0012 0013 #include <QDebug> 0014 #include <QDomDocument> 0015 #include <QXmlStreamReader> 0016 0017 using namespace KFileMetaData; 0018 0019 Office2007Extractor::Office2007Extractor(QObject* parent) 0020 : ExtractorPlugin(parent) 0021 { 0022 0023 } 0024 0025 const QStringList supportedMimeTypes = { 0026 QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), 0027 QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.template"), 0028 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.presentation"), 0029 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slide"), 0030 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slideshow"), 0031 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.template"), 0032 QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), 0033 QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.template"), 0034 }; 0035 0036 QStringList Office2007Extractor::mimetypes() const 0037 { 0038 return supportedMimeTypes; 0039 } 0040 0041 void Office2007Extractor::extract(ExtractionResult* result) 0042 { 0043 KZip zip(result->inputUrl()); 0044 if (!zip.open(QIODevice::ReadOnly)) { 0045 qWarning() << "Document is not a valid ZIP archive"; 0046 return; 0047 } 0048 0049 const KArchiveDirectory* rootDir = zip.directory(); 0050 if (!rootDir) { 0051 qWarning() << "Invalid document structure (main directory is missing)"; 0052 return; 0053 } 0054 0055 const QStringList rootEntries = rootDir->entries(); 0056 if (!rootEntries.contains(QStringLiteral("docProps"))) { 0057 qWarning() << "Invalid document structure (docProps is missing)"; 0058 return; 0059 } 0060 0061 const KArchiveEntry* docPropEntry = rootDir->entry(QStringLiteral("docProps")); 0062 if (!docPropEntry->isDirectory()) { 0063 qWarning() << "Invalid document structure (docProps is not a directory)"; 0064 return; 0065 } 0066 0067 const KArchiveDirectory* docPropDirectory = dynamic_cast<const KArchiveDirectory*>(docPropEntry); 0068 0069 const bool extractMetaData = result->inputFlags() & ExtractionResult::ExtractMetaData; 0070 0071 const KArchiveFile* file = docPropDirectory->file(QStringLiteral("core.xml")); 0072 if (extractMetaData && file) { 0073 QDomDocument coreDoc(QStringLiteral("core")); 0074 coreDoc.setContent(file->data()); 0075 0076 QDomElement docElem = coreDoc.documentElement(); 0077 0078 QDomElement elem = docElem.firstChildElement(QStringLiteral("dc:description")); 0079 if (!elem.isNull()) { 0080 QString str = elem.text(); 0081 if (!str.isEmpty()) { 0082 result->add(Property::Description, str); 0083 } 0084 } 0085 0086 elem = docElem.firstChildElement(QStringLiteral("dc:subject")); 0087 if (!elem.isNull()) { 0088 QString str = elem.text(); 0089 if (!str.isEmpty()) { 0090 result->add(Property::Subject, str); 0091 } 0092 } 0093 0094 elem = docElem.firstChildElement(QStringLiteral("dc:title")); 0095 if (!elem.isNull()) { 0096 QString str = elem.text(); 0097 if (!str.isEmpty()) { 0098 result->add(Property::Title, str); 0099 } 0100 } 0101 0102 elem = docElem.firstChildElement(QStringLiteral("dc:creator")); 0103 if (!elem.isNull()) { 0104 QString str = elem.text(); 0105 if (!str.isEmpty()) { 0106 result->add(Property::Author, str); 0107 } 0108 } 0109 0110 elem = docElem.firstChildElement(QStringLiteral("dc:language")); 0111 if (!elem.isNull()) { 0112 QString str = elem.text(); 0113 if (!str.isEmpty()) { 0114 result->add(Property::Language, str); 0115 } 0116 } 0117 0118 elem = docElem.firstChildElement(QStringLiteral("dcterms:created")); 0119 if (!elem.isNull()) { 0120 QString str = elem.text(); 0121 QDateTime dt = dateTimeFromString(str); 0122 if (!dt.isNull()) { 0123 result->add(Property::CreationDate, dt); 0124 } 0125 } 0126 0127 elem = docElem.firstChildElement(QStringLiteral("cp:keywords")); 0128 if (!elem.isNull()) { 0129 QString str = elem.text(); 0130 if (!str.isEmpty()) { 0131 result->add(Property::Keywords, str); 0132 } 0133 } 0134 } 0135 0136 file = docPropDirectory->file(QStringLiteral("app.xml")); 0137 if (extractMetaData && file) { 0138 QDomDocument appDoc(QStringLiteral("app")); 0139 appDoc.setContent(file->data()); 0140 0141 QDomElement docElem = appDoc.documentElement(); 0142 0143 const QString mimeType = result->inputMimetype(); 0144 if (mimeType == QLatin1String("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) { 0145 QDomElement elem = docElem.firstChildElement(QStringLiteral("Pages")); 0146 if (!elem.isNull()) { 0147 bool ok = false; 0148 int pageCount = elem.text().toInt(&ok); 0149 if (ok) { 0150 result->add(Property::PageCount, pageCount); 0151 } 0152 } 0153 0154 elem = docElem.firstChildElement(QStringLiteral("Words")); 0155 if (!elem.isNull()) { 0156 bool ok = false; 0157 int wordCount = elem.text().toInt(&ok); 0158 if (ok) { 0159 result->add(Property::WordCount, wordCount); 0160 } 0161 } 0162 0163 elem = docElem.firstChildElement(QStringLiteral("Lines")); 0164 if (!elem.isNull()) { 0165 bool ok = false; 0166 int lineCount = elem.text().toInt(&ok); 0167 if (ok) { 0168 result->add(Property::LineCount, lineCount); 0169 } 0170 } 0171 } 0172 0173 QDomElement elem = docElem.firstChildElement(QStringLiteral("Application")); 0174 if (!elem.isNull()) { 0175 QString app = elem.text(); 0176 if (!app.isEmpty()) { 0177 result->add(Property::Generator, app); 0178 } 0179 } 0180 } 0181 0182 // 0183 // Plain Text 0184 // 0185 bool extractPlainText = (result->inputFlags() & ExtractionResult::ExtractPlainText); 0186 0187 if (rootEntries.contains(QStringLiteral("word"))) { 0188 result->addType(Type::Document); 0189 0190 if (!extractPlainText) { 0191 return; 0192 } 0193 0194 const KArchiveEntry* wordEntry = rootDir->entry(QStringLiteral("word")); 0195 if (!wordEntry->isDirectory()) { 0196 qWarning() << "Invalid document structure (word is not a directory)"; 0197 return; 0198 } 0199 0200 const KArchiveDirectory* wordDirectory = dynamic_cast<const KArchiveDirectory*>(wordEntry); 0201 const QStringList wordEntries = wordDirectory->entries(); 0202 0203 if (wordEntries.contains(QStringLiteral("document.xml"))) { 0204 const KArchiveFile* file = wordDirectory->file(QStringLiteral("document.xml")); 0205 0206 if (file) { 0207 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()}; 0208 extractTextWithTag(contentIODevice.get(), QStringLiteral("w:t"), result); 0209 } 0210 } 0211 } 0212 0213 else if (rootEntries.contains(QStringLiteral("xl"))) { 0214 result->addType(Type::Document); 0215 result->addType(Type::Spreadsheet); 0216 0217 if (!extractPlainText) { 0218 return; 0219 } 0220 0221 const KArchiveEntry* xlEntry = rootDir->entry(QStringLiteral("xl")); 0222 if (!xlEntry->isDirectory()) { 0223 qWarning() << "Invalid document structure (xl is not a directory)"; 0224 return; 0225 } 0226 0227 const KArchiveDirectory* xlDirectory = dynamic_cast<const KArchiveDirectory*>(xlEntry); 0228 extractTextFromFiles(xlDirectory, result); 0229 } 0230 0231 else if (rootEntries.contains(QStringLiteral("ppt"))) { 0232 result->addType(Type::Document); 0233 result->addType(Type::Presentation); 0234 0235 if (!extractPlainText) { 0236 return; 0237 } 0238 0239 const KArchiveEntry* pptEntry = rootDir->entry(QStringLiteral("ppt")); 0240 if (!pptEntry->isDirectory()) { 0241 qWarning() << "Invalid document structure (ppt is not a directory)"; 0242 return; 0243 } 0244 0245 const KArchiveDirectory* pptDirectory = dynamic_cast<const KArchiveDirectory*>(pptEntry); 0246 extractTextFromFiles(pptDirectory, result); 0247 } 0248 } 0249 0250 void Office2007Extractor::extractAllText(QIODevice* device, ExtractionResult* result) 0251 { 0252 QXmlStreamReader xml(device); 0253 0254 while (!xml.atEnd()) { 0255 xml.readNext(); 0256 if (xml.isCharacters()) { 0257 QString str = xml.text().toString(); 0258 result->append(str); 0259 } 0260 0261 if (xml.isEndDocument() || xml.hasError()) { 0262 break; 0263 } 0264 } 0265 } 0266 0267 void Office2007Extractor::extractTextFromFiles(const KArchiveDirectory* archiveDir, ExtractionResult* result) 0268 { 0269 const QStringList entries = archiveDir->entries(); 0270 for (const QString & entryName : entries) { 0271 const KArchiveEntry* entry = archiveDir->entry(entryName); 0272 if (!entry) { 0273 continue; 0274 } 0275 if (entry->isDirectory()) { 0276 const KArchiveDirectory* subDir = dynamic_cast<const KArchiveDirectory*>(entry); 0277 extractTextFromFiles(subDir, result); 0278 continue; 0279 } 0280 0281 if (entry->isFile() && entryName.endsWith(QLatin1String(".xml"))) { 0282 const KArchiveFile* file = static_cast<const KArchiveFile*>(entry); 0283 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()}; 0284 extractAllText(contentIODevice.get() , result); 0285 } 0286 } 0287 } 0288 0289 void Office2007Extractor::extractTextWithTag(QIODevice* device, const QString& tag, ExtractionResult* result) 0290 { 0291 QXmlStreamReader xml(device); 0292 0293 while (!xml.atEnd()) { 0294 xml.readNext(); 0295 if (xml.qualifiedName().startsWith(tag) && xml.isStartElement()) { 0296 QString str = xml.readElementText(QXmlStreamReader::IncludeChildElements); 0297 0298 if (!str.isEmpty()) { 0299 result->append(str); 0300 } 0301 } 0302 0303 if (xml.isEndDocument() || xml.hasError()) { 0304 break; 0305 } 0306 } 0307 } 0308 0309 #include "moc_office2007extractor.cpp"