File indexing completed on 2024-05-12 15:37:05

0001 /*
0002     SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in>
0003 
0004     SPDX-License-Identifier: LGPL-2.1-or-later
0005 */
0006 
0007 
0008 #include "office2007extractor.h"
0009 #include <memory>
0010 
0011 #include <KZip>
0012 
0013 #include <QDebug>
0014 #include <QDomDocument>
0015 #include <QXmlStreamReader>
0016 
0017 using namespace KFileMetaData;
0018 
0019 Office2007Extractor::Office2007Extractor(QObject* parent)
0020     : ExtractorPlugin(parent)
0021 {
0022 
0023 }
0024 
0025 const QStringList supportedMimeTypes = {
0026     QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
0027     QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.template"),
0028     QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
0029     QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slide"),
0030     QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slideshow"),
0031     QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.template"),
0032     QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
0033     QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.template"),
0034 };
0035 
0036 QStringList Office2007Extractor::mimetypes() const
0037 {
0038     return supportedMimeTypes;
0039 }
0040 
0041 void Office2007Extractor::extract(ExtractionResult* result)
0042 {
0043     KZip zip(result->inputUrl());
0044     if (!zip.open(QIODevice::ReadOnly)) {
0045         qWarning() << "Document is not a valid ZIP archive";
0046         return;
0047     }
0048 
0049     const KArchiveDirectory* rootDir = zip.directory();
0050     if (!rootDir) {
0051         qWarning() << "Invalid document structure (main directory is missing)";
0052         return;
0053     }
0054 
0055     const QStringList rootEntries = rootDir->entries();
0056     if (!rootEntries.contains(QStringLiteral("docProps"))) {
0057         qWarning() << "Invalid document structure (docProps is missing)";
0058         return;
0059     }
0060 
0061     const KArchiveEntry* docPropEntry = rootDir->entry(QStringLiteral("docProps"));
0062     if (!docPropEntry->isDirectory()) {
0063         qWarning() << "Invalid document structure (docProps is not a directory)";
0064         return;
0065     }
0066 
0067     const KArchiveDirectory* docPropDirectory = dynamic_cast<const KArchiveDirectory*>(docPropEntry);
0068 
0069     const bool extractMetaData = result->inputFlags() & ExtractionResult::ExtractMetaData;
0070 
0071     const KArchiveFile* file = docPropDirectory->file(QStringLiteral("core.xml"));
0072     if (extractMetaData && file) {
0073         QDomDocument coreDoc(QStringLiteral("core"));
0074         coreDoc.setContent(file->data());
0075 
0076         QDomElement docElem = coreDoc.documentElement();
0077 
0078         QDomElement elem = docElem.firstChildElement(QStringLiteral("dc:description"));
0079         if (!elem.isNull()) {
0080             QString str = elem.text();
0081             if (!str.isEmpty()) {
0082                 result->add(Property::Description, str);
0083             }
0084         }
0085 
0086         elem = docElem.firstChildElement(QStringLiteral("dc:subject"));
0087         if (!elem.isNull()) {
0088             QString str = elem.text();
0089             if (!str.isEmpty()) {
0090                 result->add(Property::Subject, str);
0091             }
0092         }
0093 
0094         elem = docElem.firstChildElement(QStringLiteral("dc:title"));
0095         if (!elem.isNull()) {
0096             QString str = elem.text();
0097             if (!str.isEmpty()) {
0098                 result->add(Property::Title, str);
0099             }
0100         }
0101 
0102         elem = docElem.firstChildElement(QStringLiteral("dc:creator"));
0103         if (!elem.isNull()) {
0104             QString str = elem.text();
0105             if (!str.isEmpty()) {
0106                 result->add(Property::Author, str);
0107             }
0108         }
0109 
0110         elem = docElem.firstChildElement(QStringLiteral("dc:language"));
0111         if (!elem.isNull()) {
0112             QString str = elem.text();
0113             if (!str.isEmpty()) {
0114                 result->add(Property::Language, str);
0115             }
0116         }
0117 
0118         elem = docElem.firstChildElement(QStringLiteral("dcterms:created"));
0119         if (!elem.isNull()) {
0120             QString str = elem.text();
0121             QDateTime dt = dateTimeFromString(str);
0122             if (!dt.isNull()) {
0123                 result->add(Property::CreationDate, dt);
0124             }
0125         }
0126 
0127         elem = docElem.firstChildElement(QStringLiteral("cp:keywords"));
0128         if (!elem.isNull()) {
0129             QString str = elem.text();
0130             if (!str.isEmpty()) {
0131                 result->add(Property::Keywords, str);
0132             }
0133         }
0134     }
0135 
0136     file = docPropDirectory->file(QStringLiteral("app.xml"));
0137     if (extractMetaData && file) {
0138         QDomDocument appDoc(QStringLiteral("app"));
0139         appDoc.setContent(file->data());
0140 
0141         QDomElement docElem = appDoc.documentElement();
0142 
0143         const QString mimeType = result->inputMimetype();
0144         if (mimeType == QLatin1String("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
0145             QDomElement elem = docElem.firstChildElement(QStringLiteral("Pages"));
0146             if (!elem.isNull()) {
0147                 bool ok = false;
0148                 int pageCount = elem.text().toInt(&ok);
0149                 if (ok) {
0150                     result->add(Property::PageCount, pageCount);
0151                 }
0152             }
0153 
0154             elem = docElem.firstChildElement(QStringLiteral("Words"));
0155             if (!elem.isNull()) {
0156                 bool ok = false;
0157                 int wordCount = elem.text().toInt(&ok);
0158                 if (ok) {
0159                     result->add(Property::WordCount, wordCount);
0160                 }
0161             }
0162 
0163             elem = docElem.firstChildElement(QStringLiteral("Lines"));
0164             if (!elem.isNull()) {
0165                 bool ok = false;
0166                 int lineCount = elem.text().toInt(&ok);
0167                 if (ok) {
0168                     result->add(Property::LineCount, lineCount);
0169                 }
0170             }
0171         }
0172 
0173         QDomElement elem = docElem.firstChildElement(QStringLiteral("Application"));
0174         if (!elem.isNull()) {
0175             QString app = elem.text();
0176             if (!app.isEmpty()) {
0177                 result->add(Property::Generator, app);
0178             }
0179         }
0180     }
0181 
0182     //
0183     // Plain Text
0184     //
0185     bool extractPlainText = (result->inputFlags() & ExtractionResult::ExtractPlainText);
0186 
0187     if (rootEntries.contains(QStringLiteral("word"))) {
0188         result->addType(Type::Document);
0189 
0190         if (!extractPlainText) {
0191             return;
0192         }
0193 
0194         const KArchiveEntry* wordEntry = rootDir->entry(QStringLiteral("word"));
0195         if (!wordEntry->isDirectory()) {
0196             qWarning() << "Invalid document structure (word is not a directory)";
0197             return;
0198         }
0199 
0200         const KArchiveDirectory* wordDirectory = dynamic_cast<const KArchiveDirectory*>(wordEntry);
0201         const QStringList wordEntries = wordDirectory->entries();
0202 
0203         if (wordEntries.contains(QStringLiteral("document.xml"))) {
0204             const KArchiveFile* file = wordDirectory->file(QStringLiteral("document.xml"));
0205 
0206             if (file) {
0207                 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
0208                 extractTextWithTag(contentIODevice.get(), QStringLiteral("w:t"), result);
0209             }
0210         }
0211     }
0212 
0213     else if (rootEntries.contains(QStringLiteral("xl"))) {
0214         result->addType(Type::Document);
0215         result->addType(Type::Spreadsheet);
0216 
0217         if (!extractPlainText) {
0218             return;
0219         }
0220 
0221         const KArchiveEntry* xlEntry = rootDir->entry(QStringLiteral("xl"));
0222         if (!xlEntry->isDirectory()) {
0223             qWarning() << "Invalid document structure (xl is not a directory)";
0224             return;
0225         }
0226 
0227         const KArchiveDirectory* xlDirectory = dynamic_cast<const KArchiveDirectory*>(xlEntry);
0228         extractTextFromFiles(xlDirectory, result);
0229     }
0230 
0231     else if (rootEntries.contains(QStringLiteral("ppt"))) {
0232         result->addType(Type::Document);
0233         result->addType(Type::Presentation);
0234 
0235         if (!extractPlainText) {
0236             return;
0237         }
0238 
0239         const KArchiveEntry* pptEntry = rootDir->entry(QStringLiteral("ppt"));
0240         if (!pptEntry->isDirectory()) {
0241             qWarning() << "Invalid document structure (ppt is not a directory)";
0242             return;
0243         }
0244 
0245         const KArchiveDirectory* pptDirectory = dynamic_cast<const KArchiveDirectory*>(pptEntry);
0246         extractTextFromFiles(pptDirectory, result);
0247     }
0248 }
0249 
0250 void Office2007Extractor::extractAllText(QIODevice* device, ExtractionResult* result)
0251 {
0252     QXmlStreamReader xml(device);
0253 
0254     while (!xml.atEnd()) {
0255         xml.readNext();
0256         if (xml.isCharacters()) {
0257             QString str = xml.text().toString();
0258             result->append(str);
0259         }
0260 
0261         if (xml.isEndDocument() || xml.hasError()) {
0262             break;
0263         }
0264     }
0265 }
0266 
0267 void Office2007Extractor::extractTextFromFiles(const KArchiveDirectory* archiveDir, ExtractionResult* result)
0268 {
0269     const QStringList entries = archiveDir->entries();
0270     for (const QString & entryName : entries) {
0271         const KArchiveEntry* entry = archiveDir->entry(entryName);
0272         if (!entry) {
0273             continue;
0274         }
0275         if (entry->isDirectory()) {
0276             const KArchiveDirectory* subDir = dynamic_cast<const KArchiveDirectory*>(entry);
0277             extractTextFromFiles(subDir, result);
0278             continue;
0279         }
0280 
0281         if (entry->isFile() && entryName.endsWith(QLatin1String(".xml"))) {
0282             const KArchiveFile* file = static_cast<const KArchiveFile*>(entry);
0283             std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
0284             extractAllText(contentIODevice.get() , result);
0285         }
0286     }
0287 }
0288 
0289 void Office2007Extractor::extractTextWithTag(QIODevice* device, const QString& tag, ExtractionResult* result)
0290 {
0291     QXmlStreamReader xml(device);
0292 
0293     while (!xml.atEnd()) {
0294         xml.readNext();
0295         if (xml.qualifiedName().startsWith(tag) && xml.isStartElement()) {
0296             QString str = xml.readElementText(QXmlStreamReader::IncludeChildElements);
0297 
0298             if (!str.isEmpty()) {
0299                 result->append(str);
0300             }
0301         }
0302 
0303         if (xml.isEndDocument() || xml.hasError()) {
0304             break;
0305         }
0306     }
0307 }
0308 
0309 #include "moc_office2007extractor.cpp"