File indexing completed on 2024-05-12 15:37:05

0001 /*
0002     SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in>
0003     SPDX-FileCopyrightText: 2012 Jörg Ehrichs <joerg.ehrichs@gmx.de>
0004     SPDX-FileCopyrightText: 2016 Christoph Cullmann <cullmann@kde.org>
0005 
0006     SPDX-License-Identifier: LGPL-2.1-or-later
0007 */
0008 
0009 
0010 #include "odfextractor.h"
0011 #include <memory>
0012 
0013 #include <KZip>
0014 
0015 #include <QDebug>
0016 #include <QDomDocument>
0017 #include <QFile>
0018 #include <QXmlStreamReader>
0019 
0020 namespace {
0021 
0022 inline QString dcNS()     { return QStringLiteral("http://purl.org/dc/elements/1.1/"); }
0023 inline QString metaNS()   { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:meta:1.0"); }
0024 inline QString officeNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:office:1.0"); }
0025 inline QString bodyTag()  { return QStringLiteral("body"); }
0026 
0027 QDomElement firstChildElementNS(const QDomNode &node, const QString &nsURI, const QString &localName)
0028 {
0029     for (auto e = node.firstChildElement(); !e.isNull(); e = e.nextSiblingElement()) {
0030         if (e.localName() == localName && e.namespaceURI() == nsURI) {
0031             return e;
0032         }
0033     }
0034 
0035     return QDomElement();
0036 }
0037 
0038 const QStringList supportedMimeTypes = {
0039     QStringLiteral("application/vnd.oasis.opendocument.text"),
0040     QStringLiteral("application/vnd.oasis.opendocument.text-template"),
0041     QStringLiteral("application/vnd.oasis.opendocument.text-master"),
0042     QStringLiteral("application/vnd.oasis.opendocument.text-master-template"),
0043     QStringLiteral("application/vnd.oasis.opendocument.text-flat-xml"),
0044     QStringLiteral("application/vnd.oasis.opendocument.presentation"),
0045     QStringLiteral("application/vnd.oasis.opendocument.presentation-template"),
0046     QStringLiteral("application/vnd.oasis.opendocument.presentation-flat-xml"),
0047     QStringLiteral("application/vnd.oasis.opendocument.spreadsheet"),
0048     QStringLiteral("application/vnd.oasis.opendocument.spreadsheet-template"),
0049     QStringLiteral("application/vnd.oasis.opendocument.spreadsheet-flat-xml"),
0050     QStringLiteral("application/vnd.oasis.opendocument.graphics"),
0051     QStringLiteral("application/vnd.oasis.opendocument.graphics-template"),
0052     QStringLiteral("application/vnd.oasis.opendocument.graphics-flat-xml"),
0053 };
0054 
0055 }
0056 
0057 using namespace KFileMetaData;
0058 
0059 OdfExtractor::OdfExtractor(QObject* parent)
0060     : ExtractorPlugin(parent)
0061 {
0062 
0063 }
0064 
0065 QStringList OdfExtractor::mimetypes() const
0066 {
0067     return supportedMimeTypes;
0068 }
0069 
0070 void OdfExtractor::extract(ExtractionResult* result)
0071 {
0072     if (result->inputMimetype().endsWith(QLatin1String("-flat-xml"))) {
0073         QFile file(result->inputUrl());
0074         if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
0075             return;
0076         }
0077 
0078         result->addType(Type::Document);
0079         if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation-flat-xml")) {
0080             result->addType(Type::Presentation);
0081         } else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet-flat-xml")) {
0082             result->addType(Type::Spreadsheet);
0083         } else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics-flat-xml")) {
0084             result->addType(Type::Image);
0085         }
0086 
0087         if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
0088             parseMetaData(QStringLiteral("document"), file.readAll(), result);
0089         }
0090 
0091         if (result->inputFlags() & ExtractionResult::ExtractPlainText) {
0092             file.seek(0);
0093             extractPlainText(&file, result);
0094         }
0095 
0096         return;
0097     }
0098 
0099     KZip zip(result->inputUrl());
0100     if (!zip.open(QIODevice::ReadOnly)) {
0101         qWarning() << "Document is not a valid ZIP archive";
0102         return;
0103     }
0104 
0105     const KArchiveDirectory* directory = zip.directory();
0106     if (!directory) {
0107         qWarning() << "Invalid document structure (main directory is missing)";
0108         return;
0109     }
0110 
0111     // we need a meta xml file in the archive!
0112     const auto metaXml = directory->file(QStringLiteral("meta.xml"));
0113     if (!metaXml) {
0114         qWarning() << "Invalid document structure (meta.xml is missing)";
0115         return;
0116     }
0117 
0118     if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
0119         parseMetaData(QStringLiteral("document-meta"), metaXml->data(), result);
0120     }
0121 
0122     result->addType(Type::Document);
0123     if ((result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation")) ||
0124         (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation-template"))) {
0125         result->addType(Type::Presentation);
0126     }
0127     else if ((result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet")) ||
0128              (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet-template"))) {
0129         result->addType(Type::Spreadsheet);
0130     }
0131     else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics") ||
0132              result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics-template")) {
0133         result->addType(Type::Image);
0134     }
0135 
0136     if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) {
0137         return;
0138     }
0139 
0140     // for content indexing, we need content xml file
0141     const auto contentXml = directory->file(QStringLiteral("content.xml"));
0142     if (!contentXml) {
0143         qWarning() << "Invalid document structure (content.xml is missing)";
0144         return;
0145     }
0146 
0147     std::unique_ptr<QIODevice> contentIODevice{contentXml->createDevice()};
0148     extractPlainText(contentIODevice.get(), result);
0149 }
0150 
0151 void OdfExtractor::parseMetaData(const QString &documentElementId, const QByteArray &data, ExtractionResult *result)
0152 {
0153     QDomDocument metaData(QStringLiteral("metaData"));
0154     metaData.setContent(data, true);
0155 
0156     // parse metadata ...
0157     QDomElement meta = firstChildElementNS(firstChildElementNS(metaData,
0158                                                                officeNS(), documentElementId),
0159                                            officeNS(), QStringLiteral("meta"));
0160 
0161     QDomNode n = meta.firstChild();
0162     while (!n.isNull()) {
0163         QDomElement e = n.toElement();
0164         if (!e.isNull()) {
0165             const QString namespaceURI = e.namespaceURI();
0166             const QString localName = e.localName();
0167 
0168             // Dublin Core
0169             if (namespaceURI == dcNS()) {
0170                 if (localName == QLatin1String("description")) {
0171                     result->add(Property::Description, e.text());
0172                 } else if (localName == QLatin1String("subject")) {
0173                     result->add(Property::Subject, e.text());
0174                 } else if (localName == QLatin1String("title")) {
0175                     result->add(Property::Title, e.text());
0176                 } else if (localName == QLatin1String("creator")) {
0177                     result->add(Property::Author, e.text());
0178                 } else if (localName == QLatin1String("language")) {
0179                     result->add(Property::Language, e.text());
0180                 }
0181             }
0182             // Meta Properties
0183             else if (namespaceURI == metaNS()) {
0184                 if (localName == QLatin1String("document-statistic")) {
0185                     bool ok = false;
0186                     int pageCount = e.attributeNS(metaNS(), QStringLiteral("page-count")).toInt(&ok);
0187                     if (ok) {
0188                         result->add(Property::PageCount, pageCount);
0189                     }
0190 
0191                     int wordCount = e.attributeNS(metaNS(), QStringLiteral("word-count")).toInt(&ok);
0192                     if (ok) {
0193                         result->add(Property::WordCount, wordCount);
0194                     }
0195                 } else if (localName == QLatin1String("keyword")) {
0196                     QString keywords = e.text();
0197                     result->add(Property::Keywords, keywords);
0198                 } else if (localName == QLatin1String("generator")) {
0199                     result->add(Property::Generator, e.text());
0200                 } else if (localName == QLatin1String("creation-date")) {
0201                     QDateTime dt = ExtractorPlugin::dateTimeFromString(e.text());
0202                     if (!dt.isNull()) {
0203                         result->add(Property::CreationDate, dt);
0204                     }
0205                 }
0206             }
0207         }
0208         n = n.nextSibling();
0209     }
0210 }
0211 
0212 void OdfExtractor::extractPlainText(QIODevice *device, ExtractionResult *result)
0213 {
0214     bool inOfficeBody = false;
0215 
0216     QXmlStreamReader xml(device);
0217     while (!xml.atEnd()) {
0218         xml.readNext();
0219 
0220         if (xml.isStartElement() && !inOfficeBody && xml.namespaceUri() == officeNS() && xml.name() == bodyTag()) {
0221             inOfficeBody = true;
0222         } else if (xml.isEndElement() && inOfficeBody && xml.namespaceUri() == officeNS() && xml.name() == bodyTag()) {
0223             break;
0224         }
0225 
0226         if (inOfficeBody && xml.isCharacters() && !xml.isWhitespace()) {
0227             const QString str = xml.text().toString();
0228             result->append(str);
0229         }
0230 
0231         if (xml.hasError() || xml.isEndDocument()) {
0232             break;
0233         }
0234     }
0235 }
0236 
0237 #include "moc_odfextractor.cpp"