File indexing completed on 2025-03-16 12:49:35
0001 /* 0002 SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in> 0003 SPDX-FileCopyrightText: 2012 Jörg Ehrichs <joerg.ehrichs@gmx.de> 0004 SPDX-FileCopyrightText: 2016 Christoph Cullmann <cullmann@kde.org> 0005 0006 SPDX-License-Identifier: LGPL-2.1-or-later 0007 */ 0008 0009 0010 #include "odfextractor.h" 0011 #include <memory> 0012 0013 #include <KZip> 0014 0015 #include <QDebug> 0016 #include <QDomDocument> 0017 #include <QFile> 0018 #include <QXmlStreamReader> 0019 0020 namespace { 0021 0022 inline QString dcNS() { return QStringLiteral("http://purl.org/dc/elements/1.1/"); } 0023 inline QString metaNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:meta:1.0"); } 0024 inline QString officeNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:office:1.0"); } 0025 inline QString bodyTag() { return QStringLiteral("body"); } 0026 0027 QDomElement firstChildElementNS(const QDomNode &node, const QString &nsURI, const QString &localName) 0028 { 0029 for (auto e = node.firstChildElement(); !e.isNull(); e = e.nextSiblingElement()) { 0030 if (e.localName() == localName && e.namespaceURI() == nsURI) { 0031 return e; 0032 } 0033 } 0034 0035 return QDomElement(); 0036 } 0037 0038 const QStringList supportedMimeTypes = { 0039 QStringLiteral("application/vnd.oasis.opendocument.text"), 0040 QStringLiteral("application/vnd.oasis.opendocument.text-template"), 0041 QStringLiteral("application/vnd.oasis.opendocument.text-master"), 0042 QStringLiteral("application/vnd.oasis.opendocument.text-master-template"), 0043 QStringLiteral("application/vnd.oasis.opendocument.text-flat-xml"), 0044 QStringLiteral("application/vnd.oasis.opendocument.presentation"), 0045 QStringLiteral("application/vnd.oasis.opendocument.presentation-template"), 0046 QStringLiteral("application/vnd.oasis.opendocument.presentation-flat-xml"), 0047 QStringLiteral("application/vnd.oasis.opendocument.spreadsheet"), 0048 QStringLiteral("application/vnd.oasis.opendocument.spreadsheet-template"), 0049 QStringLiteral("application/vnd.oasis.opendocument.spreadsheet-flat-xml"), 0050 QStringLiteral("application/vnd.oasis.opendocument.graphics"), 0051 QStringLiteral("application/vnd.oasis.opendocument.graphics-template"), 0052 QStringLiteral("application/vnd.oasis.opendocument.graphics-flat-xml"), 0053 }; 0054 0055 } 0056 0057 using namespace KFileMetaData; 0058 0059 OdfExtractor::OdfExtractor(QObject* parent) 0060 : ExtractorPlugin(parent) 0061 { 0062 0063 } 0064 0065 QStringList OdfExtractor::mimetypes() const 0066 { 0067 return supportedMimeTypes; 0068 } 0069 0070 void OdfExtractor::extract(ExtractionResult* result) 0071 { 0072 if (result->inputMimetype().endsWith(QLatin1String("-flat-xml"))) { 0073 QFile file(result->inputUrl()); 0074 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) { 0075 return; 0076 } 0077 0078 result->addType(Type::Document); 0079 if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation-flat-xml")) { 0080 result->addType(Type::Presentation); 0081 } else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet-flat-xml")) { 0082 result->addType(Type::Spreadsheet); 0083 } else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics-flat-xml")) { 0084 result->addType(Type::Image); 0085 } 0086 0087 if (result->inputFlags() & ExtractionResult::ExtractMetaData) { 0088 parseMetaData(QStringLiteral("document"), file.readAll(), result); 0089 } 0090 0091 if (result->inputFlags() & ExtractionResult::ExtractPlainText) { 0092 file.seek(0); 0093 extractPlainText(&file, result); 0094 } 0095 0096 return; 0097 } 0098 0099 KZip zip(result->inputUrl()); 0100 if (!zip.open(QIODevice::ReadOnly)) { 0101 qWarning() << "Document is not a valid ZIP archive"; 0102 return; 0103 } 0104 0105 const KArchiveDirectory* directory = zip.directory(); 0106 if (!directory) { 0107 qWarning() << "Invalid document structure (main directory is missing)"; 0108 return; 0109 } 0110 0111 // we need a meta xml file in the archive! 0112 const auto metaXml = directory->file(QStringLiteral("meta.xml")); 0113 if (!metaXml) { 0114 qWarning() << "Invalid document structure (meta.xml is missing)"; 0115 return; 0116 } 0117 0118 if (result->inputFlags() & ExtractionResult::ExtractMetaData) { 0119 parseMetaData(QStringLiteral("document-meta"), metaXml->data(), result); 0120 } 0121 0122 result->addType(Type::Document); 0123 if ((result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation")) || 0124 (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation-template"))) { 0125 result->addType(Type::Presentation); 0126 } 0127 else if ((result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet")) || 0128 (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet-template"))) { 0129 result->addType(Type::Spreadsheet); 0130 } 0131 else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics") || 0132 result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics-template")) { 0133 result->addType(Type::Image); 0134 } 0135 0136 if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) { 0137 return; 0138 } 0139 0140 // for content indexing, we need content xml file 0141 const auto contentXml = directory->file(QStringLiteral("content.xml")); 0142 if (!contentXml) { 0143 qWarning() << "Invalid document structure (content.xml is missing)"; 0144 return; 0145 } 0146 0147 std::unique_ptr<QIODevice> contentIODevice{contentXml->createDevice()}; 0148 extractPlainText(contentIODevice.get(), result); 0149 } 0150 0151 void OdfExtractor::parseMetaData(const QString &documentElementId, const QByteArray &data, ExtractionResult *result) 0152 { 0153 QDomDocument metaData(QStringLiteral("metaData")); 0154 metaData.setContent(data, true); 0155 0156 // parse metadata ... 0157 QDomElement meta = firstChildElementNS(firstChildElementNS(metaData, 0158 officeNS(), documentElementId), 0159 officeNS(), QStringLiteral("meta")); 0160 0161 QDomNode n = meta.firstChild(); 0162 while (!n.isNull()) { 0163 QDomElement e = n.toElement(); 0164 if (!e.isNull()) { 0165 const QString namespaceURI = e.namespaceURI(); 0166 const QString localName = e.localName(); 0167 0168 // Dublin Core 0169 if (namespaceURI == dcNS()) { 0170 if (localName == QLatin1String("description")) { 0171 result->add(Property::Description, e.text()); 0172 } else if (localName == QLatin1String("subject")) { 0173 result->add(Property::Subject, e.text()); 0174 } else if (localName == QLatin1String("title")) { 0175 result->add(Property::Title, e.text()); 0176 } else if (localName == QLatin1String("creator")) { 0177 result->add(Property::Author, e.text()); 0178 } else if (localName == QLatin1String("language")) { 0179 result->add(Property::Language, e.text()); 0180 } 0181 } 0182 // Meta Properties 0183 else if (namespaceURI == metaNS()) { 0184 if (localName == QLatin1String("document-statistic")) { 0185 bool ok = false; 0186 int pageCount = e.attributeNS(metaNS(), QStringLiteral("page-count")).toInt(&ok); 0187 if (ok) { 0188 result->add(Property::PageCount, pageCount); 0189 } 0190 0191 int wordCount = e.attributeNS(metaNS(), QStringLiteral("word-count")).toInt(&ok); 0192 if (ok) { 0193 result->add(Property::WordCount, wordCount); 0194 } 0195 } else if (localName == QLatin1String("keyword")) { 0196 QString keywords = e.text(); 0197 result->add(Property::Keywords, keywords); 0198 } else if (localName == QLatin1String("generator")) { 0199 result->add(Property::Generator, e.text()); 0200 } else if (localName == QLatin1String("creation-date")) { 0201 QDateTime dt = ExtractorPlugin::dateTimeFromString(e.text()); 0202 if (!dt.isNull()) { 0203 result->add(Property::CreationDate, dt); 0204 } 0205 } 0206 } 0207 } 0208 n = n.nextSibling(); 0209 } 0210 } 0211 0212 void OdfExtractor::extractPlainText(QIODevice *device, ExtractionResult *result) 0213 { 0214 bool inOfficeBody = false; 0215 0216 QXmlStreamReader xml(device); 0217 while (!xml.atEnd()) { 0218 xml.readNext(); 0219 0220 if (xml.isStartElement() && !inOfficeBody && xml.namespaceUri() == officeNS() && xml.name() == bodyTag()) { 0221 inOfficeBody = true; 0222 } else if (xml.isEndElement() && inOfficeBody && xml.namespaceUri() == officeNS() && xml.name() == bodyTag()) { 0223 break; 0224 } 0225 0226 if (inOfficeBody && xml.isCharacters() && !xml.isWhitespace()) { 0227 const QString str = xml.text().toString(); 0228 result->append(str); 0229 } 0230 0231 if (xml.hasError() || xml.isEndDocument()) { 0232 break; 0233 } 0234 } 0235 } 0236 0237 #include "moc_odfextractor.cpp"