File indexing completed on 2024-05-12 15:37:05

0001 /*
0002     SPDX-FileCopyrightText: 2022 Kai Uwe Broulik <kde@broulik.de>
0003 
0004     SPDX-License-Identifier: LGPL-2.1-or-later
0005 */
0006 
0007 #include "fb2extractor.h"
0008 #include "kfilemetadata_debug.h"
0009 
0010 #include <QDateTime>
0011 #include <QFile>
0012 #include <QXmlStreamReader>
0013 
0014 #include <KZip>
0015 
0016 #include <memory>
0017 
0018 using namespace KFileMetaData;
0019 
0020 Fb2Extractor::Fb2Extractor(QObject *parent)
0021     : ExtractorPlugin(parent)
0022 {
0023 }
0024 
0025 namespace
0026 {
0027 static const QString regularMimeType()
0028 {
0029     return QStringLiteral("application/x-fictionbook+xml");
0030 }
0031 
0032 static const QString compressedMimeType()
0033 {
0034     return QStringLiteral("application/x-zip-compressed-fb2");
0035 }
0036 
0037 static const QStringList supportedMimeTypes = {regularMimeType(), compressedMimeType()};
0038 
0039 }
0040 
0041 QStringList Fb2Extractor::mimetypes() const
0042 {
0043     return supportedMimeTypes;
0044 }
0045 
0046 void Fb2Extractor::extract(ExtractionResult *result)
0047 {
0048     std::unique_ptr<QIODevice> device;
0049     std::unique_ptr<KZip> zip;
0050 
0051     if (result->inputMimetype() == regularMimeType()) {
0052         device.reset(new QFile(result->inputUrl()));
0053         if (!device->open(QIODevice::ReadOnly | QIODevice::Text)) {
0054             return;
0055         }
0056 
0057     } else if (result->inputMimetype() == compressedMimeType()) {
0058         zip.reset(new KZip(result->inputUrl()));
0059         if (!zip->open(QIODevice::ReadOnly)) {
0060             return;
0061         }
0062 
0063         const auto entries = zip->directory()->entries();
0064         if (entries.count() != 1) {
0065             return;
0066         }
0067 
0068         const QString entryPath = entries.first();
0069         if (!entryPath.endsWith(QLatin1String(".fb2"))) {
0070             return;
0071         }
0072 
0073         const auto *entry = zip->directory()->file(entryPath);
0074         if (!entry) {
0075             return;
0076         }
0077 
0078         device.reset(entry->createDevice());
0079     }
0080 
0081     result->addType(Type::Document);
0082 
0083     QXmlStreamReader xml(device.get());
0084 
0085     bool inFictionBook = false;
0086     bool inDescription = false;
0087     bool inTitleInfo = false;
0088     bool inAuthor = false;
0089     bool inDocumentInfo = false;
0090     bool inPublishInfo = false;
0091     bool inBody = false;
0092 
0093     QString authorFirstName;
0094     QString authorMiddleName;
0095     QString authorLastName;
0096     QString authorNickName;
0097 
0098     while (!xml.atEnd() && !xml.hasError()) {
0099         xml.readNext();
0100 
0101         if (xml.name() == QLatin1String("FictionBook")) {
0102             if (xml.isStartElement()) {
0103                 inFictionBook = true;
0104             } else if (xml.isEndElement()) {
0105                 break;
0106             }
0107         } else if (xml.name() == QLatin1String("description")) {
0108             if (xml.isStartElement()) {
0109                 inDescription = true;
0110             } else if (xml.isEndElement()) {
0111                 inDescription = false;
0112             }
0113         } else if (xml.name() == QLatin1String("title-info")) {
0114             if (xml.isStartElement()) {
0115                 inTitleInfo = true;
0116             } else if (xml.isEndElement()) {
0117                 inTitleInfo = false;
0118             }
0119         } else if (xml.name() == QLatin1String("document-info")) {
0120             if (xml.isStartElement()) {
0121                 inDocumentInfo = true;
0122             } else if (xml.isEndElement()) {
0123                 inDocumentInfo = false;
0124             }
0125         } else if (xml.name() == QLatin1String("publish-info")) {
0126             if (xml.isStartElement()) {
0127                 inPublishInfo = true;
0128             } else if (xml.isEndElement()) {
0129                 inPublishInfo = false;
0130             }
0131         } else if (xml.name() == QLatin1String("body")) {
0132             if (xml.isStartElement()) {
0133                 inBody = true;
0134             } else if (xml.isEndElement()) {
0135                 inBody = false;
0136             }
0137         }
0138 
0139         if (!inFictionBook) {
0140             continue;
0141         }
0142 
0143         if (inDescription && result->inputFlags() & ExtractionResult::ExtractMetaData) {
0144             if (inTitleInfo) {
0145                 if (xml.isStartElement()) {
0146                     if (xml.name() == QLatin1String("author")) {
0147                         inAuthor = true;
0148                     } else if (inAuthor) {
0149                         if (xml.name() == QLatin1String("first-name")) {
0150                             authorFirstName = xml.readElementText();
0151                         } else if (xml.name() == QLatin1String("middle-name")) {
0152                             authorMiddleName = xml.readElementText();
0153                         } else if (xml.name() == QLatin1String("last-name")) {
0154                             authorLastName = xml.readElementText();
0155                         } else if (xml.name() == QLatin1String("nickname")) {
0156                             authorNickName = xml.readElementText();
0157                         }
0158                     } else if (xml.name() == QLatin1String("book-title")) {
0159                         result->add(Property::Title, xml.readElementText());
0160                     } else if (xml.name() == QLatin1String("annotation")) {
0161                         result->add(Property::Description, xml.readElementText(QXmlStreamReader::IncludeChildElements).trimmed());
0162                     } else if (xml.name() == QLatin1String("lang")) {
0163                         result->add(Property::Language, xml.readElementText());
0164                     } else if (xml.name() == QLatin1String("genre")) {
0165                         result->add(Property::Genre, xml.readElementText());
0166                     }
0167                 } else if (xml.isEndElement()) {
0168                     inAuthor = false;
0169 
0170                     QStringList nameParts = {authorFirstName, authorMiddleName, authorLastName};
0171                     nameParts.removeAll(QString());
0172 
0173                     if (!nameParts.isEmpty()) {
0174                         result->add(Property::Author, nameParts.join(QLatin1Char(' ')));
0175                     } else if (!authorNickName.isEmpty()) {
0176                         result->add(Property::Author, authorNickName);
0177                     }
0178 
0179                     authorFirstName.clear();
0180                     authorMiddleName.clear();
0181                     authorLastName.clear();
0182                     authorNickName.clear();
0183                 }
0184             } else if (inDocumentInfo) {
0185                 if (xml.name() == QLatin1String("date")) {
0186                     // Date can be "not exact" but date "value", if present, is an xs:date
0187                     const auto dateValue = xml.attributes().value(QLatin1String("value"));
0188                     QDateTime dt = QDateTime::fromString(dateValue.toString());
0189 
0190                     if (!dt.isValid()) {
0191                         dt = ExtractorPlugin::dateTimeFromString(xml.readElementText());
0192                     }
0193 
0194                     if (dt.isValid()) {
0195                         result->add(Property::CreationDate, dt);
0196                     }
0197                 } else if (xml.name() == QLatin1String("program-used")) {
0198                     result->add(Property::Generator, xml.readElementText());
0199                     // "Owner of the fb2 document copyrights"
0200                 } else if (xml.name() == QLatin1String("publisher")) {
0201                     result->add(Property::Copyright, xml.readElementText());
0202                 }
0203             } else if (inPublishInfo) {
0204                 if (xml.name() == QLatin1String("publisher")) {
0205                     result->add(Property::Publisher, xml.readElementText());
0206                 } else if (xml.name() == QLatin1String("year")) {
0207                     bool ok;
0208                     const int releaseYear = xml.readElementText().toInt(&ok);
0209                     if (ok) {
0210                         result->add(Property::ReleaseYear, releaseYear);
0211                     }
0212                 }
0213             }
0214         } else if (inBody && result->inputFlags() & ExtractionResult::ExtractPlainText && xml.isCharacters() && !xml.isWhitespace()) {
0215             result->append(xml.text().toString());
0216         }
0217     }
0218 }
0219 
0220 #include "moc_fb2extractor.cpp"