File indexing completed on 2024-04-28 15:35:51

0001 // SPDX-FileCopyrightText: 2018 Martin T. H. Sandsmark <martin.sandsmark@kde.org>
0002 // SPDX-FileCopyrightText: 2021 Carl Schwan <carl@carlschwan.eu>
0003 // SPDX-License-Identifier: BSD-3-Clause
0004 
0005 #include "epubcontainer.h"
0006 
0007 #include <KArchiveDirectory>
0008 #include <KArchiveFile>
0009 
0010 #include <QDebug>
0011 #include <QDir>
0012 #include <QDomDocument>
0013 #include <QImage>
0014 #include <QImageReader>
0015 #include <QScopedPointer>
0016 
// Names of the well-known entries inside an EPUB archive.
// The mimetype marker file, looked up directly in the archive root.
#define MIMETYPE_FILE QStringLiteral("mimetype")
// Folder holding the container-level metadata.
#define METADATA_FOLDER QStringLiteral("META-INF")
// Container description; it lists the rootfile(s) of the publication.
#define CONTAINER_FILE QStringLiteral("META-INF/container.xml")
0020 
0021 EPubContainer::EPubContainer(QObject *parent)
0022     : QObject(parent)
0023     , m_rootFolder(nullptr)
0024 {
0025 }
0026 
0027 EPubContainer::~EPubContainer() = default;
0028 
0029 bool EPubContainer::openFile(const QString &path)
0030 {
0031     m_archive = std::make_unique<KZip>(path);
0032 
0033     if (!m_archive->open(QIODevice::ReadOnly)) {
0034         Q_EMIT errorOccured(tr("Failed to open %1").arg(path));
0035 
0036         return false;
0037     }
0038 
0039     m_rootFolder = m_archive->directory();
0040     if (!m_rootFolder) {
0041         Q_EMIT errorOccured(tr("Failed to read %1").arg(path));
0042         return false;
0043     }
0044 
0045     if (!parseMimetype()) {
0046         return false;
0047     }
0048 
0049     if (!parseContainer()) {
0050         return false;
0051     }
0052 
0053     return true;
0054 }
0055 
0056 QSharedPointer<QIODevice> EPubContainer::getIoDevice(const QString &path)
0057 {
0058     const KArchiveFile *file = getFile(path);
0059     if (!file) {
0060         qWarning() << QStringLiteral("Unable to open file %1").arg(path.left(100));
0061         Q_EMIT errorOccured(tr("Unable to open file %1").arg(path.left(100)));
0062         return QSharedPointer<QIODevice>();
0063     }
0064 
0065     return QSharedPointer<QIODevice>(file->createDevice());
0066 }
0067 
0068 QImage EPubContainer::getImage(const QString &id)
0069 {
0070     if (!m_items.contains(id)) {
0071         qWarning() << "Asked for unknown item" << id << m_items.keys();
0072         return {};
0073     }
0074 
0075     const EpubItem &item = m_items.value(id);
0076 
0077     if (!QImageReader::supportedMimeTypes().contains(item.mimetype)) {
0078         qWarning() << "Asked for unsupported type" << item.mimetype;
0079         return {};
0080     }
0081 
0082     QSharedPointer<QIODevice> ioDevice = getIoDevice(item.path);
0083 
0084     if (!ioDevice) {
0085         return {};
0086     }
0087 
0088     return QImage::fromData(ioDevice->readAll());
0089 }
0090 
0091 QStringList EPubContainer::getMetadata(const QString &key)
0092 {
0093     return m_metadata.value(key);
0094 }
0095 
0096 bool EPubContainer::parseMimetype()
0097 {
0098     Q_ASSERT(m_rootFolder);
0099 
0100     const KArchiveFile *mimetypeFile = m_rootFolder->file(MIMETYPE_FILE);
0101 
0102     if (!mimetypeFile) {
0103         Q_EMIT errorOccured(tr("Unable to find mimetype in file"));
0104         return false;
0105     }
0106 
0107     QScopedPointer<QIODevice> ioDevice(mimetypeFile->createDevice());
0108     QByteArray mimetype = ioDevice->readAll();
0109     if (mimetype != "application/epub+zip") {
0110         qWarning() << "Unexpected mimetype" << mimetype;
0111     }
0112 
0113     return true;
0114 }
0115 
0116 bool EPubContainer::parseContainer()
0117 {
0118     Q_ASSERT(m_rootFolder);
0119 
0120     const KArchiveFile *containerFile = getFile(CONTAINER_FILE);
0121     if (!containerFile) {
0122         qWarning() << "no container file";
0123         Q_EMIT errorOccured(tr("Unable to find container information"));
0124         return false;
0125     }
0126 
0127     QScopedPointer<QIODevice> ioDevice(containerFile->createDevice());
0128     Q_ASSERT(ioDevice);
0129 
0130     // The only thing we need from this file is the path to the root file
0131     QDomDocument document;
0132     document.setContent(ioDevice.data());
0133     QDomNodeList rootNodes = document.elementsByTagName(QStringLiteral("rootfile"));
0134     for (int i = 0; i < rootNodes.count(); i++) {
0135         QDomElement rootElement = rootNodes.at(i).toElement();
0136         QString rootfilePath = rootElement.attribute(QStringLiteral("full-path"));
0137         if (rootfilePath.isEmpty()) {
0138             qWarning() << "Invalid root file entry";
0139             continue;
0140         }
0141         if (parseContentFile(rootfilePath)) {
0142             return true;
0143         }
0144     }
0145 
0146     // Limitations:
0147     //  - We only read one rootfile
0148     //  - We don't read the following from META-INF/
0149     //     - manifest.xml (unknown contents, just reserved)
0150     //     - metadata.xml (unused according to spec, just reserved)
0151     //     - rights.xml (reserved for DRM, not standardized)
0152     //     - signatures.xml (signatures for files, standardized)
0153 
0154     Q_EMIT errorOccured(tr("Unable to find and use any content files"));
0155     return false;
0156 }
0157 
0158 bool EPubContainer::parseContentFile(const QString &filepath)
0159 {
0160     const KArchiveFile *rootFile = getFile(filepath);
0161     if (!rootFile) {
0162         Q_EMIT errorOccured(tr("Malformed metadata, unable to get content metadata path"));
0163         return false;
0164     }
0165     QScopedPointer<QIODevice> ioDevice(rootFile->createDevice());
0166     QDomDocument document;
0167     document.setContent(ioDevice.data(), true); // turn on namespace processing
0168 
0169     QDomNodeList metadataNodeList = document.elementsByTagName(QStringLiteral("metadata"));
0170     for (int i = 0; i < metadataNodeList.count(); i++) {
0171         QDomNodeList metadataChildList = metadataNodeList.at(i).childNodes();
0172         for (int j = 0; j < metadataChildList.count(); j++) {
0173             parseMetadataItem(metadataChildList.at(j), metadataChildList);
0174         }
0175     }
0176 
0177     // Extract current path, for resolving relative paths
0178     QString contentFileFolder;
0179     int separatorIndex = filepath.lastIndexOf(QLatin1Char('/'));
0180     if (separatorIndex > 0) {
0181         contentFileFolder = filepath.left(separatorIndex + 1);
0182     }
0183 
0184     // Parse out all the components/items in the epub
0185     QDomNodeList manifestNodeList = document.elementsByTagName(QStringLiteral("manifest"));
0186     for (int i = 0; i < manifestNodeList.count(); i++) {
0187         QDomElement manifestElement = manifestNodeList.at(i).toElement();
0188         QDomNodeList manifestItemList = manifestElement.elementsByTagName(QStringLiteral("item"));
0189 
0190         for (int j = 0; j < manifestItemList.count(); j++) {
0191             parseManifestItem(manifestItemList.at(j), contentFileFolder);
0192         }
0193     }
0194 
0195     // Parse out the document order
0196     QDomNodeList spineNodeList = document.elementsByTagName(QStringLiteral("spine"));
0197     for (int i = 0; i < spineNodeList.count(); i++) {
0198         QDomElement spineElement = spineNodeList.at(i).toElement();
0199 
0200         QString tocId = spineElement.attribute(QStringLiteral("toc"));
0201         if (!tocId.isEmpty() && m_items.contains(tocId)) {
0202             EpubPageReference tocReference;
0203             tocReference.title = tr("Table of Contents");
0204             tocReference.target = tocId;
0205             m_standardReferences.insert(EpubPageReference::TableOfContents, tocReference);
0206         }
0207 
0208         QDomNodeList spineItemList = spineElement.elementsByTagName(QStringLiteral("itemref"));
0209         for (int j = 0; j < spineItemList.count(); j++) {
0210             parseSpineItem(spineItemList.at(j));
0211         }
0212     }
0213 
0214     // Parse out standard items
0215     QDomNodeList guideNodeList = document.elementsByTagName(QStringLiteral("guide"));
0216     for (int i = 0; i < guideNodeList.count(); i++) {
0217         QDomElement guideElement = guideNodeList.at(i).toElement();
0218 
0219         QDomNodeList guideItemList = guideElement.elementsByTagName(QStringLiteral("reference"));
0220         for (int j = 0; j < guideItemList.count(); j++) {
0221             parseGuideItem(guideItemList.at(j));
0222         }
0223     }
0224 
0225     return true;
0226 }
0227 
0228 bool EPubContainer::parseMetadataPropertyItem(const QDomElement &metadataElement, const QDomNodeList &nodeList)
0229 {
0230     if (metadataElement.attribute(QStringLiteral("property")) == QStringLiteral("belongs-to-collection")) {
0231         const QString id = QStringLiteral("#") + metadataElement.attribute(QStringLiteral("id"));
0232         const QString name = metadataElement.text();
0233         Collection::Type type = Collection::Type::Unknow;
0234         size_t position = 0;
0235 
0236         if (id.length() == 1) {
0237             m_collections.append(Collection{name, type, position});
0238             return true;
0239         }
0240 
0241         for (int i = 0; i < nodeList.size(); i++) {
0242             const auto node = nodeList.at(i);
0243             const auto element = node.toElement();
0244             if (element.tagName() != QStringLiteral("meta")) {
0245                 continue;
0246             }
0247 
0248             if (element.attribute(QStringLiteral("refines")) != id) {
0249                 continue;
0250             }
0251 
0252             if (element.attribute(QStringLiteral("property")) == QStringLiteral("collection-type")) {
0253                 const auto typeString = element.text();
0254                 if (typeString == QStringLiteral("set")) {
0255                     type = Collection::Type::Set;
0256                 } else if (typeString == QStringLiteral("series")) {
0257                     type = Collection::Type::Series;
0258                 }
0259                 continue;
0260             }
0261 
0262             if (element.attribute(QStringLiteral("property")) == QStringLiteral("group-position")) {
0263                 position = element.text().toInt();
0264                 continue;
0265             }
0266         }
0267 
0268         m_collections.append(Collection{name, type, position});
0269         return true;
0270     }
0271 
0272     return false;
0273 }
0274 
0275 bool EPubContainer::parseMetadataItem(const QDomNode &metadataNode, const QDomNodeList &nodeList)
0276 {
0277     QDomElement metadataElement = metadataNode.toElement();
0278     QString tagName = metadataElement.tagName();
0279 
0280     QString metaName;
0281     QString metaValue;
0282 
0283     if (tagName == QStringLiteral("meta")) {
0284         bool foundProperty = parseMetadataPropertyItem(metadataElement, nodeList);
0285         if (foundProperty) {
0286             return true;
0287         }
0288         metaName = metadataElement.attribute(QStringLiteral("name"));
0289         metaValue = metadataElement.attribute(QStringLiteral("content"));
0290     } else if (metadataElement.prefix() != QStringLiteral("dc")) {
0291         qWarning() << "Unsupported metadata tag" << tagName;
0292         return false;
0293     } else if (tagName == QStringLiteral("date")) {
0294         metaName = metadataElement.attribute(QStringLiteral("event"));
0295         metaValue = metadataElement.text();
0296     } else {
0297         metaName = tagName;
0298         metaValue = metadataElement.text();
0299     }
0300 
0301     if (metaName.isEmpty() || metaValue.isEmpty()) {
0302         return false;
0303     }
0304     if (!m_metadata.contains(metaName)) {
0305         m_metadata[metaName] = QStringList{};
0306     }
0307 
0308     if (metaName != QStringLiteral("subject")) {
0309         m_metadata[metaName].append(metaValue);
0310         return true;
0311     }
0312 
0313     if (metaValue.contains(QStringLiteral("--"))) {
0314         const auto metaValues = metaValue.split(QStringLiteral("--"));
0315         if (metaValues.count() <= 1) {
0316             return false;
0317         }
0318 
0319         metaValue = metaValues[metaValues.count() - 1].trimmed();
0320     }
0321 
0322     if (!m_metadata[metaName].contains(metaValue)) {
0323         m_metadata[metaName].append(metaValue);
0324         return true;
0325     }
0326 
0327     return false;
0328 }
0329 
0330 bool EPubContainer::parseManifestItem(const QDomNode &manifestNode, const QString &currentFolder)
0331 {
0332     QDomElement manifestElement = manifestNode.toElement();
0333     QString id = manifestElement.attribute(QStringLiteral("id"));
0334     QString path = manifestElement.attribute(QStringLiteral("href"));
0335     QString type = manifestElement.attribute(QStringLiteral("media-type"));
0336 
0337     if (id.isEmpty() || path.isEmpty()) {
0338         qWarning() << "Invalid item at line" << manifestElement.lineNumber();
0339         return false;
0340     }
0341 
0342     // Resolve relative paths
0343     path = QDir::cleanPath(currentFolder + path);
0344 
0345     EpubItem item;
0346     item.mimetype = type.toUtf8();
0347     item.path = path;
0348     m_items[id] = item;
0349 
0350     static QSet<QString> documentTypes(
0351         {QStringLiteral("text/x-oeb1-document"), QStringLiteral("application/x-dtbook+xml"), QStringLiteral("application/xhtml+xml")});
0352     // All items not listed in the spine should be in this
0353     if (documentTypes.contains(type)) {
0354         m_unorderedItems.insert(id);
0355     }
0356 
0357     return true;
0358 }
0359 
0360 bool EPubContainer::parseSpineItem(const QDomNode &spineNode)
0361 {
0362     QDomElement spineElement = spineNode.toElement();
0363 
0364     // Ignore this for now
0365     if (spineElement.attribute(QStringLiteral("linear")) == QStringLiteral("no")) {
0366         //        return true;
0367     }
0368 
0369     QString referenceName = spineElement.attribute(QStringLiteral("idref"));
0370     if (referenceName.isEmpty()) {
0371         qWarning() << "Invalid spine item at line" << spineNode.lineNumber();
0372         return false;
0373     }
0374 
0375     if (!m_items.contains(referenceName)) {
0376         qWarning() << "Unable to find" << referenceName << "in items";
0377         return false;
0378     }
0379 
0380     m_unorderedItems.remove(referenceName);
0381     m_orderedItems.append(referenceName);
0382 
0383     return true;
0384 }
0385 
0386 bool EPubContainer::parseGuideItem(const QDomNode &guideItem)
0387 {
0388     QDomElement guideElement = guideItem.toElement();
0389     QString target = guideElement.attribute(QStringLiteral("href"));
0390     QString title = guideElement.attribute(QStringLiteral("title"));
0391     QString type = guideElement.attribute(QStringLiteral("type"));
0392 
0393     if (target.isEmpty() || title.isEmpty() || type.isEmpty()) {
0394         qWarning() << "Invalid guide item" << target << title << type;
0395         return false;
0396     }
0397 
0398     EpubPageReference reference;
0399     reference.target = target;
0400     reference.title = title;
0401 
0402     EpubPageReference::StandardType standardType = EpubPageReference::typeFromString(type);
0403     if (standardType == EpubPageReference::Other) {
0404         m_otherReferences[type] = reference;
0405     } else {
0406         m_standardReferences[standardType] = reference;
0407     }
0408 
0409     return true;
0410 }
0411 
0412 const KArchiveFile *EPubContainer::getFile(const QString &path)
0413 {
0414     if (path.isEmpty()) {
0415         return nullptr;
0416     }
0417 
0418     const KArchiveDirectory *folder = m_rootFolder;
0419 
0420     // Try to walk down the correct path
0421     QStringList pathParts = path.split(QLatin1Char('/'), Qt::SkipEmptyParts);
0422     for (int i = 0; i < pathParts.count() - 1; i++) {
0423         QString folderName = pathParts[i];
0424         const KArchiveEntry *entry = folder->entry(folderName);
0425         if (!entry) {
0426             qWarning() << "Unable to find folder name" << folderName << "in" << path.left(100);
0427             const QStringList entries = folder->entries();
0428             for (const QString &folderEntry : entries) {
0429                 if (folderEntry.compare(folderName, Qt::CaseInsensitive) == 0) {
0430                     entry = folder->entry(folderEntry);
0431                     break;
0432                 }
0433             }
0434 
0435             if (!entry) {
0436                 qWarning() << "Didn't even find with case-insensitive matching";
0437                 return nullptr;
0438             }
0439         }
0440 
0441         if (!entry->isDirectory()) {
0442             qWarning() << "Expected" << folderName << "to be a directory in path" << path;
0443             return nullptr;
0444         }
0445 
0446         folder = dynamic_cast<const KArchiveDirectory *>(entry);
0447         Q_ASSERT(folder);
0448     }
0449 
0450     QString filename;
0451     if (pathParts.isEmpty()) {
0452         filename = path;
0453     } else {
0454         filename = pathParts.last();
0455     }
0456 
0457     const KArchiveFile *file = folder->file(filename);
0458     if (!file) {
0459         qWarning() << "Unable to find file" << filename << "in" << folder->name();
0460 
0461         const QStringList entries = folder->entries();
0462         for (const QString &folderEntry : entries) {
0463             if (folderEntry.compare(filename, Qt::CaseInsensitive) == 0) {
0464                 file = folder->file(folderEntry);
0465                 break;
0466             }
0467         }
0468 
0469         if (!file) {
0470             qWarning() << "Unable to find file" << filename << "in" << folder->name() << "with case-insensitive matching" << entries;
0471         }
0472     }
0473     return file;
0474 }
0475 
0476 EpubPageReference::StandardType EpubPageReference::typeFromString(const QString &name)
0477 {
0478     if (name == QStringLiteral("cover")) {
0479         return CoverPage;
0480     } else if (name == QStringLiteral("title-page")) {
0481         return TitlePage;
0482     } else if (name == QStringLiteral("toc")) {
0483         return TableOfContents;
0484     } else if (name == QStringLiteral("index")) {
0485         return Index;
0486     } else if (name == QStringLiteral("glossary")) {
0487         return Glossary;
0488     } else if (name == QStringLiteral("acknowledgements")) {
0489         return Acknowledgements;
0490     } else if (name == QStringLiteral("bibliography")) {
0491         return Bibliography;
0492     } else if (name == QStringLiteral("colophon")) {
0493         return Colophon;
0494     } else if (name == QStringLiteral("copyright-page")) {
0495         return CopyrightPage;
0496     } else if (name == QStringLiteral("dedication")) {
0497         return Dedication;
0498     } else if (name == QStringLiteral("epigraph")) {
0499         return Epigraph;
0500     } else if (name == QStringLiteral("foreword")) {
0501         return Foreword;
0502     } else if (name == QStringLiteral("loi")) {
0503         return ListOfIllustrations;
0504     } else if (name == QStringLiteral("lot")) {
0505         return ListOfTables;
0506     } else if (name == QStringLiteral("notes")) {
0507         return Notes;
0508     } else if (name == QStringLiteral("preface")) {
0509         return Preface;
0510     } else if (name == QStringLiteral("text")) {
0511         return Text;
0512     } else {
0513         return Other;
0514     }
0515 }
0516 
0517 QList<Collection> EPubContainer::collections() const
0518 {
0519     return m_collections;
0520 }
0521 
0522 #include "moc_epubcontainer.cpp"