File indexing completed on 2024-04-28 15:35:51
0001 // SPDX-FileCopyrightText: 2018 Martin T. H. Sandsmark <martin.sandsmark@kde.org> 0002 // SPDX-FileCopyrightText: 2021 Carl Schwan <carl@carlschwan.eu> 0003 // SPDX-License-Identifier: BSD-3-Clause 0004 0005 #include "epubcontainer.h" 0006 0007 #include <KArchiveDirectory> 0008 #include <KArchiveFile> 0009 0010 #include <QDebug> 0011 #include <QDir> 0012 #include <QDomDocument> 0013 #include <QImage> 0014 #include <QImageReader> 0015 #include <QScopedPointer> 0016 0017 #define METADATA_FOLDER QStringLiteral("META-INF") 0018 #define MIMETYPE_FILE QStringLiteral("mimetype") 0019 #define CONTAINER_FILE QStringLiteral("META-INF/container.xml") 0020 0021 EPubContainer::EPubContainer(QObject *parent) 0022 : QObject(parent) 0023 , m_rootFolder(nullptr) 0024 { 0025 } 0026 0027 EPubContainer::~EPubContainer() = default; 0028 0029 bool EPubContainer::openFile(const QString &path) 0030 { 0031 m_archive = std::make_unique<KZip>(path); 0032 0033 if (!m_archive->open(QIODevice::ReadOnly)) { 0034 Q_EMIT errorOccured(tr("Failed to open %1").arg(path)); 0035 0036 return false; 0037 } 0038 0039 m_rootFolder = m_archive->directory(); 0040 if (!m_rootFolder) { 0041 Q_EMIT errorOccured(tr("Failed to read %1").arg(path)); 0042 return false; 0043 } 0044 0045 if (!parseMimetype()) { 0046 return false; 0047 } 0048 0049 if (!parseContainer()) { 0050 return false; 0051 } 0052 0053 return true; 0054 } 0055 0056 QSharedPointer<QIODevice> EPubContainer::getIoDevice(const QString &path) 0057 { 0058 const KArchiveFile *file = getFile(path); 0059 if (!file) { 0060 qWarning() << QStringLiteral("Unable to open file %1").arg(path.left(100)); 0061 Q_EMIT errorOccured(tr("Unable to open file %1").arg(path.left(100))); 0062 return QSharedPointer<QIODevice>(); 0063 } 0064 0065 return QSharedPointer<QIODevice>(file->createDevice()); 0066 } 0067 0068 QImage EPubContainer::getImage(const QString &id) 0069 { 0070 if (!m_items.contains(id)) { 0071 qWarning() << "Asked for unknown item" << id << m_items.keys(); 0072 return {}; 0073 } 0074 0075 const EpubItem &item = m_items.value(id); 0076 0077 if (!QImageReader::supportedMimeTypes().contains(item.mimetype)) { 0078 qWarning() << "Asked for unsupported type" << item.mimetype; 0079 return {}; 0080 } 0081 0082 QSharedPointer<QIODevice> ioDevice = getIoDevice(item.path); 0083 0084 if (!ioDevice) { 0085 return {}; 0086 } 0087 0088 return QImage::fromData(ioDevice->readAll()); 0089 } 0090 0091 QStringList EPubContainer::getMetadata(const QString &key) 0092 { 0093 return m_metadata.value(key); 0094 } 0095 0096 bool EPubContainer::parseMimetype() 0097 { 0098 Q_ASSERT(m_rootFolder); 0099 0100 const KArchiveFile *mimetypeFile = m_rootFolder->file(MIMETYPE_FILE); 0101 0102 if (!mimetypeFile) { 0103 Q_EMIT errorOccured(tr("Unable to find mimetype in file")); 0104 return false; 0105 } 0106 0107 QScopedPointer<QIODevice> ioDevice(mimetypeFile->createDevice()); 0108 QByteArray mimetype = ioDevice->readAll(); 0109 if (mimetype != "application/epub+zip") { 0110 qWarning() << "Unexpected mimetype" << mimetype; 0111 } 0112 0113 return true; 0114 } 0115 0116 bool EPubContainer::parseContainer() 0117 { 0118 Q_ASSERT(m_rootFolder); 0119 0120 const KArchiveFile *containerFile = getFile(CONTAINER_FILE); 0121 if (!containerFile) { 0122 qWarning() << "no container file"; 0123 Q_EMIT errorOccured(tr("Unable to find container information")); 0124 return false; 0125 } 0126 0127 QScopedPointer<QIODevice> ioDevice(containerFile->createDevice()); 0128 Q_ASSERT(ioDevice); 0129 0130 // The only thing we need from this file is the path to the root file 0131 QDomDocument document; 0132 document.setContent(ioDevice.data()); 0133 QDomNodeList rootNodes = document.elementsByTagName(QStringLiteral("rootfile")); 0134 for (int i = 0; i < rootNodes.count(); i++) { 0135 QDomElement rootElement = rootNodes.at(i).toElement(); 0136 QString rootfilePath = rootElement.attribute(QStringLiteral("full-path")); 0137 if (rootfilePath.isEmpty()) { 0138 qWarning() << "Invalid root file entry"; 0139 continue; 0140 } 0141 if (parseContentFile(rootfilePath)) { 0142 return true; 0143 } 0144 } 0145 0146 // Limitations: 0147 // - We only read one rootfile 0148 // - We don't read the following from META-INF/ 0149 // - manifest.xml (unknown contents, just reserved) 0150 // - metadata.xml (unused according to spec, just reserved) 0151 // - rights.xml (reserved for DRM, not standardized) 0152 // - signatures.xml (signatures for files, standardized) 0153 0154 Q_EMIT errorOccured(tr("Unable to find and use any content files")); 0155 return false; 0156 } 0157 0158 bool EPubContainer::parseContentFile(const QString &filepath) 0159 { 0160 const KArchiveFile *rootFile = getFile(filepath); 0161 if (!rootFile) { 0162 Q_EMIT errorOccured(tr("Malformed metadata, unable to get content metadata path")); 0163 return false; 0164 } 0165 QScopedPointer<QIODevice> ioDevice(rootFile->createDevice()); 0166 QDomDocument document; 0167 document.setContent(ioDevice.data(), true); // turn on namespace processing 0168 0169 QDomNodeList metadataNodeList = document.elementsByTagName(QStringLiteral("metadata")); 0170 for (int i = 0; i < metadataNodeList.count(); i++) { 0171 QDomNodeList metadataChildList = metadataNodeList.at(i).childNodes(); 0172 for (int j = 0; j < metadataChildList.count(); j++) { 0173 parseMetadataItem(metadataChildList.at(j), metadataChildList); 0174 } 0175 } 0176 0177 // Extract current path, for resolving relative paths 0178 QString contentFileFolder; 0179 int separatorIndex = filepath.lastIndexOf(QLatin1Char('/')); 0180 if (separatorIndex > 0) { 0181 contentFileFolder = filepath.left(separatorIndex + 1); 0182 } 0183 0184 // Parse out all the components/items in the epub 0185 QDomNodeList manifestNodeList = document.elementsByTagName(QStringLiteral("manifest")); 0186 for (int i = 0; i < manifestNodeList.count(); i++) { 0187 QDomElement manifestElement = manifestNodeList.at(i).toElement(); 0188 QDomNodeList manifestItemList = manifestElement.elementsByTagName(QStringLiteral("item")); 0189 0190 for (int j = 0; j < manifestItemList.count(); j++) { 0191 parseManifestItem(manifestItemList.at(j), contentFileFolder); 0192 } 0193 } 0194 0195 // Parse out the document order 0196 QDomNodeList spineNodeList = document.elementsByTagName(QStringLiteral("spine")); 0197 for (int i = 0; i < spineNodeList.count(); i++) { 0198 QDomElement spineElement = spineNodeList.at(i).toElement(); 0199 0200 QString tocId = spineElement.attribute(QStringLiteral("toc")); 0201 if (!tocId.isEmpty() && m_items.contains(tocId)) { 0202 EpubPageReference tocReference; 0203 tocReference.title = tr("Table of Contents"); 0204 tocReference.target = tocId; 0205 m_standardReferences.insert(EpubPageReference::TableOfContents, tocReference); 0206 } 0207 0208 QDomNodeList spineItemList = spineElement.elementsByTagName(QStringLiteral("itemref")); 0209 for (int j = 0; j < spineItemList.count(); j++) { 0210 parseSpineItem(spineItemList.at(j)); 0211 } 0212 } 0213 0214 // Parse out standard items 0215 QDomNodeList guideNodeList = document.elementsByTagName(QStringLiteral("guide")); 0216 for (int i = 0; i < guideNodeList.count(); i++) { 0217 QDomElement guideElement = guideNodeList.at(i).toElement(); 0218 0219 QDomNodeList guideItemList = guideElement.elementsByTagName(QStringLiteral("reference")); 0220 for (int j = 0; j < guideItemList.count(); j++) { 0221 parseGuideItem(guideItemList.at(j)); 0222 } 0223 } 0224 0225 return true; 0226 } 0227 0228 bool EPubContainer::parseMetadataPropertyItem(const QDomElement &metadataElement, const QDomNodeList &nodeList) 0229 { 0230 if (metadataElement.attribute(QStringLiteral("property")) == QStringLiteral("belongs-to-collection")) { 0231 const QString id = QStringLiteral("#") + metadataElement.attribute(QStringLiteral("id")); 0232 const QString name = metadataElement.text(); 0233 Collection::Type type = Collection::Type::Unknow; 0234 size_t position = 0; 0235 0236 if (id.length() == 1) { 0237 m_collections.append(Collection{name, type, position}); 0238 return true; 0239 } 0240 0241 for (int i = 0; i < nodeList.size(); i++) { 0242 const auto node = nodeList.at(i); 0243 const auto element = node.toElement(); 0244 if (element.tagName() != QStringLiteral("meta")) { 0245 continue; 0246 } 0247 0248 if (element.attribute(QStringLiteral("refines")) != id) { 0249 continue; 0250 } 0251 0252 if (element.attribute(QStringLiteral("property")) == QStringLiteral("collection-type")) { 0253 const auto typeString = element.text(); 0254 if (typeString == QStringLiteral("set")) { 0255 type = Collection::Type::Set; 0256 } else if (typeString == QStringLiteral("series")) { 0257 type = Collection::Type::Series; 0258 } 0259 continue; 0260 } 0261 0262 if (element.attribute(QStringLiteral("property")) == QStringLiteral("group-position")) { 0263 position = element.text().toInt(); 0264 continue; 0265 } 0266 } 0267 0268 m_collections.append(Collection{name, type, position}); 0269 return true; 0270 } 0271 0272 return false; 0273 } 0274 0275 bool EPubContainer::parseMetadataItem(const QDomNode &metadataNode, const QDomNodeList &nodeList) 0276 { 0277 QDomElement metadataElement = metadataNode.toElement(); 0278 QString tagName = metadataElement.tagName(); 0279 0280 QString metaName; 0281 QString metaValue; 0282 0283 if (tagName == QStringLiteral("meta")) { 0284 bool foundProperty = parseMetadataPropertyItem(metadataElement, nodeList); 0285 if (foundProperty) { 0286 return true; 0287 } 0288 metaName = metadataElement.attribute(QStringLiteral("name")); 0289 metaValue = metadataElement.attribute(QStringLiteral("content")); 0290 } else if (metadataElement.prefix() != QStringLiteral("dc")) { 0291 qWarning() << "Unsupported metadata tag" << tagName; 0292 return false; 0293 } else if (tagName == QStringLiteral("date")) { 0294 metaName = metadataElement.attribute(QStringLiteral("event")); 0295 metaValue = metadataElement.text(); 0296 } else { 0297 metaName = tagName; 0298 metaValue = metadataElement.text(); 0299 } 0300 0301 if (metaName.isEmpty() || metaValue.isEmpty()) { 0302 return false; 0303 } 0304 if (!m_metadata.contains(metaName)) { 0305 m_metadata[metaName] = QStringList{}; 0306 } 0307 0308 if (metaName != QStringLiteral("subject")) { 0309 m_metadata[metaName].append(metaValue); 0310 return true; 0311 } 0312 0313 if (metaValue.contains(QStringLiteral("--"))) { 0314 const auto metaValues = metaValue.split(QStringLiteral("--")); 0315 if (metaValues.count() <= 1) { 0316 return false; 0317 } 0318 0319 metaValue = metaValues[metaValues.count() - 1].trimmed(); 0320 } 0321 0322 if (!m_metadata[metaName].contains(metaValue)) { 0323 m_metadata[metaName].append(metaValue); 0324 return true; 0325 } 0326 0327 return false; 0328 } 0329 0330 bool EPubContainer::parseManifestItem(const QDomNode &manifestNode, const QString ¤tFolder) 0331 { 0332 QDomElement manifestElement = manifestNode.toElement(); 0333 QString id = manifestElement.attribute(QStringLiteral("id")); 0334 QString path = manifestElement.attribute(QStringLiteral("href")); 0335 QString type = manifestElement.attribute(QStringLiteral("media-type")); 0336 0337 if (id.isEmpty() || path.isEmpty()) { 0338 qWarning() << "Invalid item at line" << manifestElement.lineNumber(); 0339 return false; 0340 } 0341 0342 // Resolve relative paths 0343 path = QDir::cleanPath(currentFolder + path); 0344 0345 EpubItem item; 0346 item.mimetype = type.toUtf8(); 0347 item.path = path; 0348 m_items[id] = item; 0349 0350 static QSet<QString> documentTypes( 0351 {QStringLiteral("text/x-oeb1-document"), QStringLiteral("application/x-dtbook+xml"), QStringLiteral("application/xhtml+xml")}); 0352 // All items not listed in the spine should be in this 0353 if (documentTypes.contains(type)) { 0354 m_unorderedItems.insert(id); 0355 } 0356 0357 return true; 0358 } 0359 0360 bool EPubContainer::parseSpineItem(const QDomNode &spineNode) 0361 { 0362 QDomElement spineElement = spineNode.toElement(); 0363 0364 // Ignore this for now 0365 if (spineElement.attribute(QStringLiteral("linear")) == QStringLiteral("no")) { 0366 // return true; 0367 } 0368 0369 QString referenceName = spineElement.attribute(QStringLiteral("idref")); 0370 if (referenceName.isEmpty()) { 0371 qWarning() << "Invalid spine item at line" << spineNode.lineNumber(); 0372 return false; 0373 } 0374 0375 if (!m_items.contains(referenceName)) { 0376 qWarning() << "Unable to find" << referenceName << "in items"; 0377 return false; 0378 } 0379 0380 m_unorderedItems.remove(referenceName); 0381 m_orderedItems.append(referenceName); 0382 0383 return true; 0384 } 0385 0386 bool EPubContainer::parseGuideItem(const QDomNode &guideItem) 0387 { 0388 QDomElement guideElement = guideItem.toElement(); 0389 QString target = guideElement.attribute(QStringLiteral("href")); 0390 QString title = guideElement.attribute(QStringLiteral("title")); 0391 QString type = guideElement.attribute(QStringLiteral("type")); 0392 0393 if (target.isEmpty() || title.isEmpty() || type.isEmpty()) { 0394 qWarning() << "Invalid guide item" << target << title << type; 0395 return false; 0396 } 0397 0398 EpubPageReference reference; 0399 reference.target = target; 0400 reference.title = title; 0401 0402 EpubPageReference::StandardType standardType = EpubPageReference::typeFromString(type); 0403 if (standardType == EpubPageReference::Other) { 0404 m_otherReferences[type] = reference; 0405 } else { 0406 m_standardReferences[standardType] = reference; 0407 } 0408 0409 return true; 0410 } 0411 0412 const KArchiveFile *EPubContainer::getFile(const QString &path) 0413 { 0414 if (path.isEmpty()) { 0415 return nullptr; 0416 } 0417 0418 const KArchiveDirectory *folder = m_rootFolder; 0419 0420 // Try to walk down the correct path 0421 QStringList pathParts = path.split(QLatin1Char('/'), Qt::SkipEmptyParts); 0422 for (int i = 0; i < pathParts.count() - 1; i++) { 0423 QString folderName = pathParts[i]; 0424 const KArchiveEntry *entry = folder->entry(folderName); 0425 if (!entry) { 0426 qWarning() << "Unable to find folder name" << folderName << "in" << path.left(100); 0427 const QStringList entries = folder->entries(); 0428 for (const QString &folderEntry : entries) { 0429 if (folderEntry.compare(folderName, Qt::CaseInsensitive) == 0) { 0430 entry = folder->entry(folderEntry); 0431 break; 0432 } 0433 } 0434 0435 if (!entry) { 0436 qWarning() << "Didn't even find with case-insensitive matching"; 0437 return nullptr; 0438 } 0439 } 0440 0441 if (!entry->isDirectory()) { 0442 qWarning() << "Expected" << folderName << "to be a directory in path" << path; 0443 return nullptr; 0444 } 0445 0446 folder = dynamic_cast<const KArchiveDirectory *>(entry); 0447 Q_ASSERT(folder); 0448 } 0449 0450 QString filename; 0451 if (pathParts.isEmpty()) { 0452 filename = path; 0453 } else { 0454 filename = pathParts.last(); 0455 } 0456 0457 const KArchiveFile *file = folder->file(filename); 0458 if (!file) { 0459 qWarning() << "Unable to find file" << filename << "in" << folder->name(); 0460 0461 const QStringList entries = folder->entries(); 0462 for (const QString &folderEntry : entries) { 0463 if (folderEntry.compare(filename, Qt::CaseInsensitive) == 0) { 0464 file = folder->file(folderEntry); 0465 break; 0466 } 0467 } 0468 0469 if (!file) { 0470 qWarning() << "Unable to find file" << filename << "in" << folder->name() << "with case-insensitive matching" << entries; 0471 } 0472 } 0473 return file; 0474 } 0475 0476 EpubPageReference::StandardType EpubPageReference::typeFromString(const QString &name) 0477 { 0478 if (name == QStringLiteral("cover")) { 0479 return CoverPage; 0480 } else if (name == QStringLiteral("title-page")) { 0481 return TitlePage; 0482 } else if (name == QStringLiteral("toc")) { 0483 return TableOfContents; 0484 } else if (name == QStringLiteral("index")) { 0485 return Index; 0486 } else if (name == QStringLiteral("glossary")) { 0487 return Glossary; 0488 } else if (name == QStringLiteral("acknowledgements")) { 0489 return Acknowledgements; 0490 } else if (name == QStringLiteral("bibliography")) { 0491 return Bibliography; 0492 } else if (name == QStringLiteral("colophon")) { 0493 return Colophon; 0494 } else if (name == QStringLiteral("copyright-page")) { 0495 return CopyrightPage; 0496 } else if (name == QStringLiteral("dedication")) { 0497 return Dedication; 0498 } else if (name == QStringLiteral("epigraph")) { 0499 return Epigraph; 0500 } else if (name == QStringLiteral("foreword")) { 0501 return Foreword; 0502 } else if (name == QStringLiteral("loi")) { 0503 return ListOfIllustrations; 0504 } else if (name == QStringLiteral("lot")) { 0505 return ListOfTables; 0506 } else if (name == QStringLiteral("notes")) { 0507 return Notes; 0508 } else if (name == QStringLiteral("preface")) { 0509 return Preface; 0510 } else if (name == QStringLiteral("text")) { 0511 return Text; 0512 } else { 0513 return Other; 0514 } 0515 } 0516 0517 QList<Collection> EPubContainer::collections() const 0518 { 0519 return m_collections; 0520 } 0521 0522 #include "moc_epubcontainer.cpp"