File indexing completed on 2024-05-12 15:37:04

0001 /*
0002     SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in>
0003     SPDX-FileCopyrightText: 2016 Christoph Cullmann <cullmann@kde.org>
0004 
0005     SPDX-License-Identifier: LGPL-2.1-or-later
0006 */
0007 
0008 
0009 #include "epubextractor.h"
0010 #include "kfilemetadata_debug.h"
0011 
0012 #include <epub.h>
0013 
0014 #include <QDateTime>
0015 #include <QRegularExpression>
0016 
0017 using namespace KFileMetaData;
0018 
0019 EPubExtractor::EPubExtractor(QObject* parent)
0020     : ExtractorPlugin(parent)
0021 {
0022 
0023 }
0024 
0025 namespace
0026 {
0027 static const QStringList supportedMimeTypes = {
0028     QStringLiteral("application/epub+zip"),
0029 };
0030 
0031 const QStringList fetchMetadata(struct epub* e, const epub_metadata& type)
0032 {
0033     int size = 0;
0034     unsigned char** data = epub_get_metadata(e, type, &size);
0035     if (data) {
0036         QStringList strList;
0037         strList.reserve(size);
0038         for (int i = 0; i < size; i++) {
0039             // skip nullptr entries, can happen for broken xml files
0040             // also skip empty entries
0041             if (!data[i] || !data[i][0]) {
0042                 continue;
0043             }
0044 
0045             strList << QString::fromUtf8((char*)data[i]);
0046             free(data[i]);
0047         }
0048         free(data);
0049 
0050         return strList;
0051     }
0052     return QStringList();
0053 }
0054 }
0055 
0056 QStringList EPubExtractor::mimetypes() const
0057 {
0058     return supportedMimeTypes;
0059 }
0060 
0061 void EPubExtractor::extract(ExtractionResult* result)
0062 {
0063     // open epub, return on exit, file will be closed again at end of function
0064     auto ePubDoc = epub_open(result->inputUrl().toUtf8().constData(), 1);
0065     if (!ePubDoc) {
0066         qCWarning(KFILEMETADATA_LOG) << "Invalid document";
0067         return;
0068     }
0069 
0070     result->addType(Type::Document);
0071 
0072     if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
0073 
0074         for (const QString& value : fetchMetadata(ePubDoc, EPUB_TITLE)) {
0075             result->add(Property::Title, value);
0076         }
0077 
0078         for (const QString& value : fetchMetadata(ePubDoc, EPUB_SUBJECT)) {
0079             result->add(Property::Subject, value);
0080         }
0081 
0082         for (QString value : fetchMetadata(ePubDoc, EPUB_CREATOR)) {
0083             // Prefix added by libepub when no opf:role is specified
0084             if (value.startsWith(QLatin1String("Author: "), Qt::CaseSensitive)) {
0085                 value = value.mid(8).simplified();
0086             } else {
0087                 // Find 'opf:role' prefix added by libepub
0088                 int index = value.indexOf(QLatin1String(": "), Qt::CaseSensitive);
0089                 if (index > 0) {
0090                     value = value.mid(index + 2).simplified();
0091                 }
0092             }
0093 
0094             // Name is provided as "<name>(<file-as>)" when opf:file-as property
0095             // is specified, "<name>(<name>)" otherwise. Strip the last part
0096             int index = value.indexOf(QLatin1Char('('));
0097             if (index > 0) {
0098                 value = value.mid(0, index);
0099             }
0100 
0101             result->add(Property::Author, value);
0102         }
0103 
0104         // The Contributor just seems to be mostly Calibre aka the Generator
0105         /*
0106     value = fetchMetadata(ePubDoc, EPUB_CONTRIB);
0107     if( !value.isEmpty() ) {
0108         SimpleResource con;
0109         con.addType( NCO::Contact() );
0110         con.addProperty( NCO::fullname(), value );
0111 
0112         fileRes.addProperty( NCO::contributor(), con );
0113         graph << con;
0114     }*/
0115 
0116         for (const QString& value : fetchMetadata(ePubDoc, EPUB_PUBLISHER)) {
0117             result->add(Property::Publisher, value);
0118         }
0119 
0120         for (const QString& value : fetchMetadata(ePubDoc, EPUB_DESCRIPTION)) {
0121             result->add(Property::Description, value);
0122         }
0123 
0124         for (QString value : fetchMetadata(ePubDoc, EPUB_DATE)) {
0125             if (value.startsWith(QLatin1String("Unspecified:"), Qt::CaseInsensitive)) {
0126                 value = value.mid(12).simplified();
0127             } else if (value.startsWith(QLatin1String("publication:"), Qt::CaseInsensitive)) {
0128                 value = value.mid(12).simplified();
0129             } else {
0130                 continue;
0131             }
0132             QDateTime dt = ExtractorPlugin::dateTimeFromString(value);
0133             if (!dt.isNull()) {
0134                 result->add(Property::CreationDate, dt);
0135                 result->add(Property::ReleaseYear, dt.date().year());
0136             }
0137         }
0138     }
0139 
0140     //
0141     // Plain Text
0142     //
0143     if (result->inputFlags() & ExtractionResult::ExtractPlainText) {
0144         if (auto iter = epub_get_iterator(ePubDoc, EITERATOR_SPINE, 0)) {
0145             do {
0146                 char* curr = epub_it_get_curr(iter);
0147                 if (!curr) {
0148                     continue;
0149                 }
0150 
0151                 QString html = QString::fromUtf8(curr);
0152                 html.remove(QRegularExpression(QStringLiteral("<[^>]*>")));
0153                 result->append(html);
0154             } while (epub_it_get_next(iter));
0155 
0156             epub_free_iterator(iter);
0157         }
0158 
0159         auto tit = epub_get_titerator(ePubDoc, TITERATOR_NAVMAP, 0);
0160         if (!tit) {
0161             tit = epub_get_titerator(ePubDoc, TITERATOR_GUIDE, 0);
0162         }
0163         if (tit) {
0164             if (epub_tit_curr_valid(tit)) {
0165                 do {
0166                     // get link, iterator handles freeing of it
0167                     char* clink = epub_tit_get_curr_link(tit);
0168 
0169                     // epub_get_data returns -1 on failure
0170                     char* data = nullptr;
0171                     const int size = epub_get_data(ePubDoc, clink, &data);
0172                     if (size >= 0 && data) {
0173                         QString html = QString::fromUtf8(data, size);
0174                         // strip html tags
0175                         html.remove(QRegularExpression(QStringLiteral("<[^>]*>")));
0176 
0177                         result->append(html);
0178                         free(data);
0179                     }
0180                 } while (epub_tit_next(tit));
0181             }
0182             epub_free_titerator(tit);
0183         }
0184     }
0185 
0186     // close epub file again
0187     epub_close(ePubDoc);
0188 }
0189 
0190 #include "moc_epubextractor.cpp"