File indexing completed on 2025-03-16 12:49:35
0001 /* 0002 SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in> 0003 SPDX-FileCopyrightText: 2016 Christoph Cullmann <cullmann@kde.org> 0004 0005 SPDX-License-Identifier: LGPL-2.1-or-later 0006 */ 0007 0008 0009 #include "epubextractor.h" 0010 #include "kfilemetadata_debug.h" 0011 0012 #include <epub.h> 0013 0014 #include <QDateTime> 0015 #include <QRegularExpression> 0016 0017 using namespace KFileMetaData; 0018 0019 EPubExtractor::EPubExtractor(QObject* parent) 0020 : ExtractorPlugin(parent) 0021 { 0022 0023 } 0024 0025 namespace 0026 { 0027 static const QStringList supportedMimeTypes = { 0028 QStringLiteral("application/epub+zip"), 0029 }; 0030 0031 const QStringList fetchMetadata(struct epub* e, const epub_metadata& type) 0032 { 0033 int size = 0; 0034 unsigned char** data = epub_get_metadata(e, type, &size); 0035 if (data) { 0036 QStringList strList; 0037 strList.reserve(size); 0038 for (int i = 0; i < size; i++) { 0039 // skip nullptr entries, can happen for broken xml files 0040 // also skip empty entries 0041 if (!data[i] || !data[i][0]) { 0042 continue; 0043 } 0044 0045 strList << QString::fromUtf8((char*)data[i]); 0046 free(data[i]); 0047 } 0048 free(data); 0049 0050 return strList; 0051 } 0052 return QStringList(); 0053 } 0054 } 0055 0056 QStringList EPubExtractor::mimetypes() const 0057 { 0058 return supportedMimeTypes; 0059 } 0060 0061 void EPubExtractor::extract(ExtractionResult* result) 0062 { 0063 // open epub, return on exit, file will be closed again at end of function 0064 auto ePubDoc = epub_open(result->inputUrl().toUtf8().constData(), 1); 0065 if (!ePubDoc) { 0066 qCWarning(KFILEMETADATA_LOG) << "Invalid document"; 0067 return; 0068 } 0069 0070 result->addType(Type::Document); 0071 0072 if (result->inputFlags() & ExtractionResult::ExtractMetaData) { 0073 0074 for (const QString& value : fetchMetadata(ePubDoc, EPUB_TITLE)) { 0075 result->add(Property::Title, value); 0076 } 0077 0078 for (const QString& value : fetchMetadata(ePubDoc, EPUB_SUBJECT)) { 0079 result->add(Property::Subject, value); 0080 } 0081 0082 for (QString value : fetchMetadata(ePubDoc, EPUB_CREATOR)) { 0083 // Prefix added by libepub when no opf:role is specified 0084 if (value.startsWith(QLatin1String("Author: "), Qt::CaseSensitive)) { 0085 value = value.mid(8).simplified(); 0086 } else { 0087 // Find 'opf:role' prefix added by libepub 0088 int index = value.indexOf(QLatin1String(": "), Qt::CaseSensitive); 0089 if (index > 0) { 0090 value = value.mid(index + 2).simplified(); 0091 } 0092 } 0093 0094 // Name is provided as "<name>(<file-as>)" when opf:file-as property 0095 // is specified, "<name>(<name>)" otherwise. Strip the last part 0096 int index = value.indexOf(QLatin1Char('(')); 0097 if (index > 0) { 0098 value = value.mid(0, index); 0099 } 0100 0101 result->add(Property::Author, value); 0102 } 0103 0104 // The Contributor just seems to be mostly Calibre aka the Generator 0105 /* 0106 value = fetchMetadata(ePubDoc, EPUB_CONTRIB); 0107 if( !value.isEmpty() ) { 0108 SimpleResource con; 0109 con.addType( NCO::Contact() ); 0110 con.addProperty( NCO::fullname(), value ); 0111 0112 fileRes.addProperty( NCO::contributor(), con ); 0113 graph << con; 0114 }*/ 0115 0116 for (const QString& value : fetchMetadata(ePubDoc, EPUB_PUBLISHER)) { 0117 result->add(Property::Publisher, value); 0118 } 0119 0120 for (const QString& value : fetchMetadata(ePubDoc, EPUB_DESCRIPTION)) { 0121 result->add(Property::Description, value); 0122 } 0123 0124 for (QString value : fetchMetadata(ePubDoc, EPUB_DATE)) { 0125 if (value.startsWith(QLatin1String("Unspecified:"), Qt::CaseInsensitive)) { 0126 value = value.mid(12).simplified(); 0127 } else if (value.startsWith(QLatin1String("publication:"), Qt::CaseInsensitive)) { 0128 value = value.mid(12).simplified(); 0129 } else { 0130 continue; 0131 } 0132 QDateTime dt = ExtractorPlugin::dateTimeFromString(value); 0133 if (!dt.isNull()) { 0134 result->add(Property::CreationDate, dt); 0135 result->add(Property::ReleaseYear, dt.date().year()); 0136 } 0137 } 0138 } 0139 0140 // 0141 // Plain Text 0142 // 0143 if (result->inputFlags() & ExtractionResult::ExtractPlainText) { 0144 if (auto iter = epub_get_iterator(ePubDoc, EITERATOR_SPINE, 0)) { 0145 do { 0146 char* curr = epub_it_get_curr(iter); 0147 if (!curr) { 0148 continue; 0149 } 0150 0151 QString html = QString::fromUtf8(curr); 0152 html.remove(QRegularExpression(QStringLiteral("<[^>]*>"))); 0153 result->append(html); 0154 } while (epub_it_get_next(iter)); 0155 0156 epub_free_iterator(iter); 0157 } 0158 0159 auto tit = epub_get_titerator(ePubDoc, TITERATOR_NAVMAP, 0); 0160 if (!tit) { 0161 tit = epub_get_titerator(ePubDoc, TITERATOR_GUIDE, 0); 0162 } 0163 if (tit) { 0164 if (epub_tit_curr_valid(tit)) { 0165 do { 0166 // get link, iterator handles freeing of it 0167 char* clink = epub_tit_get_curr_link(tit); 0168 0169 // epub_get_data returns -1 on failure 0170 char* data = nullptr; 0171 const int size = epub_get_data(ePubDoc, clink, &data); 0172 if (size >= 0 && data) { 0173 QString html = QString::fromUtf8(data, size); 0174 // strip html tags 0175 html.remove(QRegularExpression(QStringLiteral("<[^>]*>"))); 0176 0177 result->append(html); 0178 free(data); 0179 } 0180 } while (epub_tit_next(tit)); 0181 } 0182 epub_free_titerator(tit); 0183 } 0184 } 0185 0186 // close epub file again 0187 epub_close(ePubDoc); 0188 } 0189 0190 #include "moc_epubextractor.cpp"