File indexing completed on 2024-05-12 05:09:28
0001 /*************************************************************************** 0002 Copyright (C) 2016 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "bedethequefetcher.h" 0026 #include "../utils/guiproxy.h" 0027 #include "../utils/string_utils.h" 0028 #include "../utils/isbnvalidator.h" 0029 #include "../collections/comicbookcollection.h" 0030 #include "../entry.h" 0031 #include "../fieldformat.h" 0032 #include "../core/filehandler.h" 0033 #include "../images/imagefactory.h" 0034 #include "../tellico_debug.h" 0035 0036 #include <KLocalizedString> 0037 #include <KIO/Job> 0038 #include <KJobUiDelegate> 0039 #include <KJobWidgets/KJobWidgets> 0040 0041 #include <QRegularExpression> 0042 #include <QLabel> 0043 #include <QFile> 0044 #include <QTextStream> 0045 #include <QVBoxLayout> 0046 #include <QUrlQuery> 0047 0048 namespace { 0049 static const char* BD_BASE_URL = "https://m.bedetheque.com/album"; 0050 } 0051 0052 using namespace Tellico; 0053 using Tellico::Fetch::BedethequeFetcher; 0054 0055 BedethequeFetcher::BedethequeFetcher(QObject* parent_) 0056 : Fetcher(parent_), m_total(0), m_started(false) { 0057 } 0058 0059 BedethequeFetcher::~BedethequeFetcher() { 0060 } 0061 0062 QString BedethequeFetcher::source() const { 0063 return m_name.isEmpty() ? defaultName() : m_name; 0064 } 0065 0066 Fetch::Type BedethequeFetcher::type() const { 0067 return Bedetheque; 0068 } 0069 0070 bool BedethequeFetcher::canFetch(int type) const { 0071 return type == Data::Collection::ComicBook; 0072 } 0073 0074 // No UPC or Raw for now. 0075 bool BedethequeFetcher::canSearch(Fetch::FetchKey k) const { 0076 return k == Title || k == Keyword || k == ISBN; 0077 } 0078 0079 void BedethequeFetcher::readConfigHook(const KConfigGroup& config_) { 0080 Q_UNUSED(config_); 0081 } 0082 0083 void BedethequeFetcher::search() { 0084 m_started = true; 0085 m_matches.clear(); 0086 0087 // special case for updates which include the BD link as Raw request 0088 if(request().key() == Raw) { 0089 QUrl u(request().value()); 0090 u.setHost(QStringLiteral("m.bedetheque.com")); // use mobile site for easier parsing 0091 m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo); 0092 m_job->addMetaData(QStringLiteral("referrer"), QString::fromLatin1(BD_BASE_URL)); 0093 KJobWidgets::setWindow(m_job, GUI::Proxy::widget()); 0094 // different slot here 0095 connect(m_job.data(), &KJob::result, this, &BedethequeFetcher::slotLinkComplete); 0096 return; 0097 } 0098 0099 QUrl u(QString::fromLatin1(BD_BASE_URL)); 0100 0101 QUrlQuery q; 0102 switch(request().key()) { 0103 case Title: 0104 q.addQueryItem(QStringLiteral("RechTitre"), request().value()); 0105 break; 0106 0107 case Keyword: 0108 q.addQueryItem(QStringLiteral("RechSerie"), request().value()); 0109 break; 0110 0111 case ISBN: 0112 q.addQueryItem(QStringLiteral("RechISBN"), ISBNValidator::cleanValue(request().value())); 0113 break; 0114 0115 default: 0116 myWarning() << source() << "- key not recognized:" << request().key(); 0117 stop(); 0118 return; 0119 } 0120 // q.addQueryItem(QLatin1String("csrf_token_bedetheque"), m_token); 0121 u.setQuery(q); 0122 // myDebug() << "url: " << u.url(); 0123 0124 m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo); 0125 m_job->addMetaData(QStringLiteral("referrer"), QString::fromLatin1(BD_BASE_URL)); 0126 KJobWidgets::setWindow(m_job, GUI::Proxy::widget()); 0127 connect(m_job.data(), &KJob::result, this, &BedethequeFetcher::slotComplete); 0128 } 0129 0130 void BedethequeFetcher::stop() { 0131 if(!m_started) { 0132 return; 0133 } 0134 0135 if(m_job) { 0136 m_job->kill(); 0137 m_job = nullptr; 0138 } 0139 m_started = false; 0140 emit signalDone(this); 0141 } 0142 0143 void BedethequeFetcher::slotComplete(KJob*) { 0144 if(m_job->error()) { 0145 m_job->uiDelegate()->showErrorMessage(); 0146 stop(); 0147 return; 0148 } 0149 0150 QByteArray data = m_job->data(); 0151 if(data.isEmpty()) { 0152 myDebug() << "no data"; 0153 stop(); 0154 return; 0155 } 0156 0157 // since the fetch is done, don't worry about holding the job pointer 0158 m_job = nullptr; 0159 0160 QString output = Tellico::decodeHTML(data); 0161 #if 0 0162 myWarning() << "Remove debug from bedethequefetcher.cpp"; 0163 QFile f(QString::fromLatin1("/tmp/testbd.html")); 0164 if(f.open(QIODevice::WriteOnly)) { 0165 QTextStream t(&f); 0166 t << output; 0167 } 0168 f.close(); 0169 #endif 0170 0171 const int pos_list = output.indexOf(QLatin1String("<li data-role=\"list-divider\" role=\"heading\">"), 0, Qt::CaseInsensitive); 0172 if(pos_list == -1) { 0173 myDebug() << "No results found"; 0174 stop(); 0175 return; 0176 } 0177 const int pos_end = output.indexOf(QLatin1String("</ul>"), pos_list+1, Qt::CaseInsensitive); 0178 output = output.mid(pos_list, pos_end-pos_list); 0179 0180 static const QRegularExpression anchorRx(QLatin1String("<a\\s+?[^>]*?href\\s*?=\\s*?\"(https://m.bedetheque.com/BD.+?)\".*?>(.*?)</a"), 0181 QRegularExpression::DotMatchesEverythingOption | QRegularExpression::CaseInsensitiveOption); 0182 static const QRegularExpression spanRx(QLatin1String("\\sclass\\s*?=\\s*?\"(.+?)\">(.+?)<"), 0183 QRegularExpression::DotMatchesEverythingOption); 0184 0185 auto i = anchorRx.globalMatch(output); 0186 while(i.hasNext() && m_started) { 0187 auto match = i.next(); 0188 const auto url = match.capturedRef(1); 0189 const auto result = match.capturedRef(2); 0190 if(result.isEmpty()) { 0191 continue; 0192 } 0193 0194 QString title; 0195 QStringList desc; 0196 auto i2 = spanRx.globalMatch(result); 0197 while(i2.hasNext()) { 0198 auto spanMatch = i2.next(); 0199 const auto cname = spanMatch.capturedRef(1); 0200 const auto value = spanMatch.captured(2); 0201 if(cname == QLatin1String("serie")) { 0202 desc += value; 0203 } else if(cname == QLatin1String("titre")) { 0204 title = value; 0205 } else if(cname == QLatin1String("dl")) { 0206 desc += value; 0207 } 0208 } 0209 0210 if(!title.isEmpty() && !url.isEmpty()) { 0211 FetchResult* r = new FetchResult(this, title, desc.join(QLatin1String(" "))); 0212 m_matches.insert(r->uid, QUrl(url.toString())); 0213 emit signalResultFound(r); 0214 } 0215 } 0216 0217 stop(); 0218 } 0219 0220 // slot called after downloading the exact link 0221 void BedethequeFetcher::slotLinkComplete(KJob*) { 0222 if(m_job->error()) { 0223 m_job->uiDelegate()->showErrorMessage(); 0224 stop(); 0225 return; 0226 } 0227 QByteArray data = m_job->data(); 0228 if(data.isEmpty()) { 0229 myDebug() << "no data"; 0230 stop(); 0231 return; 0232 } 0233 0234 // since the fetch is done, don't worry about holding the job pointer 0235 m_job = nullptr; 0236 0237 QString output = Tellico::decodeHTML(data); 0238 Data::EntryPtr entry = parseEntry(output); 0239 if(!entry) { 0240 myDebug() << "error in processing entry"; 0241 stop(); 0242 return; 0243 } 0244 0245 FetchResult* r = new FetchResult(this, entry); 0246 m_matches.insert(r->uid, QUrl(request().value())); 0247 m_entries.insert(r->uid, entry); // keep for later 0248 0249 emit signalResultFound(r); 0250 stop(); 0251 } 0252 0253 Tellico::Data::EntryPtr BedethequeFetcher::fetchEntryHook(uint uid_) { 0254 // if we already grabbed this one, then just pull it out of the dict 0255 Data::EntryPtr entry = m_entries[uid_]; 0256 if(entry) { 0257 return entry; 0258 } 0259 0260 QUrl url = m_matches[uid_]; 0261 if(url.isEmpty()) { 0262 myWarning() << "no url in map"; 0263 return Data::EntryPtr(); 0264 } 0265 0266 QString results = Tellico::decodeHTML(FileHandler::readDataFile(url, true)); 0267 if(results.isEmpty()) { 0268 myDebug() << "no text results"; 0269 return Data::EntryPtr(); 0270 } 0271 0272 // myDebug() << url.url(); 0273 #if 0 0274 myWarning() << "Remove debug from bedethequefetcher.cpp"; 0275 QFile f(QLatin1String("/tmp/testbditem.html")); 0276 if(f.open(QIODevice::WriteOnly)) { 0277 QTextStream t(&f); 0278 t.setCodec("UTF-8"); 0279 t << results; 0280 } 0281 f.close(); 0282 #endif 0283 0284 entry = parseEntry(results); 0285 if(!entry) { 0286 myDebug() << "error in processing entry"; 0287 return Data::EntryPtr(); 0288 } 0289 m_entries.insert(uid_, entry); // keep for later 0290 return entry; 0291 } 0292 0293 Tellico::Data::EntryPtr BedethequeFetcher::parseEntry(const QString& str_) { 0294 Data::CollPtr coll(new Data::ComicBookCollection(true)); 0295 0296 // map captions in HTML to field names 0297 QHash<QString, QString> fieldMap; 0298 fieldMap.insert(QStringLiteral("Série"), QStringLiteral("series")); 0299 fieldMap.insert(QStringLiteral("Titre"), QStringLiteral("title")); 0300 fieldMap.insert(QStringLiteral("Origine"), QStringLiteral("country")); 0301 // fieldMap.insert(QLatin1String("Format"), QLatin1String("binding")); 0302 fieldMap.insert(QStringLiteral("Scénario"), QStringLiteral("writer")); 0303 fieldMap.insert(QStringLiteral("Dessin"), QStringLiteral("artist")); 0304 fieldMap.insert(QStringLiteral("Dépot légal"), QStringLiteral("pub_year")); 0305 fieldMap.insert(QStringLiteral("Editeur"), QStringLiteral("publisher")); 0306 fieldMap.insert(QStringLiteral("Planches"), QStringLiteral("pages")); 0307 fieldMap.insert(QStringLiteral("Style"), QStringLiteral("genre")); 0308 fieldMap.insert(QStringLiteral("Tome"), QStringLiteral("issue")); 0309 fieldMap.insert(QStringLiteral("Collection"), QStringLiteral("edition")); 0310 0311 if(optionalFields().contains(QStringLiteral("isbn"))) { 0312 Data::FieldPtr field = Data::Field::createDefaultField(Data::Field::IsbnField); 0313 coll->addField(field); 0314 fieldMap.insert(QStringLiteral("ISBN"), field->name()); 0315 } 0316 if(optionalFields().contains(QStringLiteral("colorist"))) { 0317 Data::FieldPtr field(new Data::Field(QStringLiteral("colorist"), i18n("Colorist"))); 0318 field->setCategory(i18n("General")); 0319 field->setFlags(Data::Field::AllowCompletion | Data::Field::AllowMultiple | Data::Field::AllowGrouped); 0320 field->setFormatType(FieldFormat::FormatName); 0321 coll->addField(field); 0322 fieldMap.insert(QStringLiteral("Couleurs"), QStringLiteral("colorist")); 0323 } 0324 if(optionalFields().contains(QStringLiteral("lien-bel"))) { 0325 Data::FieldPtr field(new Data::Field(QStringLiteral("lien-bel"), i18n("Bedetheque Link"), Data::Field::URL)); 0326 field->setCategory(i18n("General")); 0327 coll->addField(field); 0328 } 0329 0330 static const QRegularExpression tagRx(QLatin1String("<.*?>")); 0331 static const QRegularExpression yearRx(QLatin1String("\\d{4}")); 0332 // the negative lookahead with "no-border" is for multiple values 0333 const QString pat = QStringLiteral("<label>%1.*?</label>(.+?)</li>(?!\\s*<li class=\"no-border)"); 0334 0335 Data::EntryPtr entry(new Data::Entry(coll)); 0336 0337 for(QHash<QString, QString>::Iterator it = fieldMap.begin(); it != fieldMap.end(); ++it) { 0338 const QRegularExpression infoRx(pat.arg(it.key()), 0339 QRegularExpression::DotMatchesEverythingOption); 0340 auto match = infoRx.match(str_); 0341 if(!match.hasMatch()) { 0342 continue; 0343 } 0344 if(it.value() == QLatin1String("pub_year")) { 0345 const QString data = match.captured(1).remove(tagRx).simplified(); 0346 auto yearMatch = yearRx.match(data); 0347 if(yearMatch.hasMatch()) { 0348 entry->setField(it.value(), yearMatch.captured(0)); 0349 } 0350 } else if(it.value() == QLatin1String("writer") || 0351 it.value() == QLatin1String("artist") || 0352 it.value() == QLatin1String("publisher") || 0353 it.value() == QLatin1String("colorist")) { 0354 // catch multiple people 0355 auto value = match.captured(1); 0356 // split the values with the "no-border" CSS 0357 value.replace(QLatin1String("<li class=\"no-border\">"), FieldFormat::delimiterString()); 0358 value = FieldFormat::fixupValue(value.remove(tagRx).simplified()); 0359 entry->setField(it.value(), value); 0360 } else if(it.value() == QLatin1String("genre")) { 0361 // replace comma with semi-colons to effectively split string values 0362 QString value = match.captured(1).remove(tagRx).simplified(); 0363 value.replace(QLatin1String(", "), FieldFormat::delimiterString()); 0364 entry->setField(it.value(), value); 0365 } else { 0366 entry->setField(it.value(), match.captured(1).remove(tagRx).simplified()); 0367 } 0368 // myDebug() << it.value() << entry->field(it.value()); 0369 } 0370 0371 static const QRegularExpression imgRx(QLatin1String("<img.+?src\\s*=\\s*\"(.+?)\"\\s+alt\\s*=\\s*\"Couverture")); 0372 auto imgMatch = imgRx.match(str_); 0373 if(imgMatch.hasMatch()) { 0374 const QUrl u(imgMatch.captured(1)); 0375 const QString id = ImageFactory::addImage(u, true); 0376 if(!id.isEmpty()) { 0377 entry->setField(QStringLiteral("cover"), id); 0378 } 0379 } 0380 0381 if(optionalFields().contains(QStringLiteral("comments"))) { 0382 static const QRegularExpression chronRx(QLatin1String("La chronique\\s*</li>\\s*<li.*?>(.+?)</ul>"), 0383 QRegularExpression::DotMatchesEverythingOption); 0384 auto chronMatch = chronRx.match(str_); 0385 if(chronMatch.hasMatch()) { 0386 entry->setField(QStringLiteral("comments"), chronMatch.captured(1).trimmed()); 0387 } 0388 } 0389 0390 if(optionalFields().contains(QStringLiteral("lien-bel"))) { 0391 static const QRegularExpression linkRx(QLatin1String("<link\\s+rel\\s*=\\s*\"canonical\"\\s+href\\s*=\\s*\"(.+?)\"")); 0392 auto linkMatch = linkRx.match(str_); 0393 if(linkMatch.hasMatch()) { 0394 entry->setField(QStringLiteral("lien-bel"), linkMatch.captured(1)); 0395 } 0396 } 0397 0398 return entry; 0399 } 0400 0401 Tellico::Fetch::FetchRequest BedethequeFetcher::updateRequest(Data::EntryPtr entry_) { 0402 QString l = entry_->field(QStringLiteral("lien-bel")); 0403 if(!l.isEmpty()) { 0404 return FetchRequest(Fetch::Raw, l); 0405 } 0406 QString i = entry_->field(QStringLiteral("isbn")); 0407 if(!i.isEmpty()) { 0408 return FetchRequest(Fetch::ISBN, i); 0409 } 0410 QString t = entry_->field(QStringLiteral("title")); 0411 if(!t.isEmpty()) { 0412 return FetchRequest(Fetch::Title, t); 0413 } 0414 return FetchRequest(); 0415 } 0416 0417 Tellico::Fetch::ConfigWidget* BedethequeFetcher::configWidget(QWidget* parent_) const { 0418 return new BedethequeFetcher::ConfigWidget(parent_, this); 0419 } 0420 0421 QString BedethequeFetcher::defaultName() { 0422 return QStringLiteral("Bedetheque"); 0423 } 0424 0425 QString BedethequeFetcher::defaultIcon() { 0426 return favIcon("http://www.bedetheque.com"); 0427 } 0428 0429 //static 0430 Tellico::StringHash BedethequeFetcher::allOptionalFields() { 0431 StringHash hash; 0432 hash[QStringLiteral("colorist")] = i18n("Colorist"); 0433 hash[QStringLiteral("comments")] = i18n("Comments"); 0434 hash[QStringLiteral("isbn")] = i18n("ISBN#"); 0435 // use the field name that the bedetheque.py script did, to maintain backwards compatibility 0436 hash[QStringLiteral("lien-bel")] = i18n("Bedetheque Link"); 0437 return hash; 0438 } 0439 0440 BedethequeFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const BedethequeFetcher* fetcher_) 0441 : Fetch::ConfigWidget(parent_) { 0442 QVBoxLayout* l = new QVBoxLayout(optionsWidget()); 0443 l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget())); 0444 l->addStretch(); 0445 0446 // now add additional fields widget 0447 addFieldsWidget(BedethequeFetcher::allOptionalFields(), fetcher_ ? fetcher_->optionalFields() : QStringList()); 0448 } 0449 0450 QString BedethequeFetcher::ConfigWidget::preferredName() const { 0451 return BedethequeFetcher::defaultName(); 0452 }