File indexing completed on 2024-05-12 05:09:28

0001 /***************************************************************************
0002     Copyright (C) 2016 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "bedethequefetcher.h"
0026 #include "../utils/guiproxy.h"
0027 #include "../utils/string_utils.h"
0028 #include "../utils/isbnvalidator.h"
0029 #include "../collections/comicbookcollection.h"
0030 #include "../entry.h"
0031 #include "../fieldformat.h"
0032 #include "../core/filehandler.h"
0033 #include "../images/imagefactory.h"
0034 #include "../tellico_debug.h"
0035 
0036 #include <KLocalizedString>
0037 #include <KIO/Job>
0038 #include <KJobUiDelegate>
0039 #include <KJobWidgets/KJobWidgets>
0040 
0041 #include <QRegularExpression>
0042 #include <QLabel>
0043 #include <QFile>
0044 #include <QTextStream>
0045 #include <QVBoxLayout>
0046 #include <QUrlQuery>
0047 
namespace {
  // Address of the album search page on the mobile site. It is used both to
  // build search queries and as the referrer sent with every request.
  // The anonymous namespace already gives internal linkage, and the pointer
  // itself is const so it cannot be re-seated by accident.
  const char* const BD_BASE_URL = "https://m.bedetheque.com/album";
}
0051 
0052 using namespace Tellico;
0053 using Tellico::Fetch::BedethequeFetcher;
0054 
0055 BedethequeFetcher::BedethequeFetcher(QObject* parent_)
0056     : Fetcher(parent_), m_total(0), m_started(false) {
0057 }
0058 
0059 BedethequeFetcher::~BedethequeFetcher() {
0060 }
0061 
0062 QString BedethequeFetcher::source() const {
0063   return m_name.isEmpty() ? defaultName() : m_name;
0064 }
0065 
0066 Fetch::Type BedethequeFetcher::type() const {
0067   return Bedetheque;
0068 }
0069 
0070 bool BedethequeFetcher::canFetch(int type) const {
0071   return type == Data::Collection::ComicBook;
0072 }
0073 
0074 // No UPC or Raw for now.
0075 bool BedethequeFetcher::canSearch(Fetch::FetchKey k) const {
0076   return k == Title || k == Keyword || k == ISBN;
0077 }
0078 
0079 void BedethequeFetcher::readConfigHook(const KConfigGroup& config_) {
0080   Q_UNUSED(config_);
0081 }
0082 
0083 void BedethequeFetcher::search() {
0084   m_started = true;
0085   m_matches.clear();
0086 
0087   // special case for updates which include the BD link as Raw request
0088   if(request().key() == Raw) {
0089     QUrl u(request().value());
0090     u.setHost(QStringLiteral("m.bedetheque.com")); // use mobile site for easier parsing
0091     m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo);
0092     m_job->addMetaData(QStringLiteral("referrer"), QString::fromLatin1(BD_BASE_URL));
0093     KJobWidgets::setWindow(m_job, GUI::Proxy::widget());
0094     // different slot here
0095     connect(m_job.data(), &KJob::result, this, &BedethequeFetcher::slotLinkComplete);
0096     return;
0097   }
0098 
0099   QUrl u(QString::fromLatin1(BD_BASE_URL));
0100 
0101   QUrlQuery q;
0102   switch(request().key()) {
0103     case Title:
0104       q.addQueryItem(QStringLiteral("RechTitre"), request().value());
0105       break;
0106 
0107     case Keyword:
0108       q.addQueryItem(QStringLiteral("RechSerie"), request().value());
0109       break;
0110 
0111     case ISBN:
0112       q.addQueryItem(QStringLiteral("RechISBN"), ISBNValidator::cleanValue(request().value()));
0113       break;
0114 
0115     default:
0116       myWarning() << source() << "- key not recognized:" << request().key();
0117       stop();
0118       return;
0119   }
0120 //  q.addQueryItem(QLatin1String("csrf_token_bedetheque"), m_token);
0121   u.setQuery(q);
0122 //  myDebug() << "url: " << u.url();
0123 
0124   m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo);
0125   m_job->addMetaData(QStringLiteral("referrer"), QString::fromLatin1(BD_BASE_URL));
0126   KJobWidgets::setWindow(m_job, GUI::Proxy::widget());
0127   connect(m_job.data(), &KJob::result, this, &BedethequeFetcher::slotComplete);
0128 }
0129 
0130 void BedethequeFetcher::stop() {
0131   if(!m_started) {
0132     return;
0133   }
0134 
0135   if(m_job) {
0136     m_job->kill();
0137     m_job = nullptr;
0138   }
0139   m_started = false;
0140   emit signalDone(this);
0141 }
0142 
0143 void BedethequeFetcher::slotComplete(KJob*) {
0144   if(m_job->error()) {
0145     m_job->uiDelegate()->showErrorMessage();
0146     stop();
0147     return;
0148   }
0149 
0150   QByteArray data = m_job->data();
0151   if(data.isEmpty()) {
0152     myDebug() << "no data";
0153     stop();
0154     return;
0155   }
0156 
0157   // since the fetch is done, don't worry about holding the job pointer
0158   m_job = nullptr;
0159 
0160   QString output = Tellico::decodeHTML(data);
0161 #if 0
0162   myWarning() << "Remove debug from bedethequefetcher.cpp";
0163   QFile f(QString::fromLatin1("/tmp/testbd.html"));
0164   if(f.open(QIODevice::WriteOnly)) {
0165     QTextStream t(&f);
0166     t << output;
0167   }
0168   f.close();
0169 #endif
0170 
0171   const int pos_list = output.indexOf(QLatin1String("<li data-role=\"list-divider\" role=\"heading\">"), 0, Qt::CaseInsensitive);
0172   if(pos_list == -1) {
0173     myDebug() << "No results found";
0174     stop();
0175     return;
0176   }
0177   const int pos_end = output.indexOf(QLatin1String("</ul>"), pos_list+1, Qt::CaseInsensitive);
0178   output = output.mid(pos_list, pos_end-pos_list);
0179 
0180   static const QRegularExpression anchorRx(QLatin1String("<a\\s+?[^>]*?href\\s*?=\\s*?\"(https://m.bedetheque.com/BD.+?)\".*?>(.*?)</a"),
0181                                            QRegularExpression::DotMatchesEverythingOption | QRegularExpression::CaseInsensitiveOption);
0182   static const QRegularExpression spanRx(QLatin1String("\\sclass\\s*?=\\s*?\"(.+?)\">(.+?)<"),
0183                                          QRegularExpression::DotMatchesEverythingOption);
0184 
0185   auto i = anchorRx.globalMatch(output);
0186   while(i.hasNext() && m_started) {
0187     auto match = i.next();
0188     const auto url = match.capturedRef(1);
0189     const auto result = match.capturedRef(2);
0190     if(result.isEmpty()) {
0191       continue;
0192     }
0193 
0194     QString title;
0195     QStringList desc;
0196     auto i2 = spanRx.globalMatch(result);
0197     while(i2.hasNext()) {
0198       auto spanMatch = i2.next();
0199       const auto cname = spanMatch.capturedRef(1);
0200       const auto value = spanMatch.captured(2);
0201       if(cname == QLatin1String("serie")) {
0202         desc += value;
0203       } else if(cname == QLatin1String("titre")) {
0204         title = value;
0205       } else if(cname == QLatin1String("dl")) {
0206         desc += value;
0207       }
0208     }
0209 
0210     if(!title.isEmpty() && !url.isEmpty()) {
0211       FetchResult* r = new FetchResult(this, title, desc.join(QLatin1String(" ")));
0212       m_matches.insert(r->uid, QUrl(url.toString()));
0213       emit signalResultFound(r);
0214     }
0215   }
0216 
0217   stop();
0218 }
0219 
0220 // slot called after downloading the exact link
0221 void BedethequeFetcher::slotLinkComplete(KJob*) {
0222   if(m_job->error()) {
0223     m_job->uiDelegate()->showErrorMessage();
0224     stop();
0225     return;
0226   }
0227   QByteArray data = m_job->data();
0228   if(data.isEmpty()) {
0229     myDebug() << "no data";
0230     stop();
0231     return;
0232   }
0233 
0234   // since the fetch is done, don't worry about holding the job pointer
0235   m_job = nullptr;
0236 
0237   QString output = Tellico::decodeHTML(data);
0238   Data::EntryPtr entry = parseEntry(output);
0239   if(!entry) {
0240     myDebug() << "error in processing entry";
0241     stop();
0242     return;
0243   }
0244 
0245   FetchResult* r = new FetchResult(this, entry);
0246   m_matches.insert(r->uid, QUrl(request().value()));
0247   m_entries.insert(r->uid, entry); // keep for later
0248 
0249   emit signalResultFound(r);
0250   stop();
0251 }
0252 
0253 Tellico::Data::EntryPtr BedethequeFetcher::fetchEntryHook(uint uid_) {
0254   // if we already grabbed this one, then just pull it out of the dict
0255   Data::EntryPtr entry = m_entries[uid_];
0256   if(entry) {
0257     return entry;
0258   }
0259 
0260   QUrl url = m_matches[uid_];
0261   if(url.isEmpty()) {
0262     myWarning() << "no url in map";
0263     return Data::EntryPtr();
0264   }
0265 
0266   QString results = Tellico::decodeHTML(FileHandler::readDataFile(url, true));
0267   if(results.isEmpty()) {
0268     myDebug() << "no text results";
0269     return Data::EntryPtr();
0270   }
0271 
0272 //  myDebug() << url.url();
0273 #if 0
0274   myWarning() << "Remove debug from bedethequefetcher.cpp";
0275   QFile f(QLatin1String("/tmp/testbditem.html"));
0276   if(f.open(QIODevice::WriteOnly)) {
0277     QTextStream t(&f);
0278     t.setCodec("UTF-8");
0279     t << results;
0280   }
0281   f.close();
0282 #endif
0283 
0284   entry = parseEntry(results);
0285   if(!entry) {
0286     myDebug() << "error in processing entry";
0287     return Data::EntryPtr();
0288   }
0289   m_entries.insert(uid_, entry); // keep for later
0290   return entry;
0291 }
0292 
0293 Tellico::Data::EntryPtr BedethequeFetcher::parseEntry(const QString& str_) {
0294   Data::CollPtr coll(new Data::ComicBookCollection(true));
0295 
0296  // map captions in HTML to field names
0297   QHash<QString, QString> fieldMap;
0298   fieldMap.insert(QStringLiteral("Série"),       QStringLiteral("series"));
0299   fieldMap.insert(QStringLiteral("Titre"),       QStringLiteral("title"));
0300   fieldMap.insert(QStringLiteral("Origine"),     QStringLiteral("country"));
0301 //  fieldMap.insert(QLatin1String("Format"),       QLatin1String("binding"));
0302   fieldMap.insert(QStringLiteral("Scénario"),    QStringLiteral("writer"));
0303   fieldMap.insert(QStringLiteral("Dessin"),      QStringLiteral("artist"));
0304   fieldMap.insert(QStringLiteral("Dépot légal"), QStringLiteral("pub_year"));
0305   fieldMap.insert(QStringLiteral("Editeur"),     QStringLiteral("publisher"));
0306   fieldMap.insert(QStringLiteral("Planches"),    QStringLiteral("pages"));
0307   fieldMap.insert(QStringLiteral("Style"),       QStringLiteral("genre"));
0308   fieldMap.insert(QStringLiteral("Tome"),        QStringLiteral("issue"));
0309   fieldMap.insert(QStringLiteral("Collection"),  QStringLiteral("edition"));
0310 
0311   if(optionalFields().contains(QStringLiteral("isbn"))) {
0312     Data::FieldPtr field = Data::Field::createDefaultField(Data::Field::IsbnField);
0313     coll->addField(field);
0314     fieldMap.insert(QStringLiteral("ISBN"), field->name());
0315   }
0316   if(optionalFields().contains(QStringLiteral("colorist"))) {
0317     Data::FieldPtr field(new Data::Field(QStringLiteral("colorist"), i18n("Colorist")));
0318     field->setCategory(i18n("General"));
0319     field->setFlags(Data::Field::AllowCompletion | Data::Field::AllowMultiple | Data::Field::AllowGrouped);
0320     field->setFormatType(FieldFormat::FormatName);
0321     coll->addField(field);
0322     fieldMap.insert(QStringLiteral("Couleurs"), QStringLiteral("colorist"));
0323   }
0324   if(optionalFields().contains(QStringLiteral("lien-bel"))) {
0325     Data::FieldPtr field(new Data::Field(QStringLiteral("lien-bel"), i18n("Bedetheque Link"), Data::Field::URL));
0326     field->setCategory(i18n("General"));
0327     coll->addField(field);
0328   }
0329 
0330   static const QRegularExpression tagRx(QLatin1String("<.*?>"));
0331   static const QRegularExpression yearRx(QLatin1String("\\d{4}"));
0332   // the negative lookahead with "no-border" is for multiple values
0333   const QString pat = QStringLiteral("<label>%1.*?</label>(.+?)</li>(?!\\s*<li class=\"no-border)");
0334 
0335   Data::EntryPtr entry(new Data::Entry(coll));
0336 
0337   for(QHash<QString, QString>::Iterator it = fieldMap.begin(); it != fieldMap.end(); ++it) {
0338     const QRegularExpression infoRx(pat.arg(it.key()),
0339                                     QRegularExpression::DotMatchesEverythingOption);
0340     auto match = infoRx.match(str_);
0341     if(!match.hasMatch()) {
0342       continue;
0343     }
0344     if(it.value() == QLatin1String("pub_year")) {
0345       const QString data = match.captured(1).remove(tagRx).simplified();
0346       auto yearMatch = yearRx.match(data);
0347       if(yearMatch.hasMatch()) {
0348         entry->setField(it.value(), yearMatch.captured(0));
0349       }
0350     } else if(it.value() == QLatin1String("writer") ||
0351               it.value() == QLatin1String("artist") ||
0352               it.value() == QLatin1String("publisher") ||
0353               it.value() == QLatin1String("colorist")) {
0354       // catch multiple people
0355       auto value = match.captured(1);
0356       // split the values with the "no-border" CSS
0357       value.replace(QLatin1String("<li class=\"no-border\">"), FieldFormat::delimiterString());
0358       value = FieldFormat::fixupValue(value.remove(tagRx).simplified());
0359       entry->setField(it.value(), value);
0360     } else if(it.value() == QLatin1String("genre")) {
0361       // replace comma with semi-colons to effectively split string values
0362       QString value = match.captured(1).remove(tagRx).simplified();
0363       value.replace(QLatin1String(", "), FieldFormat::delimiterString());
0364       entry->setField(it.value(), value);
0365     } else {
0366       entry->setField(it.value(), match.captured(1).remove(tagRx).simplified());
0367     }
0368     // myDebug() << it.value() << entry->field(it.value());
0369   }
0370 
0371   static const QRegularExpression imgRx(QLatin1String("<img.+?src\\s*=\\s*\"(.+?)\"\\s+alt\\s*=\\s*\"Couverture"));
0372   auto imgMatch = imgRx.match(str_);
0373   if(imgMatch.hasMatch()) {
0374     const QUrl u(imgMatch.captured(1));
0375     const QString id = ImageFactory::addImage(u, true);
0376     if(!id.isEmpty()) {
0377       entry->setField(QStringLiteral("cover"), id);
0378     }
0379   }
0380 
0381   if(optionalFields().contains(QStringLiteral("comments"))) {
0382     static const QRegularExpression chronRx(QLatin1String("La chronique\\s*</li>\\s*<li.*?>(.+?)</ul>"),
0383                                             QRegularExpression::DotMatchesEverythingOption);
0384     auto chronMatch = chronRx.match(str_);
0385     if(chronMatch.hasMatch()) {
0386       entry->setField(QStringLiteral("comments"), chronMatch.captured(1).trimmed());
0387     }
0388   }
0389 
0390   if(optionalFields().contains(QStringLiteral("lien-bel"))) {
0391     static const QRegularExpression linkRx(QLatin1String("<link\\s+rel\\s*=\\s*\"canonical\"\\s+href\\s*=\\s*\"(.+?)\""));
0392     auto linkMatch = linkRx.match(str_);
0393     if(linkMatch.hasMatch()) {
0394       entry->setField(QStringLiteral("lien-bel"), linkMatch.captured(1));
0395     }
0396   }
0397 
0398   return entry;
0399 }
0400 
0401 Tellico::Fetch::FetchRequest BedethequeFetcher::updateRequest(Data::EntryPtr entry_) {
0402   QString l = entry_->field(QStringLiteral("lien-bel"));
0403   if(!l.isEmpty()) {
0404     return FetchRequest(Fetch::Raw, l);
0405   }
0406   QString i = entry_->field(QStringLiteral("isbn"));
0407   if(!i.isEmpty()) {
0408     return FetchRequest(Fetch::ISBN, i);
0409   }
0410   QString t = entry_->field(QStringLiteral("title"));
0411   if(!t.isEmpty()) {
0412     return FetchRequest(Fetch::Title, t);
0413   }
0414   return FetchRequest();
0415 }
0416 
0417 Tellico::Fetch::ConfigWidget* BedethequeFetcher::configWidget(QWidget* parent_) const {
0418   return new BedethequeFetcher::ConfigWidget(parent_, this);
0419 }
0420 
0421 QString BedethequeFetcher::defaultName() {
0422   return QStringLiteral("Bedetheque");
0423 }
0424 
0425 QString BedethequeFetcher::defaultIcon() {
0426   return favIcon("http://www.bedetheque.com");
0427 }
0428 
0429 //static
0430 Tellico::StringHash BedethequeFetcher::allOptionalFields() {
0431   StringHash hash;
0432   hash[QStringLiteral("colorist")]     = i18n("Colorist");
0433   hash[QStringLiteral("comments")]     = i18n("Comments");
0434   hash[QStringLiteral("isbn")]         = i18n("ISBN#");
0435   // use the field name that the bedetheque.py script did, to maintain backwards compatibility
0436   hash[QStringLiteral("lien-bel")]     = i18n("Bedetheque Link");
0437   return hash;
0438 }
0439 
0440 BedethequeFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const BedethequeFetcher* fetcher_)
0441     : Fetch::ConfigWidget(parent_) {
0442   QVBoxLayout* l = new QVBoxLayout(optionsWidget());
0443   l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget()));
0444   l->addStretch();
0445 
0446   // now add additional fields widget
0447   addFieldsWidget(BedethequeFetcher::allOptionalFields(), fetcher_ ? fetcher_->optionalFields() : QStringList());
0448 }
0449 
0450 QString BedethequeFetcher::ConfigWidget::preferredName() const {
0451   return BedethequeFetcher::defaultName();
0452 }