File indexing completed on 2024-05-19 16:18:38

0001 /***************************************************************************
0002     Copyright (C) 2016 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "bedethequefetcher.h"
0026 #include "../utils/guiproxy.h"
0027 #include "../utils/string_utils.h"
0028 #include "../utils/isbnvalidator.h"
0029 #include "../collections/comicbookcollection.h"
0030 #include "../entry.h"
0031 #include "../fieldformat.h"
0032 #include "../core/filehandler.h"
0033 #include "../images/imagefactory.h"
0034 #include "../tellico_debug.h"
0035 
0036 #include <KLocalizedString>
0037 #include <KIO/Job>
0038 #include <KJobUiDelegate>
0039 #include <KJobWidgets/KJobWidgets>
0040 
0041 #include <QRegExp>
0042 #include <QLabel>
0043 #include <QFile>
0044 #include <QTextStream>
0045 #include <QVBoxLayout>
0046 #include <QUrlQuery>
0047 
namespace {
  // Album search endpoint on the mobile site. search() uses it both as the
  // base of the query URL and as the "referrer" metadata of its KIO jobs.
  // The anonymous namespace already gives internal linkage, so "static" is
  // not needed; the pointer itself is const so it can never be re-seated.
  const char* const BD_BASE_URL = "https://m.bedetheque.com/album";
}
0051 
0052 using namespace Tellico;
0053 using Tellico::Fetch::BedethequeFetcher;
0054 
0055 BedethequeFetcher::BedethequeFetcher(QObject* parent_)
0056     : Fetcher(parent_), m_total(0), m_started(false) {
0057 }
0058 
0059 BedethequeFetcher::~BedethequeFetcher() {
0060 }
0061 
0062 QString BedethequeFetcher::source() const {
0063   return m_name.isEmpty() ? defaultName() : m_name;
0064 }
0065 
0066 Fetch::Type BedethequeFetcher::type() const {
0067   return Bedetheque;
0068 }
0069 
0070 bool BedethequeFetcher::canFetch(int type) const {
0071   return type == Data::Collection::ComicBook;
0072 }
0073 
0074 // No UPC or Raw for now.
0075 bool BedethequeFetcher::canSearch(Fetch::FetchKey k) const {
0076   return k == Title || k == Keyword || k == ISBN;
0077 }
0078 
0079 void BedethequeFetcher::readConfigHook(const KConfigGroup& config_) {
0080   Q_UNUSED(config_);
0081 }
0082 
0083 void BedethequeFetcher::search() {
0084   m_started = true;
0085   m_matches.clear();
0086 
0087   // special case for updates which include the BD link as Raw request
0088   if(request().key() == Raw) {
0089     QUrl u(request().value());
0090     u.setHost(QStringLiteral("m.bedetheque.com")); // use mobile site for easier parsing
0091     m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo);
0092     m_job->addMetaData(QStringLiteral("referrer"), QString::fromLatin1(BD_BASE_URL));
0093     KJobWidgets::setWindow(m_job, GUI::Proxy::widget());
0094     // different slot here
0095     connect(m_job.data(), &KJob::result, this, &BedethequeFetcher::slotLinkComplete);
0096     return;
0097   }
0098 
0099   QUrl u(QString::fromLatin1(BD_BASE_URL));
0100 
0101 /*
0102   fetchToken();
0103   if(m_token.isEmpty()) {
0104     myDebug() << "empty token";
0105     stop();
0106     return;
0107   }
0108 */
0109 
0110   QUrlQuery q;
0111   switch(request().key()) {
0112     case Title:
0113       q.addQueryItem(QStringLiteral("RechTitre"), request().value());
0114       break;
0115 
0116     case Keyword:
0117       q.addQueryItem(QStringLiteral("RechSerie"), request().value());
0118       break;
0119 
0120     case ISBN:
0121       q.addQueryItem(QStringLiteral("RechISBN"), ISBNValidator::cleanValue(request().value()));
0122       break;
0123 
0124     default:
0125       myWarning() << "key not recognized: " << request().key();
0126       stop();
0127       return;
0128   }
0129 //  q.addQueryItem(QLatin1String("csrf_token_bedetheque"), m_token);
0130   u.setQuery(q);
0131 //  myDebug() << "url: " << u.url();
0132 
0133   m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo);
0134   m_job->addMetaData(QStringLiteral("referrer"), QString::fromLatin1(BD_BASE_URL));
0135   KJobWidgets::setWindow(m_job, GUI::Proxy::widget());
0136   connect(m_job.data(), &KJob::result, this, &BedethequeFetcher::slotComplete);
0137 }
0138 
0139 void BedethequeFetcher::stop() {
0140   if(!m_started) {
0141     return;
0142   }
0143 
0144   if(m_job) {
0145     m_job->kill();
0146     m_job = nullptr;
0147   }
0148   m_started = false;
0149   emit signalDone(this);
0150 }
0151 
0152 void BedethequeFetcher::slotComplete(KJob*) {
0153   if(m_job->error()) {
0154     m_job->uiDelegate()->showErrorMessage();
0155     stop();
0156     return;
0157   }
0158 
0159   QByteArray data = m_job->data();
0160   if(data.isEmpty()) {
0161     myDebug() << "no data";
0162     stop();
0163     return;
0164   }
0165 
0166   // since the fetch is done, don't worry about holding the job pointer
0167   m_job = nullptr;
0168 
0169   QString output = Tellico::decodeHTML(data);
0170 #if 0
0171   myWarning() << "Remove debug from bedethequefetcher.cpp";
0172   QFile f(QString::fromLatin1("/tmp/testbd.html"));
0173   if(f.open(QIODevice::WriteOnly)) {
0174     QTextStream t(&f);
0175     t << output;
0176   }
0177   f.close();
0178 #endif
0179 
0180   const int pos_list = output.indexOf(QLatin1String("<li data-role=\"list-divider\" role=\"heading\">"), 0, Qt::CaseInsensitive);
0181   if(pos_list == -1) {
0182     myDebug() << "No results found";
0183     stop();
0184     return;
0185   }
0186   const int pos_end = output.indexOf(QLatin1String("</ul>"), pos_list+1, Qt::CaseInsensitive);
0187   output = output.mid(pos_list, pos_end-pos_list);
0188 
0189   QString pat = QStringLiteral("https://m.bedetheque.com/BD");
0190   QRegExp anchorRx(QLatin1String("<a\\s+[^>]*href\\s*=\\s*[\"'](") +
0191                    QRegExp::escape(pat) +
0192                    QLatin1String("[^\"']*)\"[^>]*>(.*)</a"), Qt::CaseInsensitive);
0193   anchorRx.setMinimal(true);
0194 
0195   QRegExp spanRx(QLatin1String("\\sclass\\s*=\\s*\"(.*)\">(.*)<"));
0196   spanRx.setMinimal(true);
0197 
0198   for(int pos = anchorRx.indexIn(output); m_started && pos > -1; pos = anchorRx.indexIn(output, pos+anchorRx.matchedLength())) {
0199     QString url = anchorRx.cap(1);
0200     if(url.isEmpty()) {
0201       continue;
0202     }
0203 
0204     const QString result = anchorRx.cap(2);
0205     if(result.isEmpty()) {
0206       continue;
0207     }
0208 
0209     QString title;
0210     QStringList desc;
0211     for(int pos2 = spanRx.indexIn(result); pos2 > -1; pos2 = spanRx.indexIn(result, pos2+spanRx.matchedLength())) {
0212       QString cname = spanRx.cap(1);
0213       QString value = spanRx.cap(2);
0214       if(cname == QLatin1String("serie")) {
0215         desc += value;
0216       } else if(cname == QLatin1String("titre")) {
0217         title = value;
0218       } else if(cname == QLatin1String("dl")) {
0219         desc += value;
0220       }
0221     }
0222 
0223     if(!title.isEmpty() && !url.isEmpty()) {
0224       FetchResult* r = new FetchResult(this, title, desc.join(QLatin1String(" ")));
0225       m_matches.insert(r->uid, QUrl(url));
0226       emit signalResultFound(r);
0227     }
0228   }
0229 
0230   stop();
0231 }
0232 
0233 // slot called after downloading the exact link
0234 void BedethequeFetcher::slotLinkComplete(KJob*) {
0235   if(m_job->error()) {
0236     m_job->uiDelegate()->showErrorMessage();
0237     stop();
0238     return;
0239   }
0240   QByteArray data = m_job->data();
0241   if(data.isEmpty()) {
0242     myDebug() << "no data";
0243     stop();
0244     return;
0245   }
0246 
0247   // since the fetch is done, don't worry about holding the job pointer
0248   m_job = nullptr;
0249 
0250   QString output = Tellico::decodeHTML(data);
0251   Data::EntryPtr entry = parseEntry(output);
0252   if(!entry) {
0253     myDebug() << "error in processing entry";
0254     stop();
0255     return;
0256   }
0257 
0258   FetchResult* r = new FetchResult(this, entry);
0259   m_matches.insert(r->uid, QUrl(request().value()));
0260   m_entries.insert(r->uid, entry); // keep for later
0261 
0262   emit signalResultFound(r);
0263   stop();
0264 }
0265 
0266 Tellico::Data::EntryPtr BedethequeFetcher::fetchEntryHook(uint uid_) {
0267   // if we already grabbed this one, then just pull it out of the dict
0268   Data::EntryPtr entry = m_entries[uid_];
0269   if(entry) {
0270     return entry;
0271   }
0272 
0273   QUrl url = m_matches[uid_];
0274   if(url.isEmpty()) {
0275     myWarning() << "no url in map";
0276     return Data::EntryPtr();
0277   }
0278 
0279   QString results = Tellico::decodeHTML(FileHandler::readDataFile(url, true));
0280   if(results.isEmpty()) {
0281     myDebug() << "no text results";
0282     return Data::EntryPtr();
0283   }
0284 
0285 //  myDebug() << url.url();
0286 #if 0
0287   myWarning() << "Remove debug from bedethequefetcher.cpp";
0288   QFile f(QLatin1String("/tmp/testbditem.html"));
0289   if(f.open(QIODevice::WriteOnly)) {
0290     QTextStream t(&f);
0291     t.setCodec("UTF-8");
0292     t << results;
0293   }
0294   f.close();
0295 #endif
0296 
0297   entry = parseEntry(results);
0298   if(!entry) {
0299     myDebug() << "error in processing entry";
0300     return Data::EntryPtr();
0301   }
0302   m_entries.insert(uid_, entry); // keep for later
0303   return entry;
0304 }
0305 
0306 Tellico::Data::EntryPtr BedethequeFetcher::parseEntry(const QString& str_) {
0307   Data::CollPtr coll(new Data::ComicBookCollection(true));
0308 
0309  // map captions in HTML to field names
0310   QHash<QString, QString> fieldMap;
0311   fieldMap.insert(QStringLiteral("Série"),       QStringLiteral("series"));
0312   fieldMap.insert(QStringLiteral("Titre"),           QStringLiteral("title"));
0313   fieldMap.insert(QStringLiteral("Origine"),         QStringLiteral("country"));
0314 //  fieldMap.insert(QLatin1String("Format"),          QLatin1String("binding"));
0315   fieldMap.insert(QStringLiteral("Scénario"),    QStringLiteral("writer"));
0316   fieldMap.insert(QStringLiteral("Dessin"),          QStringLiteral("artist"));
0317   fieldMap.insert(QStringLiteral("Dépot légal"), QStringLiteral("pub_year"));
0318   fieldMap.insert(QStringLiteral("Editeur"),         QStringLiteral("publisher"));
0319   fieldMap.insert(QStringLiteral("Planches"),        QStringLiteral("pages"));
0320   fieldMap.insert(QStringLiteral("Style"),           QStringLiteral("genre"));
0321   fieldMap.insert(QStringLiteral("Tome"),            QStringLiteral("issue"));
0322   fieldMap.insert(QStringLiteral("Collection"),      QStringLiteral("edition"));
0323 
0324   if(optionalFields().contains(QStringLiteral("isbn"))) {
0325     Data::FieldPtr field = Data::Field::createDefaultField(Data::Field::IsbnField);
0326     coll->addField(field);
0327     fieldMap.insert(QStringLiteral("ISBN"), field->name());
0328   }
0329   if(optionalFields().contains(QStringLiteral("colorist"))) {
0330     Data::FieldPtr field(new Data::Field(QStringLiteral("colorist"), i18n("Colorist")));
0331     field->setCategory(i18n("General"));
0332     field->setFlags(Data::Field::AllowCompletion | Data::Field::AllowMultiple | Data::Field::AllowGrouped);
0333     field->setFormatType(FieldFormat::FormatName);
0334     coll->addField(field);
0335     fieldMap.insert(QStringLiteral("Couleurs"), QStringLiteral("colorist"));
0336   }
0337   if(optionalFields().contains(QStringLiteral("lien-bel"))) {
0338     Data::FieldPtr field(new Data::Field(QStringLiteral("lien-bel"), i18n("Bedetheque Link"), Data::Field::URL));
0339     field->setCategory(i18n("General"));
0340     coll->addField(field);
0341   }
0342 
0343   QRegExp tagRx(QLatin1String("<.*>"));
0344   tagRx.setMinimal(true);
0345 
0346   QRegExp yearRx(QLatin1String("\\d{4}"));
0347   // the negative lookahead with "no-border" is for multiple values
0348   QString pat = QStringLiteral("<label>%1.*</label>(.+)</li>(?!\\s*<li class=\"no-border)");
0349 
0350   Data::EntryPtr entry(new Data::Entry(coll));
0351 
0352   for(QHash<QString, QString>::Iterator it = fieldMap.begin(); it != fieldMap.end(); ++it) {
0353     QRegExp infoRx(pat.arg(it.key()));
0354     infoRx.setMinimal(true);
0355     if(infoRx.indexIn(str_) == -1) {
0356       continue;
0357     }
0358     if(it.value() == QLatin1String("pub_year")) {
0359       QString data = infoRx.cap(1).remove(tagRx).simplified();
0360       if(yearRx.indexIn(data) > -1) {
0361         entry->setField(it.value(), yearRx.cap(0));
0362       }
0363     } else if(it.value() == QLatin1String("writer") ||
0364               it.value() == QLatin1String("artist") ||
0365               it.value() == QLatin1String("publisher") ||
0366               it.value() == QLatin1String("colorist")) {
0367       // catch multiple people
0368       QString value = infoRx.cap(1);
0369       // split the values with the "no-border" CSS
0370       value.replace(QLatin1String("<li class=\"no-border\">"), FieldFormat::delimiterString());
0371       value = FieldFormat::fixupValue(value.remove(tagRx).simplified());
0372       entry->setField(it.value(), value);
0373     } else if(it.value() == QLatin1String("genre")) {
0374       // replace comma with semi-colons to effectively split string values
0375       QString value = infoRx.cap(1).remove(tagRx).simplified();
0376       value.replace(QLatin1String(", "), FieldFormat::delimiterString());
0377       entry->setField(it.value(), value);
0378     } else {
0379       entry->setField(it.value(), infoRx.cap(1).remove(tagRx).simplified());
0380     }
0381     // myDebug() << it.value() << entry->field(it.value());
0382   }
0383 
0384   QRegExp imgRx(QLatin1String("<img[^<]*src\\s*=\\s*\"([^\"]+)\"\\s+alt\\s*=\\s*\"Couverture"));
0385   imgRx.setMinimal(true);
0386   if(imgRx.indexIn(str_) > -1) {
0387     QUrl u(imgRx.cap(1));
0388     QString id = ImageFactory::addImage(u, true);
0389     if(!id.isEmpty()) {
0390       entry->setField(QStringLiteral("cover"), id);
0391     }
0392   }
0393 
0394   if(optionalFields().contains(QStringLiteral("comments"))) {
0395     QRegExp chronRx(QLatin1String("La chronique\\s*</li>\\s*<li[^>]*>(.*)</ul>"));
0396     chronRx.setMinimal(true);
0397     if(chronRx.indexIn(str_) > -1) {
0398       entry->setField(QStringLiteral("comments"), chronRx.cap(1).trimmed());
0399     }
0400   }
0401 
0402   if(optionalFields().contains(QStringLiteral("lien-bel"))) {
0403     QRegExp linkRx(QLatin1String("<link\\s+rel\\s*=\\s*\"canonical\"\\s+href\\s*=\\s*\"([^\"]+)\""));
0404     linkRx.setMinimal(true);
0405     if(linkRx.indexIn(str_) > -1) {
0406       entry->setField(QStringLiteral("lien-bel"), linkRx.cap(1));
0407     }
0408   }
0409 
0410   return entry;
0411 }
0412 
0413 Tellico::Fetch::FetchRequest BedethequeFetcher::updateRequest(Data::EntryPtr entry_) {
0414   QString l = entry_->field(QStringLiteral("lien-bel"));
0415   if(!l.isEmpty()) {
0416     return FetchRequest(Fetch::Raw, l);
0417   }
0418   QString i = entry_->field(QStringLiteral("isbn"));
0419   if(!i.isEmpty()) {
0420     return FetchRequest(Fetch::ISBN, i);
0421   }
0422   QString t = entry_->field(QStringLiteral("title"));
0423   if(!t.isEmpty()) {
0424     return FetchRequest(Fetch::Title, t);
0425   }
0426   return FetchRequest();
0427 }
0428 
0429 void BedethequeFetcher::fetchToken() {
0430   QRegExp tokenRx(QLatin1String("name\\s*=\\s*\"csrf_token_bedetheque\"\\s*value\\s*=\\s*\"([^\"]+)\""));
0431 
0432   const QUrl url(QStringLiteral("https://www.bedetheque.com/search/albums"));
0433   const QString text = FileHandler::readTextFile(url, true /*quiet*/);
0434   if(tokenRx.indexIn(text) > -1) {
0435     m_token = tokenRx.cap(1);
0436   }
0437 }
0438 
0439 Tellico::Fetch::ConfigWidget* BedethequeFetcher::configWidget(QWidget* parent_) const {
0440   return new BedethequeFetcher::ConfigWidget(parent_, this);
0441 }
0442 
0443 QString BedethequeFetcher::defaultName() {
0444   return QStringLiteral("Bedetheque");
0445 }
0446 
0447 QString BedethequeFetcher::defaultIcon() {
0448   return favIcon("http://www.bedetheque.com");
0449 }
0450 
0451 //static
0452 Tellico::StringHash BedethequeFetcher::allOptionalFields() {
0453   StringHash hash;
0454   hash[QStringLiteral("colorist")]     = i18n("Colorist");
0455   hash[QStringLiteral("comments")]     = i18n("Comments");
0456   hash[QStringLiteral("isbn")]         = i18n("ISBN#");
0457   // use the field name that the bedetheque.py script did, to maintain backwards compatibility
0458   hash[QStringLiteral("lien-bel")]     = i18n("Bedetheque Link");
0459   return hash;
0460 }
0461 
0462 BedethequeFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const BedethequeFetcher* fetcher_)
0463     : Fetch::ConfigWidget(parent_) {
0464   QVBoxLayout* l = new QVBoxLayout(optionsWidget());
0465   l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget()));
0466   l->addStretch();
0467 
0468   // now add additional fields widget
0469   addFieldsWidget(BedethequeFetcher::allOptionalFields(), fetcher_ ? fetcher_->optionalFields() : QStringList());
0470 }
0471 
0472 QString BedethequeFetcher::ConfigWidget::preferredName() const {
0473   return BedethequeFetcher::defaultName();
0474 }