// File indexing completed on 2024-05-19 16:18:38
0001 /*************************************************************************** 0002 Copyright (C) 2016 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. 
* 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "bedethequefetcher.h" 0026 #include "../utils/guiproxy.h" 0027 #include "../utils/string_utils.h" 0028 #include "../utils/isbnvalidator.h" 0029 #include "../collections/comicbookcollection.h" 0030 #include "../entry.h" 0031 #include "../fieldformat.h" 0032 #include "../core/filehandler.h" 0033 #include "../images/imagefactory.h" 0034 #include "../tellico_debug.h" 0035 0036 #include <KLocalizedString> 0037 #include <KIO/Job> 0038 #include <KJobUiDelegate> 0039 #include <KJobWidgets/KJobWidgets> 0040 0041 #include <QRegExp> 0042 #include <QLabel> 0043 #include <QFile> 0044 #include <QTextStream> 0045 #include <QVBoxLayout> 0046 #include <QUrlQuery> 0047 0048 namespace { 0049 static const char* BD_BASE_URL = "https://m.bedetheque.com/album"; 0050 } 0051 0052 using namespace Tellico; 0053 using Tellico::Fetch::BedethequeFetcher; 0054 0055 BedethequeFetcher::BedethequeFetcher(QObject* parent_) 0056 : Fetcher(parent_), m_total(0), m_started(false) { 0057 } 0058 0059 BedethequeFetcher::~BedethequeFetcher() { 0060 } 0061 0062 QString BedethequeFetcher::source() const { 0063 return m_name.isEmpty() ? defaultName() : m_name; 0064 } 0065 0066 Fetch::Type BedethequeFetcher::type() const { 0067 return Bedetheque; 0068 } 0069 0070 bool BedethequeFetcher::canFetch(int type) const { 0071 return type == Data::Collection::ComicBook; 0072 } 0073 0074 // No UPC or Raw for now. 
0075 bool BedethequeFetcher::canSearch(Fetch::FetchKey k) const { 0076 return k == Title || k == Keyword || k == ISBN; 0077 } 0078 0079 void BedethequeFetcher::readConfigHook(const KConfigGroup& config_) { 0080 Q_UNUSED(config_); 0081 } 0082 0083 void BedethequeFetcher::search() { 0084 m_started = true; 0085 m_matches.clear(); 0086 0087 // special case for updates which include the BD link as Raw request 0088 if(request().key() == Raw) { 0089 QUrl u(request().value()); 0090 u.setHost(QStringLiteral("m.bedetheque.com")); // use mobile site for easier parsing 0091 m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo); 0092 m_job->addMetaData(QStringLiteral("referrer"), QString::fromLatin1(BD_BASE_URL)); 0093 KJobWidgets::setWindow(m_job, GUI::Proxy::widget()); 0094 // different slot here 0095 connect(m_job.data(), &KJob::result, this, &BedethequeFetcher::slotLinkComplete); 0096 return; 0097 } 0098 0099 QUrl u(QString::fromLatin1(BD_BASE_URL)); 0100 0101 /* 0102 fetchToken(); 0103 if(m_token.isEmpty()) { 0104 myDebug() << "empty token"; 0105 stop(); 0106 return; 0107 } 0108 */ 0109 0110 QUrlQuery q; 0111 switch(request().key()) { 0112 case Title: 0113 q.addQueryItem(QStringLiteral("RechTitre"), request().value()); 0114 break; 0115 0116 case Keyword: 0117 q.addQueryItem(QStringLiteral("RechSerie"), request().value()); 0118 break; 0119 0120 case ISBN: 0121 q.addQueryItem(QStringLiteral("RechISBN"), ISBNValidator::cleanValue(request().value())); 0122 break; 0123 0124 default: 0125 myWarning() << "key not recognized: " << request().key(); 0126 stop(); 0127 return; 0128 } 0129 // q.addQueryItem(QLatin1String("csrf_token_bedetheque"), m_token); 0130 u.setQuery(q); 0131 // myDebug() << "url: " << u.url(); 0132 0133 m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo); 0134 m_job->addMetaData(QStringLiteral("referrer"), QString::fromLatin1(BD_BASE_URL)); 0135 KJobWidgets::setWindow(m_job, GUI::Proxy::widget()); 0136 connect(m_job.data(), &KJob::result, 
this, &BedethequeFetcher::slotComplete); 0137 } 0138 0139 void BedethequeFetcher::stop() { 0140 if(!m_started) { 0141 return; 0142 } 0143 0144 if(m_job) { 0145 m_job->kill(); 0146 m_job = nullptr; 0147 } 0148 m_started = false; 0149 emit signalDone(this); 0150 } 0151 0152 void BedethequeFetcher::slotComplete(KJob*) { 0153 if(m_job->error()) { 0154 m_job->uiDelegate()->showErrorMessage(); 0155 stop(); 0156 return; 0157 } 0158 0159 QByteArray data = m_job->data(); 0160 if(data.isEmpty()) { 0161 myDebug() << "no data"; 0162 stop(); 0163 return; 0164 } 0165 0166 // since the fetch is done, don't worry about holding the job pointer 0167 m_job = nullptr; 0168 0169 QString output = Tellico::decodeHTML(data); 0170 #if 0 0171 myWarning() << "Remove debug from bedethequefetcher.cpp"; 0172 QFile f(QString::fromLatin1("/tmp/testbd.html")); 0173 if(f.open(QIODevice::WriteOnly)) { 0174 QTextStream t(&f); 0175 t << output; 0176 } 0177 f.close(); 0178 #endif 0179 0180 const int pos_list = output.indexOf(QLatin1String("<li data-role=\"list-divider\" role=\"heading\">"), 0, Qt::CaseInsensitive); 0181 if(pos_list == -1) { 0182 myDebug() << "No results found"; 0183 stop(); 0184 return; 0185 } 0186 const int pos_end = output.indexOf(QLatin1String("</ul>"), pos_list+1, Qt::CaseInsensitive); 0187 output = output.mid(pos_list, pos_end-pos_list); 0188 0189 QString pat = QStringLiteral("https://m.bedetheque.com/BD"); 0190 QRegExp anchorRx(QLatin1String("<a\\s+[^>]*href\\s*=\\s*[\"'](") + 0191 QRegExp::escape(pat) + 0192 QLatin1String("[^\"']*)\"[^>]*>(.*)</a"), Qt::CaseInsensitive); 0193 anchorRx.setMinimal(true); 0194 0195 QRegExp spanRx(QLatin1String("\\sclass\\s*=\\s*\"(.*)\">(.*)<")); 0196 spanRx.setMinimal(true); 0197 0198 for(int pos = anchorRx.indexIn(output); m_started && pos > -1; pos = anchorRx.indexIn(output, pos+anchorRx.matchedLength())) { 0199 QString url = anchorRx.cap(1); 0200 if(url.isEmpty()) { 0201 continue; 0202 } 0203 0204 const QString result = anchorRx.cap(2); 0205 
if(result.isEmpty()) { 0206 continue; 0207 } 0208 0209 QString title; 0210 QStringList desc; 0211 for(int pos2 = spanRx.indexIn(result); pos2 > -1; pos2 = spanRx.indexIn(result, pos2+spanRx.matchedLength())) { 0212 QString cname = spanRx.cap(1); 0213 QString value = spanRx.cap(2); 0214 if(cname == QLatin1String("serie")) { 0215 desc += value; 0216 } else if(cname == QLatin1String("titre")) { 0217 title = value; 0218 } else if(cname == QLatin1String("dl")) { 0219 desc += value; 0220 } 0221 } 0222 0223 if(!title.isEmpty() && !url.isEmpty()) { 0224 FetchResult* r = new FetchResult(this, title, desc.join(QLatin1String(" "))); 0225 m_matches.insert(r->uid, QUrl(url)); 0226 emit signalResultFound(r); 0227 } 0228 } 0229 0230 stop(); 0231 } 0232 0233 // slot called after downloading the exact link 0234 void BedethequeFetcher::slotLinkComplete(KJob*) { 0235 if(m_job->error()) { 0236 m_job->uiDelegate()->showErrorMessage(); 0237 stop(); 0238 return; 0239 } 0240 QByteArray data = m_job->data(); 0241 if(data.isEmpty()) { 0242 myDebug() << "no data"; 0243 stop(); 0244 return; 0245 } 0246 0247 // since the fetch is done, don't worry about holding the job pointer 0248 m_job = nullptr; 0249 0250 QString output = Tellico::decodeHTML(data); 0251 Data::EntryPtr entry = parseEntry(output); 0252 if(!entry) { 0253 myDebug() << "error in processing entry"; 0254 stop(); 0255 return; 0256 } 0257 0258 FetchResult* r = new FetchResult(this, entry); 0259 m_matches.insert(r->uid, QUrl(request().value())); 0260 m_entries.insert(r->uid, entry); // keep for later 0261 0262 emit signalResultFound(r); 0263 stop(); 0264 } 0265 0266 Tellico::Data::EntryPtr BedethequeFetcher::fetchEntryHook(uint uid_) { 0267 // if we already grabbed this one, then just pull it out of the dict 0268 Data::EntryPtr entry = m_entries[uid_]; 0269 if(entry) { 0270 return entry; 0271 } 0272 0273 QUrl url = m_matches[uid_]; 0274 if(url.isEmpty()) { 0275 myWarning() << "no url in map"; 0276 return Data::EntryPtr(); 0277 } 0278 
0279 QString results = Tellico::decodeHTML(FileHandler::readDataFile(url, true)); 0280 if(results.isEmpty()) { 0281 myDebug() << "no text results"; 0282 return Data::EntryPtr(); 0283 } 0284 0285 // myDebug() << url.url(); 0286 #if 0 0287 myWarning() << "Remove debug from bedethequefetcher.cpp"; 0288 QFile f(QLatin1String("/tmp/testbditem.html")); 0289 if(f.open(QIODevice::WriteOnly)) { 0290 QTextStream t(&f); 0291 t.setCodec("UTF-8"); 0292 t << results; 0293 } 0294 f.close(); 0295 #endif 0296 0297 entry = parseEntry(results); 0298 if(!entry) { 0299 myDebug() << "error in processing entry"; 0300 return Data::EntryPtr(); 0301 } 0302 m_entries.insert(uid_, entry); // keep for later 0303 return entry; 0304 } 0305 0306 Tellico::Data::EntryPtr BedethequeFetcher::parseEntry(const QString& str_) { 0307 Data::CollPtr coll(new Data::ComicBookCollection(true)); 0308 0309 // map captions in HTML to field names 0310 QHash<QString, QString> fieldMap; 0311 fieldMap.insert(QStringLiteral("Série"), QStringLiteral("series")); 0312 fieldMap.insert(QStringLiteral("Titre"), QStringLiteral("title")); 0313 fieldMap.insert(QStringLiteral("Origine"), QStringLiteral("country")); 0314 // fieldMap.insert(QLatin1String("Format"), QLatin1String("binding")); 0315 fieldMap.insert(QStringLiteral("Scénario"), QStringLiteral("writer")); 0316 fieldMap.insert(QStringLiteral("Dessin"), QStringLiteral("artist")); 0317 fieldMap.insert(QStringLiteral("Dépot légal"), QStringLiteral("pub_year")); 0318 fieldMap.insert(QStringLiteral("Editeur"), QStringLiteral("publisher")); 0319 fieldMap.insert(QStringLiteral("Planches"), QStringLiteral("pages")); 0320 fieldMap.insert(QStringLiteral("Style"), QStringLiteral("genre")); 0321 fieldMap.insert(QStringLiteral("Tome"), QStringLiteral("issue")); 0322 fieldMap.insert(QStringLiteral("Collection"), QStringLiteral("edition")); 0323 0324 if(optionalFields().contains(QStringLiteral("isbn"))) { 0325 Data::FieldPtr field = 
Data::Field::createDefaultField(Data::Field::IsbnField); 0326 coll->addField(field); 0327 fieldMap.insert(QStringLiteral("ISBN"), field->name()); 0328 } 0329 if(optionalFields().contains(QStringLiteral("colorist"))) { 0330 Data::FieldPtr field(new Data::Field(QStringLiteral("colorist"), i18n("Colorist"))); 0331 field->setCategory(i18n("General")); 0332 field->setFlags(Data::Field::AllowCompletion | Data::Field::AllowMultiple | Data::Field::AllowGrouped); 0333 field->setFormatType(FieldFormat::FormatName); 0334 coll->addField(field); 0335 fieldMap.insert(QStringLiteral("Couleurs"), QStringLiteral("colorist")); 0336 } 0337 if(optionalFields().contains(QStringLiteral("lien-bel"))) { 0338 Data::FieldPtr field(new Data::Field(QStringLiteral("lien-bel"), i18n("Bedetheque Link"), Data::Field::URL)); 0339 field->setCategory(i18n("General")); 0340 coll->addField(field); 0341 } 0342 0343 QRegExp tagRx(QLatin1String("<.*>")); 0344 tagRx.setMinimal(true); 0345 0346 QRegExp yearRx(QLatin1String("\\d{4}")); 0347 // the negative lookahead with "no-border" is for multiple values 0348 QString pat = QStringLiteral("<label>%1.*</label>(.+)</li>(?!\\s*<li class=\"no-border)"); 0349 0350 Data::EntryPtr entry(new Data::Entry(coll)); 0351 0352 for(QHash<QString, QString>::Iterator it = fieldMap.begin(); it != fieldMap.end(); ++it) { 0353 QRegExp infoRx(pat.arg(it.key())); 0354 infoRx.setMinimal(true); 0355 if(infoRx.indexIn(str_) == -1) { 0356 continue; 0357 } 0358 if(it.value() == QLatin1String("pub_year")) { 0359 QString data = infoRx.cap(1).remove(tagRx).simplified(); 0360 if(yearRx.indexIn(data) > -1) { 0361 entry->setField(it.value(), yearRx.cap(0)); 0362 } 0363 } else if(it.value() == QLatin1String("writer") || 0364 it.value() == QLatin1String("artist") || 0365 it.value() == QLatin1String("publisher") || 0366 it.value() == QLatin1String("colorist")) { 0367 // catch multiple people 0368 QString value = infoRx.cap(1); 0369 // split the values with the "no-border" CSS 0370 
value.replace(QLatin1String("<li class=\"no-border\">"), FieldFormat::delimiterString()); 0371 value = FieldFormat::fixupValue(value.remove(tagRx).simplified()); 0372 entry->setField(it.value(), value); 0373 } else if(it.value() == QLatin1String("genre")) { 0374 // replace comma with semi-colons to effectively split string values 0375 QString value = infoRx.cap(1).remove(tagRx).simplified(); 0376 value.replace(QLatin1String(", "), FieldFormat::delimiterString()); 0377 entry->setField(it.value(), value); 0378 } else { 0379 entry->setField(it.value(), infoRx.cap(1).remove(tagRx).simplified()); 0380 } 0381 // myDebug() << it.value() << entry->field(it.value()); 0382 } 0383 0384 QRegExp imgRx(QLatin1String("<img[^<]*src\\s*=\\s*\"([^\"]+)\"\\s+alt\\s*=\\s*\"Couverture")); 0385 imgRx.setMinimal(true); 0386 if(imgRx.indexIn(str_) > -1) { 0387 QUrl u(imgRx.cap(1)); 0388 QString id = ImageFactory::addImage(u, true); 0389 if(!id.isEmpty()) { 0390 entry->setField(QStringLiteral("cover"), id); 0391 } 0392 } 0393 0394 if(optionalFields().contains(QStringLiteral("comments"))) { 0395 QRegExp chronRx(QLatin1String("La chronique\\s*</li>\\s*<li[^>]*>(.*)</ul>")); 0396 chronRx.setMinimal(true); 0397 if(chronRx.indexIn(str_) > -1) { 0398 entry->setField(QStringLiteral("comments"), chronRx.cap(1).trimmed()); 0399 } 0400 } 0401 0402 if(optionalFields().contains(QStringLiteral("lien-bel"))) { 0403 QRegExp linkRx(QLatin1String("<link\\s+rel\\s*=\\s*\"canonical\"\\s+href\\s*=\\s*\"([^\"]+)\"")); 0404 linkRx.setMinimal(true); 0405 if(linkRx.indexIn(str_) > -1) { 0406 entry->setField(QStringLiteral("lien-bel"), linkRx.cap(1)); 0407 } 0408 } 0409 0410 return entry; 0411 } 0412 0413 Tellico::Fetch::FetchRequest BedethequeFetcher::updateRequest(Data::EntryPtr entry_) { 0414 QString l = entry_->field(QStringLiteral("lien-bel")); 0415 if(!l.isEmpty()) { 0416 return FetchRequest(Fetch::Raw, l); 0417 } 0418 QString i = entry_->field(QStringLiteral("isbn")); 0419 if(!i.isEmpty()) { 0420 return 
FetchRequest(Fetch::ISBN, i); 0421 } 0422 QString t = entry_->field(QStringLiteral("title")); 0423 if(!t.isEmpty()) { 0424 return FetchRequest(Fetch::Title, t); 0425 } 0426 return FetchRequest(); 0427 } 0428 0429 void BedethequeFetcher::fetchToken() { 0430 QRegExp tokenRx(QLatin1String("name\\s*=\\s*\"csrf_token_bedetheque\"\\s*value\\s*=\\s*\"([^\"]+)\"")); 0431 0432 const QUrl url(QStringLiteral("https://www.bedetheque.com/search/albums")); 0433 const QString text = FileHandler::readTextFile(url, true /*quiet*/); 0434 if(tokenRx.indexIn(text) > -1) { 0435 m_token = tokenRx.cap(1); 0436 } 0437 } 0438 0439 Tellico::Fetch::ConfigWidget* BedethequeFetcher::configWidget(QWidget* parent_) const { 0440 return new BedethequeFetcher::ConfigWidget(parent_, this); 0441 } 0442 0443 QString BedethequeFetcher::defaultName() { 0444 return QStringLiteral("Bedetheque"); 0445 } 0446 0447 QString BedethequeFetcher::defaultIcon() { 0448 return favIcon("http://www.bedetheque.com"); 0449 } 0450 0451 //static 0452 Tellico::StringHash BedethequeFetcher::allOptionalFields() { 0453 StringHash hash; 0454 hash[QStringLiteral("colorist")] = i18n("Colorist"); 0455 hash[QStringLiteral("comments")] = i18n("Comments"); 0456 hash[QStringLiteral("isbn")] = i18n("ISBN#"); 0457 // use the field name that the bedetheque.py script did, to maintain backwards compatibility 0458 hash[QStringLiteral("lien-bel")] = i18n("Bedetheque Link"); 0459 return hash; 0460 } 0461 0462 BedethequeFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const BedethequeFetcher* fetcher_) 0463 : Fetch::ConfigWidget(parent_) { 0464 QVBoxLayout* l = new QVBoxLayout(optionsWidget()); 0465 l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget())); 0466 l->addStretch(); 0467 0468 // now add additional fields widget 0469 addFieldsWidget(BedethequeFetcher::allOptionalFields(), fetcher_ ? 
fetcher_->optionalFields() : QStringList()); 0470 } 0471 0472 QString BedethequeFetcher::ConfigWidget::preferredName() const { 0473 return BedethequeFetcher::defaultName(); 0474 }