File indexing completed on 2024-05-12 16:45:51

0001 /***************************************************************************
0002     Copyright (C) 2006-2009 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "ibsfetcher.h"
0026 #include "../utils/guiproxy.h"
0027 #include "../utils/string_utils.h"
0028 #include "../collections/bookcollection.h"
0029 #include "../entry.h"
0030 #include "../fieldformat.h"
0031 #include "../core/filehandler.h"
0032 #include "../images/imagefactory.h"
0033 #include "../utils/isbnvalidator.h"
0034 #include "../tellico_debug.h"
0035 
0036 #include <KLocalizedString>
0037 #include <KIO/Job>
0038 #include <KJobUiDelegate>
0039 #include <KJobWidgets/KJobWidgets>
0040 
0041 #include <QRegExp>
0042 #include <QLabel>
0043 #include <QFile>
0044 #include <QTextStream>
0045 #include <QVBoxLayout>
0046 #include <QUrlQuery>
0047 #include <QJsonDocument>
0048 #include <QJsonObject>
0049 
namespace {
  // Address of the ibs.it search page; search() appends the query items to it.
  // The anonymous namespace already gives this internal linkage.
  const char* const IBS_BASE_URL = "https://www.ibs.it/search/";
}
0053 
0054 using namespace Tellico;
0055 using Tellico::Fetch::IBSFetcher;
0056 
0057 IBSFetcher::IBSFetcher(QObject* parent_)
0058     : Fetcher(parent_), m_total(0), m_started(false) {
0059 }
0060 
0061 IBSFetcher::~IBSFetcher() {
0062 }
0063 
0064 QString IBSFetcher::source() const {
0065   return m_name.isEmpty() ? defaultName() : m_name;
0066 }
0067 
0068 bool IBSFetcher::canFetch(int type) const {
0069   return type == Data::Collection::Book || type == Data::Collection::Bibtex;
0070 }
0071 
0072 // No UPC or Raw for now.
0073 bool IBSFetcher::canSearch(Fetch::FetchKey k) const {
0074   return k == Title || k == Person || k == ISBN;
0075 }
0076 
0077 void IBSFetcher::readConfigHook(const KConfigGroup& config_) {
0078   Q_UNUSED(config_);
0079 }
0080 
0081 void IBSFetcher::search() {
0082   m_started = true;
0083   m_matches.clear();
0084 
0085   QUrl u(QString::fromLatin1(IBS_BASE_URL));
0086   QUrlQuery q;
0087   q.addQueryItem(QStringLiteral("ts"), QStringLiteral("as"));
0088   q.addQueryItem(QStringLiteral("filterProduct_type"), QStringLiteral("ITBOOK"));
0089 
0090   switch(request().key()) {
0091     case Title:
0092       {
0093         // can't have ampersands
0094         QString s = request().value();
0095         s.remove(QLatin1Char('&'));
0096         q.addQueryItem(QStringLiteral("query"), s.simplified());
0097       }
0098       break;
0099 
0100     case ISBN:
0101       {
0102         QString s = request().value();
0103         // limit to first isbn
0104         s = s.section(QLatin1Char(';'), 0, 0);
0105         // isbn13 search doesn't work?
0106         s = ISBNValidator::isbn13(s);
0107         // dashes don't work
0108         s.remove(QLatin1Char('-'));
0109         q.addQueryItem(QStringLiteral("query"), s);
0110       }
0111       break;
0112 
0113     case Keyword:
0114       q.addQueryItem(QStringLiteral("query"), request().value());
0115       break;
0116 
0117     default:
0118       myWarning() << "key not recognized: " << request().key();
0119       stop();
0120       return;
0121   }
0122   u.setQuery(q);
0123 //  myDebug() << "url: " << u.url();
0124 
0125   m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo);
0126   KJobWidgets::setWindow(m_job, GUI::Proxy::widget());
0127   connect(m_job.data(), &KJob::result, this, &IBSFetcher::slotComplete);
0128 }
0129 
0130 void IBSFetcher::stop() {
0131   if(!m_started) {
0132     return;
0133   }
0134 
0135   if(m_job) {
0136     m_job->kill();
0137     m_job = nullptr;
0138   }
0139   m_started = false;
0140   emit signalDone(this);
0141 }
0142 
0143 void IBSFetcher::slotComplete(KJob*) {
0144   if(m_job->error()) {
0145     m_job->uiDelegate()->showErrorMessage();
0146     stop();
0147     return;
0148   }
0149 
0150   QByteArray data = m_job->data();
0151   if(data.isEmpty()) {
0152     myDebug() << "no data";
0153     stop();
0154     return;
0155   }
0156 
0157 #if 0
0158   myWarning() << "Remove debug from ibsfetcher.cpp";
0159   QFile f(QString::fromLatin1("/tmp/test-ibs.html"));
0160   if(f.open(QIODevice::WriteOnly)) {
0161     QTextStream t(&f);
0162     t.setCodec("UTF-8");
0163     t << data;
0164   }
0165   f.close();
0166 #endif
0167 
0168   QString s = Tellico::decodeHTML(data);
0169   QRegularExpression itemRx(QLatin1String("<div class=\"cc-product-list-item.*?>(.+?)<!--"),
0170                             QRegularExpression::DotMatchesEverythingOption);
0171   QRegularExpression titleRx(QStringLiteral("<div class=\"cc-content-title\">\\s*<a [^>]*href=\"(.+?)\"[^>]*?>(.+?)</a"),
0172                              QRegularExpression::DotMatchesEverythingOption);
0173   QRegularExpression yearRx(QLatin1String("<span class=\"cc-owner\">.*?([12]\\d{3}).*?</"),
0174                             QRegularExpression::DotMatchesEverythingOption);
0175   QRegularExpression tagRx(QLatin1String("<.*?>"));
0176 
0177   QString url, title, year;
0178   auto matchIterator = itemRx.globalMatch(s);
0179   while(matchIterator.hasNext() && m_started) {
0180     auto itemMatch = matchIterator.next();
0181     const QString s = itemMatch.captured(1);
0182     auto titleMatch = titleRx.match(s);
0183     if(titleMatch.hasMatch()) {
0184       url = titleMatch.captured(1);
0185       title = titleMatch.captured(2).remove(tagRx).simplified();
0186     }
0187     auto yearMatch = yearRx.match(s);
0188     if(yearMatch.hasMatch()) {
0189       year = yearMatch.captured(1).remove(tagRx).simplified();
0190     }
0191     if(!url.isEmpty() && !title.isEmpty()) {
0192       // the url probable contains &amp; so be careful
0193       QUrl u = m_job->url();
0194       u = u.resolved(QUrl(url.replace(QLatin1String("&amp;"), QLatin1String("&"))));
0195 //      myDebug() << u << title << year;
0196       FetchResult* r = new FetchResult(this, title, year);
0197       m_matches.insert(r->uid, u);
0198       emit signalResultFound(r);
0199     }
0200   }
0201 
0202   // since the fetch is done, don't worry about holding the job pointer
0203   m_job = nullptr;
0204   stop();
0205 }
0206 
0207 Tellico::Data::EntryPtr IBSFetcher::fetchEntryHook(uint uid_) {
0208   // if we already grabbed this one, then just pull it out of the dict
0209   Data::EntryPtr entry = m_entries[uid_];
0210   if(entry) {
0211     return entry;
0212   }
0213 
0214   QUrl url = m_matches[uid_];
0215   if(url.isEmpty()) {
0216     myWarning() << "no url in map";
0217     return Data::EntryPtr();
0218   }
0219 
0220   QString results = Tellico::decodeHTML(FileHandler::readDataFile(url, true));
0221   if(results.isEmpty()) {
0222     myDebug() << "no text results";
0223     return Data::EntryPtr();
0224   }
0225 
0226 #if 0
0227   myDebug() << url.url();
0228   myWarning() << "Remove debug2 from ibsfetcher.cpp";
0229   QFile f(QLatin1String("/tmp/test-ibs2.html"));
0230   if(f.open(QIODevice::WriteOnly)) {
0231     QTextStream t(&f);
0232     t.setCodec("UTF-8");
0233     t << results;
0234   }
0235   f.close();
0236 #endif
0237 
0238   entry = parseEntry(results);
0239   if(!entry) {
0240     myDebug() << "error in processing entry";
0241     return Data::EntryPtr();
0242   }
0243   m_entries.insert(uid_, entry); // keep for later
0244   return entry;
0245 }
0246 
0247 Tellico::Data::EntryPtr IBSFetcher::parseEntry(const QString& str_) {
0248   QRegExp jsonRx(QLatin1String("<script type=\"application/ld\\+json\">(.*)</script"));
0249   jsonRx.setMinimal(true);
0250 
0251   if(!str_.contains(jsonRx)) {
0252     myDebug() << "No JSON block";
0253     return Data::EntryPtr();
0254   }
0255 
0256 #if 0
0257   myWarning() << "Remove json debug from ibsfetcher.cpp";
0258   QFile f(QLatin1String("/tmp/test.json"));
0259   if(f.open(QIODevice::WriteOnly)) {
0260     QTextStream t(&f);
0261     t.setCodec("UTF-8");
0262     t << jsonRx.cap(1);
0263   }
0264   f.close();
0265 #endif
0266   QJsonDocument doc = QJsonDocument::fromJson(jsonRx.cap(1).toUtf8());
0267   QVariantMap objectMap = doc.object().toVariantMap();
0268   QVariantMap resultMap = objectMap.value(QStringLiteral("mainEntity")).toMap();
0269   if(resultMap.isEmpty()) {
0270     myDebug() << "no JSON object";
0271     return Data::EntryPtr();
0272   }
0273 
0274   Data::CollPtr coll(new Data::BookCollection(true));
0275   Data::EntryPtr entry(new Data::Entry(coll));
0276 
0277   // as genre, take the last breadcrumb
0278   QString genre = mapValue(objectMap, "breadcrumb");
0279   genre = genre.section(QStringLiteral(">"), -1);
0280   entry->setField(QStringLiteral("genre"), genre);
0281 
0282   // the title in the embedded loses it's identifier? "La..."
0283   entry->setField(QStringLiteral("title"), mapValue(resultMap, "name"));
0284   entry->setField(QStringLiteral("author"), mapValue(resultMap, "author"));
0285 
0286   const QString bookFormat = mapValue(resultMap, "bookFormat");
0287   if(bookFormat == QLatin1String("https://schema.org/Paperback")) {
0288     entry->setField(QStringLiteral("binding"), i18n("Paperback"));
0289   } else if(bookFormat == QLatin1String("https://schema.org/Hardcover")) {
0290     entry->setField(QStringLiteral("binding"), i18n("Hardback"));
0291   } else if(bookFormat == QLatin1String("https://schema.org/EBook")) {
0292     entry->setField(QStringLiteral("binding"), i18n("E-Book"));
0293   }
0294 
0295   entry->setField(QStringLiteral("pub_year"), mapValue(resultMap, "datePublished"));
0296   entry->setField(QStringLiteral("isbn"), mapValue(resultMap, "isbn"));
0297 
0298   const QString id = ImageFactory::addImage(QUrl::fromUserInput(mapValue(resultMap, "image")),
0299                                             true /* quiet */);
0300   if(id.isEmpty()) {
0301     message(i18n("The cover image could not be loaded."), MessageHandler::Warning);
0302   }
0303   // empty image ID is ok
0304   entry->setField(QStringLiteral("cover"), id);
0305 
0306   // inLanguage is upper-case language code
0307   const QString lang = mapValue(resultMap, "inLanguage");
0308   entry->setField(QStringLiteral("language"), QLocale(lang.toLower()).nativeLanguageName());
0309 
0310   entry->setField(QStringLiteral("plot"), mapValue(resultMap, "description"));
0311   entry->setField(QStringLiteral("pages"), mapValue(resultMap, "numberOfPages"));
0312   entry->setField(QStringLiteral("publisher"), mapValue(resultMap, "publisher"));
0313 
0314   // multiple authors do not show up in the embedded JSON
0315   QRegExp titleDivRx(QLatin1String("<div id=\"title\">(.*)</div>"));
0316   titleDivRx.setMinimal(true);
0317   if(str_.contains(titleDivRx)) {
0318     const QString titleDiv = titleDivRx.cap(1);
0319     QRegExp authorRx(QLatin1String("<a href=\"/libri/autori/[^>]+>(.*)</a>"));
0320     authorRx.setMinimal(true);
0321     QStringList authors;
0322     for(int pos = authorRx.indexIn(titleDiv); pos > -1; pos = authorRx.indexIn(titleDiv, pos+authorRx.matchedLength())) {
0323       authors << authorRx.cap(1).simplified();
0324     }
0325     if(!authors.isEmpty()) {
0326       entry->setField(QStringLiteral("author"), authors.join(FieldFormat::delimiterString()));
0327     }
0328     // the title in the embedded loses its identifier? "La..."
0329     QRegExp labelRx(QLatin1String("<label>(.*)</label>"));
0330     if(titleDiv.contains(labelRx)) {
0331       entry->setField(QStringLiteral("title"), labelRx.cap(1).simplified());
0332     }
0333   }
0334 
0335   QRegExp tagRx(QLatin1String("<.*>"));
0336   tagRx.setMinimal(true);
0337 
0338   // editor is not in embedded json
0339   QRegExp editorRx(QLatin1String("Curatore:.*>(.*)</a"));
0340   editorRx.setMinimal(true);
0341   if(str_.contains(editorRx)) {
0342     entry->setField(QStringLiteral("editor"), editorRx.cap(1).remove(tagRx).simplified());
0343   }
0344 
0345   // translator is not in embedded json
0346   QRegExp translatorRx(QLatin1String("Traduttore:.*>(.*)</a"));
0347   translatorRx.setMinimal(true);
0348   if(str_.contains(translatorRx)) {
0349     entry->setField(QStringLiteral("translator"), translatorRx.cap(1).remove(tagRx).simplified());
0350   }
0351 
0352   // edition is not in embedded json
0353   QRegExp editionRx(QLatin1String("Editore:.*>(.*)</a"));
0354   editionRx.setMinimal(true);
0355   if(str_.contains(editionRx)) {
0356     entry->setField(QStringLiteral("edition"), editionRx.cap(1).remove(tagRx).simplified());
0357   }
0358 
0359   // series is not in embedded json
0360   QRegExp seriesRx(QLatin1String("Collana:.*>(.*)</a"));
0361   seriesRx.setMinimal(true);
0362   if(str_.contains(seriesRx)) {
0363     entry->setField(QStringLiteral("series"), seriesRx.cap(1).remove(tagRx).simplified());
0364   }
0365 
0366   return entry;
0367 }
0368 
0369 Tellico::Fetch::FetchRequest IBSFetcher::updateRequest(Data::EntryPtr entry_) {
0370   QString isbn = entry_->field(QStringLiteral("isbn"));
0371   if(!isbn.isEmpty()) {
0372     return FetchRequest(Fetch::ISBN, isbn);
0373   }
0374   QString t = entry_->field(QStringLiteral("title"));
0375   if(!t.isEmpty()) {
0376     return FetchRequest(Fetch::Title, t);
0377   }
0378   return FetchRequest();
0379 }
0380 
0381 Tellico::Fetch::ConfigWidget* IBSFetcher::configWidget(QWidget* parent_) const {
0382   return new IBSFetcher::ConfigWidget(parent_);
0383 }
0384 
0385 QString IBSFetcher::defaultName() {
0386   return i18n("Internet Bookshop (ibs.it)");
0387 }
0388 
0389 QString IBSFetcher::defaultIcon() {
0390   return favIcon("http://www.ibs.it");
0391 }
0392 
0393 IBSFetcher::ConfigWidget::ConfigWidget(QWidget* parent_)
0394     : Fetch::ConfigWidget(parent_) {
0395   QVBoxLayout* l = new QVBoxLayout(optionsWidget());
0396   l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget()));
0397   l->addStretch();
0398 }
0399 
0400 QString IBSFetcher::ConfigWidget::preferredName() const {
0401   return IBSFetcher::defaultName();
0402 }