File indexing completed on 2024-05-12 05:09:35

0001 /***************************************************************************
0002     Copyright (C) 2006-2009 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "ibsfetcher.h"
0026 #include "../utils/guiproxy.h"
0027 #include "../collections/bookcollection.h"
0028 #include "../entry.h"
0029 #include "../fieldformat.h"
0030 #include "../core/filehandler.h"
0031 #include "../images/imagefactory.h"
0032 #include "../utils/isbnvalidator.h"
0033 #include "../utils/string_utils.h"
0034 #include "../utils/mapvalue.h"
0035 #include "../tellico_debug.h"
0036 
0037 #include <KLocalizedString>
0038 #include <KIO/Job>
0039 #include <KJobUiDelegate>
0040 #include <KJobWidgets/KJobWidgets>
0041 
0042 #include <QRegularExpression>
0043 #include <QLabel>
0044 #include <QFile>
0045 #include <QTextStream>
0046 #include <QVBoxLayout>
0047 #include <QUrlQuery>
0048 #include <QJsonDocument>
0049 #include <QJsonObject>
0050 
namespace {
  // Base address of the ibs.it search page; search() appends the query items to it.
  // The anonymous namespace already gives the constant internal linkage.
  const char* IBS_BASE_URL = "https://www.ibs.it/search/";
}
0054 
0055 using namespace Tellico;
0056 using Tellico::Fetch::IBSFetcher;
0057 
0058 IBSFetcher::IBSFetcher(QObject* parent_)
0059     : Fetcher(parent_), m_total(0), m_started(false) {
0060 }
0061 
0062 IBSFetcher::~IBSFetcher() {
0063 }
0064 
0065 QString IBSFetcher::source() const {
0066   return m_name.isEmpty() ? defaultName() : m_name;
0067 }
0068 
0069 bool IBSFetcher::canFetch(int type) const {
0070   return type == Data::Collection::Book || type == Data::Collection::Bibtex;
0071 }
0072 
0073 // No UPC or Raw for now.
0074 bool IBSFetcher::canSearch(Fetch::FetchKey k) const {
0075   return k == Title || k == Person || k == ISBN;
0076 }
0077 
0078 void IBSFetcher::readConfigHook(const KConfigGroup& config_) {
0079   Q_UNUSED(config_);
0080 }
0081 
0082 void IBSFetcher::search() {
0083   m_started = true;
0084   m_matches.clear();
0085 
0086   QUrl u(QString::fromLatin1(IBS_BASE_URL));
0087   QUrlQuery q;
0088   q.addQueryItem(QStringLiteral("ts"), QStringLiteral("as"));
0089   q.addQueryItem(QStringLiteral("filterProduct_type"), QStringLiteral("ITBOOK"));
0090 
0091   switch(request().key()) {
0092     case Title:
0093       {
0094         // can't have ampersands
0095         QString s = request().value();
0096         s.remove(QLatin1Char('&'));
0097         q.addQueryItem(QStringLiteral("query"), s.simplified());
0098       }
0099       break;
0100 
0101     case ISBN:
0102       {
0103         QString s = request().value();
0104         // limit to first isbn
0105         s = s.section(QLatin1Char(';'), 0, 0);
0106         // isbn13 search doesn't work?
0107         s = ISBNValidator::isbn13(s);
0108         // dashes don't work
0109         s.remove(QLatin1Char('-'));
0110         q.addQueryItem(QStringLiteral("query"), s);
0111       }
0112       break;
0113 
0114     case Keyword:
0115       q.addQueryItem(QStringLiteral("query"), request().value());
0116       break;
0117 
0118     default:
0119       myWarning() << source() << "- key not recognized:" << request().key();
0120       stop();
0121       return;
0122   }
0123   u.setQuery(q);
0124 //  myDebug() << "url: " << u.url();
0125 
0126   m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo);
0127   KJobWidgets::setWindow(m_job, GUI::Proxy::widget());
0128   connect(m_job.data(), &KJob::result, this, &IBSFetcher::slotComplete);
0129 }
0130 
0131 void IBSFetcher::stop() {
0132   if(!m_started) {
0133     return;
0134   }
0135 
0136   if(m_job) {
0137     m_job->kill();
0138     m_job = nullptr;
0139   }
0140   m_started = false;
0141   emit signalDone(this);
0142 }
0143 
0144 void IBSFetcher::slotComplete(KJob*) {
0145   if(m_job->error()) {
0146     m_job->uiDelegate()->showErrorMessage();
0147     stop();
0148     return;
0149   }
0150 
0151   QByteArray data = m_job->data();
0152   if(data.isEmpty()) {
0153     myDebug() << "no data";
0154     stop();
0155     return;
0156   }
0157 
0158 #if 0
0159   myWarning() << "Remove debug from ibsfetcher.cpp";
0160   QFile f(QString::fromLatin1("/tmp/test-ibs.html"));
0161   if(f.open(QIODevice::WriteOnly)) {
0162     QTextStream t(&f);
0163     t.setCodec("UTF-8");
0164     t << data;
0165   }
0166   f.close();
0167 #endif
0168 
0169   QString s = Tellico::decodeHTML(data);
0170   static const QRegularExpression itemRx(QLatin1String("<div class=\"cc-product-list-item.*?>(.+?)<!--"),
0171                                          QRegularExpression::DotMatchesEverythingOption);
0172   static const QRegularExpression titleRx(QStringLiteral("<a [^>]*href=\"(.+?)\"[^>]*?class=\"cc-title\">(.+?)</a"),
0173                                           QRegularExpression::DotMatchesEverythingOption);
0174   static const QRegularExpression yearRx(QLatin1String("<span class=\"cc-publisher\">.*?([12]\\d{3}).*?</"),
0175                                          QRegularExpression::DotMatchesEverythingOption);
0176   static const QRegularExpression tagRx(QLatin1String("<.*?>"));
0177 
0178   QString url, title, year;
0179   auto matchIterator = itemRx.globalMatch(s);
0180   while(matchIterator.hasNext() && m_started) {
0181     auto itemMatch = matchIterator.next();
0182     const QString s = itemMatch.captured(1);
0183     auto titleMatch = titleRx.match(s);
0184     if(titleMatch.hasMatch()) {
0185       url = titleMatch.captured(1);
0186       title = titleMatch.captured(2).remove(tagRx).simplified();
0187     }
0188     auto yearMatch = yearRx.match(s);
0189     if(yearMatch.hasMatch()) {
0190       year = yearMatch.captured(1).remove(tagRx).simplified();
0191     }
0192     if(!url.isEmpty() && !title.isEmpty()) {
0193       // the url probable contains &amp; so be careful
0194       QUrl u = m_job->url();
0195       u = u.resolved(QUrl(url.replace(QLatin1String("&amp;"), QLatin1String("&"))));
0196 //      myDebug() << u << title << year;
0197       FetchResult* r = new FetchResult(this, title, year);
0198       m_matches.insert(r->uid, u);
0199       emit signalResultFound(r);
0200     }
0201   }
0202 
0203   // since the fetch is done, don't worry about holding the job pointer
0204   m_job = nullptr;
0205   stop();
0206 }
0207 
0208 Tellico::Data::EntryPtr IBSFetcher::fetchEntryHook(uint uid_) {
0209   // if we already grabbed this one, then just pull it out of the dict
0210   Data::EntryPtr entry = m_entries[uid_];
0211   if(entry) {
0212     return entry;
0213   }
0214 
0215   QUrl url = m_matches[uid_];
0216   if(url.isEmpty()) {
0217     myWarning() << "no url in map";
0218     return Data::EntryPtr();
0219   }
0220 
0221   QString results = Tellico::decodeHTML(FileHandler::readDataFile(url, true));
0222   if(results.isEmpty()) {
0223     myDebug() << "no text results";
0224     return Data::EntryPtr();
0225   }
0226 
0227 #if 0
0228   myDebug() << url.url();
0229   myWarning() << "Remove debug2 from ibsfetcher.cpp";
0230   QFile f(QLatin1String("/tmp/test-ibs2.html"));
0231   if(f.open(QIODevice::WriteOnly)) {
0232     QTextStream t(&f);
0233     t.setCodec("UTF-8");
0234     t << results;
0235   }
0236   f.close();
0237 #endif
0238 
0239   entry = parseEntry(results);
0240   if(!entry) {
0241     myDebug() << "error in processing entry";
0242     return Data::EntryPtr();
0243   }
0244   m_entries.insert(uid_, entry); // keep for later
0245   return entry;
0246 }
0247 
0248 Tellico::Data::EntryPtr IBSFetcher::parseEntry(const QString& str_) {
0249   static const QRegularExpression jsonRx(QLatin1String("<script type=\"application/ld\\+json\">(.*?)</script"),
0250                                          QRegularExpression::DotMatchesEverythingOption);
0251 
0252   const auto jsonMatch = jsonRx.match(str_);
0253   if(!jsonMatch.hasMatch()) {
0254     myDebug() << "No JSON block";
0255     return Data::EntryPtr();
0256   }
0257 
0258 #if 0
0259   myWarning() << "Remove json debug from ibsfetcher.cpp";
0260   QFile f(QLatin1String("/tmp/test.json"));
0261   if(f.open(QIODevice::WriteOnly)) {
0262     QTextStream t(&f);
0263     t.setCodec("UTF-8");
0264     t << jsonRx.cap(1);
0265   }
0266   f.close();
0267 #endif
0268   QJsonDocument doc = QJsonDocument::fromJson(jsonMatch.capturedRef(1).toUtf8());
0269   QVariantMap objectMap = doc.object().toVariantMap();
0270   QVariantMap resultMap = objectMap.value(QStringLiteral("mainEntity")).toMap();
0271   if(resultMap.isEmpty()) {
0272     myDebug() << "no JSON object";
0273     return Data::EntryPtr();
0274   }
0275 
0276   Data::CollPtr coll(new Data::BookCollection(true));
0277   Data::EntryPtr entry(new Data::Entry(coll));
0278 
0279   // as genre, take the last breadcrumb
0280   QString genre = mapValue(objectMap, "breadcrumb");
0281   genre = genre.section(QStringLiteral(">"), -1);
0282   entry->setField(QStringLiteral("genre"), genre);
0283 
0284   // the title in the embedded loses it's identifier? "La..."
0285   entry->setField(QStringLiteral("title"), mapValue(resultMap, "name"));
0286   entry->setField(QStringLiteral("author"), mapValue(resultMap, "author"));
0287 
0288   const QString bookFormat = mapValue(resultMap, "bookFormat");
0289   if(bookFormat == QLatin1String("https://schema.org/Paperback")) {
0290     entry->setField(QStringLiteral("binding"), i18n("Paperback"));
0291   } else if(bookFormat == QLatin1String("https://schema.org/Hardcover")) {
0292     entry->setField(QStringLiteral("binding"), i18n("Hardback"));
0293   } else if(bookFormat == QLatin1String("https://schema.org/EBook")) {
0294     entry->setField(QStringLiteral("binding"), i18n("E-Book"));
0295   }
0296 
0297   entry->setField(QStringLiteral("pub_year"), mapValue(resultMap, "datePublished"));
0298   entry->setField(QStringLiteral("isbn"), mapValue(resultMap, "isbn"));
0299 
0300   const QString id = ImageFactory::addImage(QUrl::fromUserInput(mapValue(resultMap, "image")),
0301                                             true /* quiet */);
0302   if(id.isEmpty()) {
0303     message(i18n("The cover image could not be loaded."), MessageHandler::Warning);
0304   }
0305   // empty image ID is ok
0306   entry->setField(QStringLiteral("cover"), id);
0307 
0308   // inLanguage is upper-case language code
0309   const QString lang = mapValue(resultMap, "inLanguage");
0310   entry->setField(QStringLiteral("language"), QLocale(lang.toLower()).nativeLanguageName());
0311 
0312   entry->setField(QStringLiteral("plot"), mapValue(resultMap, "description"));
0313   entry->setField(QStringLiteral("pages"), mapValue(resultMap, "numberOfPages"));
0314   entry->setField(QStringLiteral("publisher"), mapValue(resultMap, "publisher"));
0315 
0316   // multiple authors do not show up in the embedded JSON
0317   static const QRegularExpression titleDivRx(QLatin1String("<div id=\"title\">(.*?)</div>"),
0318                                              QRegularExpression::DotMatchesEverythingOption);
0319   const auto titleDivMatch = titleDivRx.match(str_);
0320   if(titleDivMatch.hasMatch()) {
0321     const QString titleDiv = titleDivMatch.captured(1);
0322     static const QRegularExpression authorRx(QLatin1String("<a href=\"/libri/autori/[^>]+?>(.*?)</a>"),
0323                                              QRegularExpression::DotMatchesEverythingOption);
0324     QStringList authors;
0325     auto i = authorRx.globalMatch(titleDiv);
0326     while(i.hasNext()) {
0327       const auto match = i.next();
0328       authors << match.captured(1).simplified();
0329     }
0330     if(!authors.isEmpty()) {
0331       entry->setField(QStringLiteral("author"), authors.join(FieldFormat::delimiterString()));
0332     }
0333     // the title in the embedded loses its identifier? "La..."
0334     static const QRegularExpression labelRx(QLatin1String("<label>(.*?)</label>"),
0335                                             QRegularExpression::DotMatchesEverythingOption);
0336     const auto labelMatch = labelRx.match(titleDiv);
0337     if(labelMatch.hasMatch()) {
0338       entry->setField(QStringLiteral("title"), labelMatch.captured(1).simplified());
0339     }
0340   }
0341 
0342   static const QRegularExpression tagRx(QLatin1String("<.*?>"));
0343 
0344   // editor is not in embedded json
0345   static const QRegularExpression editorRx(QLatin1String("Curatore:.*?>(.*?)</a"),
0346                                            QRegularExpression::DotMatchesEverythingOption);
0347   auto match = editorRx.match(str_);
0348   if(match.hasMatch()) {
0349     entry->setField(QStringLiteral("editor"), match.captured(1).remove(tagRx).simplified());
0350   }
0351 
0352   // translator is not in embedded json
0353   static const QRegularExpression translatorRx(QLatin1String("Traduttore:.*?>(.*?)</a"),
0354                                                QRegularExpression::DotMatchesEverythingOption);
0355   match = translatorRx.match(str_);
0356   if(match.hasMatch()) {
0357     entry->setField(QStringLiteral("translator"), match.captured(1).remove(tagRx).simplified());
0358   }
0359 
0360   // edition is not in embedded json
0361   static const QRegularExpression editionRx(QLatin1String("Editore:.*?>(.*?)</a"),
0362                                             QRegularExpression::DotMatchesEverythingOption);
0363   match = editionRx.match(str_);
0364   if(match.hasMatch()) {
0365     entry->setField(QStringLiteral("edition"), match.captured(1).remove(tagRx).simplified());
0366   }
0367 
0368   // series is not in embedded json
0369   static const QRegularExpression seriesRx(QLatin1String("Collana:.*?>(.*?)</a"),
0370                                            QRegularExpression::DotMatchesEverythingOption);
0371   match = seriesRx.match(str_);
0372   if(match.hasMatch()) {
0373     entry->setField(QStringLiteral("series"), match.captured(1).remove(tagRx).simplified());
0374   }
0375 
0376   return entry;
0377 }
0378 
0379 Tellico::Fetch::FetchRequest IBSFetcher::updateRequest(Data::EntryPtr entry_) {
0380   QString isbn = entry_->field(QStringLiteral("isbn"));
0381   if(!isbn.isEmpty()) {
0382     return FetchRequest(Fetch::ISBN, isbn);
0383   }
0384   QString t = entry_->field(QStringLiteral("title"));
0385   if(!t.isEmpty()) {
0386     return FetchRequest(Fetch::Title, t);
0387   }
0388   return FetchRequest();
0389 }
0390 
0391 Tellico::Fetch::ConfigWidget* IBSFetcher::configWidget(QWidget* parent_) const {
0392   return new IBSFetcher::ConfigWidget(parent_);
0393 }
0394 
0395 QString IBSFetcher::defaultName() {
0396   return i18n("Internet Bookshop (ibs.it)");
0397 }
0398 
0399 QString IBSFetcher::defaultIcon() {
0400   return favIcon("http://www.ibs.it");
0401 }
0402 
0403 IBSFetcher::ConfigWidget::ConfigWidget(QWidget* parent_)
0404     : Fetch::ConfigWidget(parent_) {
0405   QVBoxLayout* l = new QVBoxLayout(optionsWidget());
0406   l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget()));
0407   l->addStretch();
0408 }
0409 
0410 QString IBSFetcher::ConfigWidget::preferredName() const {
0411   return IBSFetcher::defaultName();
0412 }