// File indexing completed on 2024-05-12 05:09:35
0001 /*************************************************************************** 0002 Copyright (C) 2006-2009 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. 
* 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "ibsfetcher.h" 0026 #include "../utils/guiproxy.h" 0027 #include "../collections/bookcollection.h" 0028 #include "../entry.h" 0029 #include "../fieldformat.h" 0030 #include "../core/filehandler.h" 0031 #include "../images/imagefactory.h" 0032 #include "../utils/isbnvalidator.h" 0033 #include "../utils/string_utils.h" 0034 #include "../utils/mapvalue.h" 0035 #include "../tellico_debug.h" 0036 0037 #include <KLocalizedString> 0038 #include <KIO/Job> 0039 #include <KJobUiDelegate> 0040 #include <KJobWidgets/KJobWidgets> 0041 0042 #include <QRegularExpression> 0043 #include <QLabel> 0044 #include <QFile> 0045 #include <QTextStream> 0046 #include <QVBoxLayout> 0047 #include <QUrlQuery> 0048 #include <QJsonDocument> 0049 #include <QJsonObject> 0050 0051 namespace { 0052 static const char* IBS_BASE_URL = "https://www.ibs.it/search/"; 0053 } 0054 0055 using namespace Tellico; 0056 using Tellico::Fetch::IBSFetcher; 0057 0058 IBSFetcher::IBSFetcher(QObject* parent_) 0059 : Fetcher(parent_), m_total(0), m_started(false) { 0060 } 0061 0062 IBSFetcher::~IBSFetcher() { 0063 } 0064 0065 QString IBSFetcher::source() const { 0066 return m_name.isEmpty() ? defaultName() : m_name; 0067 } 0068 0069 bool IBSFetcher::canFetch(int type) const { 0070 return type == Data::Collection::Book || type == Data::Collection::Bibtex; 0071 } 0072 0073 // No UPC or Raw for now. 
0074 bool IBSFetcher::canSearch(Fetch::FetchKey k) const { 0075 return k == Title || k == Person || k == ISBN; 0076 } 0077 0078 void IBSFetcher::readConfigHook(const KConfigGroup& config_) { 0079 Q_UNUSED(config_); 0080 } 0081 0082 void IBSFetcher::search() { 0083 m_started = true; 0084 m_matches.clear(); 0085 0086 QUrl u(QString::fromLatin1(IBS_BASE_URL)); 0087 QUrlQuery q; 0088 q.addQueryItem(QStringLiteral("ts"), QStringLiteral("as")); 0089 q.addQueryItem(QStringLiteral("filterProduct_type"), QStringLiteral("ITBOOK")); 0090 0091 switch(request().key()) { 0092 case Title: 0093 { 0094 // can't have ampersands 0095 QString s = request().value(); 0096 s.remove(QLatin1Char('&')); 0097 q.addQueryItem(QStringLiteral("query"), s.simplified()); 0098 } 0099 break; 0100 0101 case ISBN: 0102 { 0103 QString s = request().value(); 0104 // limit to first isbn 0105 s = s.section(QLatin1Char(';'), 0, 0); 0106 // isbn13 search doesn't work? 0107 s = ISBNValidator::isbn13(s); 0108 // dashes don't work 0109 s.remove(QLatin1Char('-')); 0110 q.addQueryItem(QStringLiteral("query"), s); 0111 } 0112 break; 0113 0114 case Keyword: 0115 q.addQueryItem(QStringLiteral("query"), request().value()); 0116 break; 0117 0118 default: 0119 myWarning() << source() << "- key not recognized:" << request().key(); 0120 stop(); 0121 return; 0122 } 0123 u.setQuery(q); 0124 // myDebug() << "url: " << u.url(); 0125 0126 m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo); 0127 KJobWidgets::setWindow(m_job, GUI::Proxy::widget()); 0128 connect(m_job.data(), &KJob::result, this, &IBSFetcher::slotComplete); 0129 } 0130 0131 void IBSFetcher::stop() { 0132 if(!m_started) { 0133 return; 0134 } 0135 0136 if(m_job) { 0137 m_job->kill(); 0138 m_job = nullptr; 0139 } 0140 m_started = false; 0141 emit signalDone(this); 0142 } 0143 0144 void IBSFetcher::slotComplete(KJob*) { 0145 if(m_job->error()) { 0146 m_job->uiDelegate()->showErrorMessage(); 0147 stop(); 0148 return; 0149 } 0150 0151 QByteArray data = 
m_job->data(); 0152 if(data.isEmpty()) { 0153 myDebug() << "no data"; 0154 stop(); 0155 return; 0156 } 0157 0158 #if 0 0159 myWarning() << "Remove debug from ibsfetcher.cpp"; 0160 QFile f(QString::fromLatin1("/tmp/test-ibs.html")); 0161 if(f.open(QIODevice::WriteOnly)) { 0162 QTextStream t(&f); 0163 t.setCodec("UTF-8"); 0164 t << data; 0165 } 0166 f.close(); 0167 #endif 0168 0169 QString s = Tellico::decodeHTML(data); 0170 static const QRegularExpression itemRx(QLatin1String("<div class=\"cc-product-list-item.*?>(.+?)<!--"), 0171 QRegularExpression::DotMatchesEverythingOption); 0172 static const QRegularExpression titleRx(QStringLiteral("<a [^>]*href=\"(.+?)\"[^>]*?class=\"cc-title\">(.+?)</a"), 0173 QRegularExpression::DotMatchesEverythingOption); 0174 static const QRegularExpression yearRx(QLatin1String("<span class=\"cc-publisher\">.*?([12]\\d{3}).*?</"), 0175 QRegularExpression::DotMatchesEverythingOption); 0176 static const QRegularExpression tagRx(QLatin1String("<.*?>")); 0177 0178 QString url, title, year; 0179 auto matchIterator = itemRx.globalMatch(s); 0180 while(matchIterator.hasNext() && m_started) { 0181 auto itemMatch = matchIterator.next(); 0182 const QString s = itemMatch.captured(1); 0183 auto titleMatch = titleRx.match(s); 0184 if(titleMatch.hasMatch()) { 0185 url = titleMatch.captured(1); 0186 title = titleMatch.captured(2).remove(tagRx).simplified(); 0187 } 0188 auto yearMatch = yearRx.match(s); 0189 if(yearMatch.hasMatch()) { 0190 year = yearMatch.captured(1).remove(tagRx).simplified(); 0191 } 0192 if(!url.isEmpty() && !title.isEmpty()) { 0193 // the url probable contains & so be careful 0194 QUrl u = m_job->url(); 0195 u = u.resolved(QUrl(url.replace(QLatin1String("&"), QLatin1String("&")))); 0196 // myDebug() << u << title << year; 0197 FetchResult* r = new FetchResult(this, title, year); 0198 m_matches.insert(r->uid, u); 0199 emit signalResultFound(r); 0200 } 0201 } 0202 0203 // since the fetch is done, don't worry about holding the job 
pointer 0204 m_job = nullptr; 0205 stop(); 0206 } 0207 0208 Tellico::Data::EntryPtr IBSFetcher::fetchEntryHook(uint uid_) { 0209 // if we already grabbed this one, then just pull it out of the dict 0210 Data::EntryPtr entry = m_entries[uid_]; 0211 if(entry) { 0212 return entry; 0213 } 0214 0215 QUrl url = m_matches[uid_]; 0216 if(url.isEmpty()) { 0217 myWarning() << "no url in map"; 0218 return Data::EntryPtr(); 0219 } 0220 0221 QString results = Tellico::decodeHTML(FileHandler::readDataFile(url, true)); 0222 if(results.isEmpty()) { 0223 myDebug() << "no text results"; 0224 return Data::EntryPtr(); 0225 } 0226 0227 #if 0 0228 myDebug() << url.url(); 0229 myWarning() << "Remove debug2 from ibsfetcher.cpp"; 0230 QFile f(QLatin1String("/tmp/test-ibs2.html")); 0231 if(f.open(QIODevice::WriteOnly)) { 0232 QTextStream t(&f); 0233 t.setCodec("UTF-8"); 0234 t << results; 0235 } 0236 f.close(); 0237 #endif 0238 0239 entry = parseEntry(results); 0240 if(!entry) { 0241 myDebug() << "error in processing entry"; 0242 return Data::EntryPtr(); 0243 } 0244 m_entries.insert(uid_, entry); // keep for later 0245 return entry; 0246 } 0247 0248 Tellico::Data::EntryPtr IBSFetcher::parseEntry(const QString& str_) { 0249 static const QRegularExpression jsonRx(QLatin1String("<script type=\"application/ld\\+json\">(.*?)</script"), 0250 QRegularExpression::DotMatchesEverythingOption); 0251 0252 const auto jsonMatch = jsonRx.match(str_); 0253 if(!jsonMatch.hasMatch()) { 0254 myDebug() << "No JSON block"; 0255 return Data::EntryPtr(); 0256 } 0257 0258 #if 0 0259 myWarning() << "Remove json debug from ibsfetcher.cpp"; 0260 QFile f(QLatin1String("/tmp/test.json")); 0261 if(f.open(QIODevice::WriteOnly)) { 0262 QTextStream t(&f); 0263 t.setCodec("UTF-8"); 0264 t << jsonRx.cap(1); 0265 } 0266 f.close(); 0267 #endif 0268 QJsonDocument doc = QJsonDocument::fromJson(jsonMatch.capturedRef(1).toUtf8()); 0269 QVariantMap objectMap = doc.object().toVariantMap(); 0270 QVariantMap resultMap = 
objectMap.value(QStringLiteral("mainEntity")).toMap(); 0271 if(resultMap.isEmpty()) { 0272 myDebug() << "no JSON object"; 0273 return Data::EntryPtr(); 0274 } 0275 0276 Data::CollPtr coll(new Data::BookCollection(true)); 0277 Data::EntryPtr entry(new Data::Entry(coll)); 0278 0279 // as genre, take the last breadcrumb 0280 QString genre = mapValue(objectMap, "breadcrumb"); 0281 genre = genre.section(QStringLiteral(">"), -1); 0282 entry->setField(QStringLiteral("genre"), genre); 0283 0284 // the title in the embedded loses it's identifier? "La..." 0285 entry->setField(QStringLiteral("title"), mapValue(resultMap, "name")); 0286 entry->setField(QStringLiteral("author"), mapValue(resultMap, "author")); 0287 0288 const QString bookFormat = mapValue(resultMap, "bookFormat"); 0289 if(bookFormat == QLatin1String("https://schema.org/Paperback")) { 0290 entry->setField(QStringLiteral("binding"), i18n("Paperback")); 0291 } else if(bookFormat == QLatin1String("https://schema.org/Hardcover")) { 0292 entry->setField(QStringLiteral("binding"), i18n("Hardback")); 0293 } else if(bookFormat == QLatin1String("https://schema.org/EBook")) { 0294 entry->setField(QStringLiteral("binding"), i18n("E-Book")); 0295 } 0296 0297 entry->setField(QStringLiteral("pub_year"), mapValue(resultMap, "datePublished")); 0298 entry->setField(QStringLiteral("isbn"), mapValue(resultMap, "isbn")); 0299 0300 const QString id = ImageFactory::addImage(QUrl::fromUserInput(mapValue(resultMap, "image")), 0301 true /* quiet */); 0302 if(id.isEmpty()) { 0303 message(i18n("The cover image could not be loaded."), MessageHandler::Warning); 0304 } 0305 // empty image ID is ok 0306 entry->setField(QStringLiteral("cover"), id); 0307 0308 // inLanguage is upper-case language code 0309 const QString lang = mapValue(resultMap, "inLanguage"); 0310 entry->setField(QStringLiteral("language"), QLocale(lang.toLower()).nativeLanguageName()); 0311 0312 entry->setField(QStringLiteral("plot"), mapValue(resultMap, "description")); 
0313 entry->setField(QStringLiteral("pages"), mapValue(resultMap, "numberOfPages")); 0314 entry->setField(QStringLiteral("publisher"), mapValue(resultMap, "publisher")); 0315 0316 // multiple authors do not show up in the embedded JSON 0317 static const QRegularExpression titleDivRx(QLatin1String("<div id=\"title\">(.*?)</div>"), 0318 QRegularExpression::DotMatchesEverythingOption); 0319 const auto titleDivMatch = titleDivRx.match(str_); 0320 if(titleDivMatch.hasMatch()) { 0321 const QString titleDiv = titleDivMatch.captured(1); 0322 static const QRegularExpression authorRx(QLatin1String("<a href=\"/libri/autori/[^>]+?>(.*?)</a>"), 0323 QRegularExpression::DotMatchesEverythingOption); 0324 QStringList authors; 0325 auto i = authorRx.globalMatch(titleDiv); 0326 while(i.hasNext()) { 0327 const auto match = i.next(); 0328 authors << match.captured(1).simplified(); 0329 } 0330 if(!authors.isEmpty()) { 0331 entry->setField(QStringLiteral("author"), authors.join(FieldFormat::delimiterString())); 0332 } 0333 // the title in the embedded loses its identifier? "La..." 
0334 static const QRegularExpression labelRx(QLatin1String("<label>(.*?)</label>"), 0335 QRegularExpression::DotMatchesEverythingOption); 0336 const auto labelMatch = labelRx.match(titleDiv); 0337 if(labelMatch.hasMatch()) { 0338 entry->setField(QStringLiteral("title"), labelMatch.captured(1).simplified()); 0339 } 0340 } 0341 0342 static const QRegularExpression tagRx(QLatin1String("<.*?>")); 0343 0344 // editor is not in embedded json 0345 static const QRegularExpression editorRx(QLatin1String("Curatore:.*?>(.*?)</a"), 0346 QRegularExpression::DotMatchesEverythingOption); 0347 auto match = editorRx.match(str_); 0348 if(match.hasMatch()) { 0349 entry->setField(QStringLiteral("editor"), match.captured(1).remove(tagRx).simplified()); 0350 } 0351 0352 // translator is not in embedded json 0353 static const QRegularExpression translatorRx(QLatin1String("Traduttore:.*?>(.*?)</a"), 0354 QRegularExpression::DotMatchesEverythingOption); 0355 match = translatorRx.match(str_); 0356 if(match.hasMatch()) { 0357 entry->setField(QStringLiteral("translator"), match.captured(1).remove(tagRx).simplified()); 0358 } 0359 0360 // edition is not in embedded json 0361 static const QRegularExpression editionRx(QLatin1String("Editore:.*?>(.*?)</a"), 0362 QRegularExpression::DotMatchesEverythingOption); 0363 match = editionRx.match(str_); 0364 if(match.hasMatch()) { 0365 entry->setField(QStringLiteral("edition"), match.captured(1).remove(tagRx).simplified()); 0366 } 0367 0368 // series is not in embedded json 0369 static const QRegularExpression seriesRx(QLatin1String("Collana:.*?>(.*?)</a"), 0370 QRegularExpression::DotMatchesEverythingOption); 0371 match = seriesRx.match(str_); 0372 if(match.hasMatch()) { 0373 entry->setField(QStringLiteral("series"), match.captured(1).remove(tagRx).simplified()); 0374 } 0375 0376 return entry; 0377 } 0378 0379 Tellico::Fetch::FetchRequest IBSFetcher::updateRequest(Data::EntryPtr entry_) { 0380 QString isbn = entry_->field(QStringLiteral("isbn")); 0381 
if(!isbn.isEmpty()) { 0382 return FetchRequest(Fetch::ISBN, isbn); 0383 } 0384 QString t = entry_->field(QStringLiteral("title")); 0385 if(!t.isEmpty()) { 0386 return FetchRequest(Fetch::Title, t); 0387 } 0388 return FetchRequest(); 0389 } 0390 0391 Tellico::Fetch::ConfigWidget* IBSFetcher::configWidget(QWidget* parent_) const { 0392 return new IBSFetcher::ConfigWidget(parent_); 0393 } 0394 0395 QString IBSFetcher::defaultName() { 0396 return i18n("Internet Bookshop (ibs.it)"); 0397 } 0398 0399 QString IBSFetcher::defaultIcon() { 0400 return favIcon("http://www.ibs.it"); 0401 } 0402 0403 IBSFetcher::ConfigWidget::ConfigWidget(QWidget* parent_) 0404 : Fetch::ConfigWidget(parent_) { 0405 QVBoxLayout* l = new QVBoxLayout(optionsWidget()); 0406 l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget())); 0407 l->addStretch(); 0408 } 0409 0410 QString IBSFetcher::ConfigWidget::preferredName() const { 0411 return IBSFetcher::defaultName(); 0412 }