File indexing completed on 2024-05-12 16:45:51
0001 /*************************************************************************** 0002 Copyright (C) 2006-2009 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "ibsfetcher.h" 0026 #include "../utils/guiproxy.h" 0027 #include "../utils/string_utils.h" 0028 #include "../collections/bookcollection.h" 0029 #include "../entry.h" 0030 #include "../fieldformat.h" 0031 #include "../core/filehandler.h" 0032 #include "../images/imagefactory.h" 0033 #include "../utils/isbnvalidator.h" 0034 #include "../tellico_debug.h" 0035 0036 #include <KLocalizedString> 0037 #include <KIO/Job> 0038 #include <KJobUiDelegate> 0039 #include <KJobWidgets/KJobWidgets> 0040 0041 #include <QRegExp> 0042 #include <QLabel> 0043 #include <QFile> 0044 #include <QTextStream> 0045 #include <QVBoxLayout> 0046 #include <QUrlQuery> 0047 #include <QJsonDocument> 0048 #include <QJsonObject> 0049 0050 namespace { 0051 static const char* IBS_BASE_URL = "https://www.ibs.it/search/"; 0052 } 0053 0054 using namespace Tellico; 0055 using Tellico::Fetch::IBSFetcher; 0056 0057 IBSFetcher::IBSFetcher(QObject* parent_) 0058 : Fetcher(parent_), m_total(0), m_started(false) { 0059 } 0060 0061 IBSFetcher::~IBSFetcher() { 0062 } 0063 0064 QString IBSFetcher::source() const { 0065 return m_name.isEmpty() ? defaultName() : m_name; 0066 } 0067 0068 bool IBSFetcher::canFetch(int type) const { 0069 return type == Data::Collection::Book || type == Data::Collection::Bibtex; 0070 } 0071 0072 // No UPC or Raw for now. 0073 bool IBSFetcher::canSearch(Fetch::FetchKey k) const { 0074 return k == Title || k == Person || k == ISBN; 0075 } 0076 0077 void IBSFetcher::readConfigHook(const KConfigGroup& config_) { 0078 Q_UNUSED(config_); 0079 } 0080 0081 void IBSFetcher::search() { 0082 m_started = true; 0083 m_matches.clear(); 0084 0085 QUrl u(QString::fromLatin1(IBS_BASE_URL)); 0086 QUrlQuery q; 0087 q.addQueryItem(QStringLiteral("ts"), QStringLiteral("as")); 0088 q.addQueryItem(QStringLiteral("filterProduct_type"), QStringLiteral("ITBOOK")); 0089 0090 switch(request().key()) { 0091 case Title: 0092 { 0093 // can't have ampersands 0094 QString s = request().value(); 0095 s.remove(QLatin1Char('&')); 0096 q.addQueryItem(QStringLiteral("query"), s.simplified()); 0097 } 0098 break; 0099 0100 case ISBN: 0101 { 0102 QString s = request().value(); 0103 // limit to first isbn 0104 s = s.section(QLatin1Char(';'), 0, 0); 0105 // isbn13 search doesn't work? 0106 s = ISBNValidator::isbn13(s); 0107 // dashes don't work 0108 s.remove(QLatin1Char('-')); 0109 q.addQueryItem(QStringLiteral("query"), s); 0110 } 0111 break; 0112 0113 case Keyword: 0114 q.addQueryItem(QStringLiteral("query"), request().value()); 0115 break; 0116 0117 default: 0118 myWarning() << "key not recognized: " << request().key(); 0119 stop(); 0120 return; 0121 } 0122 u.setQuery(q); 0123 // myDebug() << "url: " << u.url(); 0124 0125 m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo); 0126 KJobWidgets::setWindow(m_job, GUI::Proxy::widget()); 0127 connect(m_job.data(), &KJob::result, this, &IBSFetcher::slotComplete); 0128 } 0129 0130 void IBSFetcher::stop() { 0131 if(!m_started) { 0132 return; 0133 } 0134 0135 if(m_job) { 0136 m_job->kill(); 0137 m_job = nullptr; 0138 } 0139 m_started = false; 0140 emit signalDone(this); 0141 } 0142 0143 void IBSFetcher::slotComplete(KJob*) { 0144 if(m_job->error()) { 0145 m_job->uiDelegate()->showErrorMessage(); 0146 stop(); 0147 return; 0148 } 0149 0150 QByteArray data = m_job->data(); 0151 if(data.isEmpty()) { 0152 myDebug() << "no data"; 0153 stop(); 0154 return; 0155 } 0156 0157 #if 0 0158 myWarning() << "Remove debug from ibsfetcher.cpp"; 0159 QFile f(QString::fromLatin1("/tmp/test-ibs.html")); 0160 if(f.open(QIODevice::WriteOnly)) { 0161 QTextStream t(&f); 0162 t.setCodec("UTF-8"); 0163 t << data; 0164 } 0165 f.close(); 0166 #endif 0167 0168 QString s = Tellico::decodeHTML(data); 0169 QRegularExpression itemRx(QLatin1String("<div class=\"cc-product-list-item.*?>(.+?)<!--"), 0170 QRegularExpression::DotMatchesEverythingOption); 0171 QRegularExpression titleRx(QStringLiteral("<div class=\"cc-content-title\">\\s*<a [^>]*href=\"(.+?)\"[^>]*?>(.+?)</a"), 0172 QRegularExpression::DotMatchesEverythingOption); 0173 QRegularExpression yearRx(QLatin1String("<span class=\"cc-owner\">.*?([12]\\d{3}).*?</"), 0174 QRegularExpression::DotMatchesEverythingOption); 0175 QRegularExpression tagRx(QLatin1String("<.*?>")); 0176 0177 QString url, title, year; 0178 auto matchIterator = itemRx.globalMatch(s); 0179 while(matchIterator.hasNext() && m_started) { 0180 auto itemMatch = matchIterator.next(); 0181 const QString s = itemMatch.captured(1); 0182 auto titleMatch = titleRx.match(s); 0183 if(titleMatch.hasMatch()) { 0184 url = titleMatch.captured(1); 0185 title = titleMatch.captured(2).remove(tagRx).simplified(); 0186 } 0187 auto yearMatch = yearRx.match(s); 0188 if(yearMatch.hasMatch()) { 0189 year = yearMatch.captured(1).remove(tagRx).simplified(); 0190 } 0191 if(!url.isEmpty() && !title.isEmpty()) { 0192 // the url probable contains & so be careful 0193 QUrl u = m_job->url(); 0194 u = u.resolved(QUrl(url.replace(QLatin1String("&"), QLatin1String("&")))); 0195 // myDebug() << u << title << year; 0196 FetchResult* r = new FetchResult(this, title, year); 0197 m_matches.insert(r->uid, u); 0198 emit signalResultFound(r); 0199 } 0200 } 0201 0202 // since the fetch is done, don't worry about holding the job pointer 0203 m_job = nullptr; 0204 stop(); 0205 } 0206 0207 Tellico::Data::EntryPtr IBSFetcher::fetchEntryHook(uint uid_) { 0208 // if we already grabbed this one, then just pull it out of the dict 0209 Data::EntryPtr entry = m_entries[uid_]; 0210 if(entry) { 0211 return entry; 0212 } 0213 0214 QUrl url = m_matches[uid_]; 0215 if(url.isEmpty()) { 0216 myWarning() << "no url in map"; 0217 return Data::EntryPtr(); 0218 } 0219 0220 QString results = Tellico::decodeHTML(FileHandler::readDataFile(url, true)); 0221 if(results.isEmpty()) { 0222 myDebug() << "no text results"; 0223 return Data::EntryPtr(); 0224 } 0225 0226 #if 0 0227 myDebug() << url.url(); 0228 myWarning() << "Remove debug2 from ibsfetcher.cpp"; 0229 QFile f(QLatin1String("/tmp/test-ibs2.html")); 0230 if(f.open(QIODevice::WriteOnly)) { 0231 QTextStream t(&f); 0232 t.setCodec("UTF-8"); 0233 t << results; 0234 } 0235 f.close(); 0236 #endif 0237 0238 entry = parseEntry(results); 0239 if(!entry) { 0240 myDebug() << "error in processing entry"; 0241 return Data::EntryPtr(); 0242 } 0243 m_entries.insert(uid_, entry); // keep for later 0244 return entry; 0245 } 0246 0247 Tellico::Data::EntryPtr IBSFetcher::parseEntry(const QString& str_) { 0248 QRegExp jsonRx(QLatin1String("<script type=\"application/ld\\+json\">(.*)</script")); 0249 jsonRx.setMinimal(true); 0250 0251 if(!str_.contains(jsonRx)) { 0252 myDebug() << "No JSON block"; 0253 return Data::EntryPtr(); 0254 } 0255 0256 #if 0 0257 myWarning() << "Remove json debug from ibsfetcher.cpp"; 0258 QFile f(QLatin1String("/tmp/test.json")); 0259 if(f.open(QIODevice::WriteOnly)) { 0260 QTextStream t(&f); 0261 t.setCodec("UTF-8"); 0262 t << jsonRx.cap(1); 0263 } 0264 f.close(); 0265 #endif 0266 QJsonDocument doc = QJsonDocument::fromJson(jsonRx.cap(1).toUtf8()); 0267 QVariantMap objectMap = doc.object().toVariantMap(); 0268 QVariantMap resultMap = objectMap.value(QStringLiteral("mainEntity")).toMap(); 0269 if(resultMap.isEmpty()) { 0270 myDebug() << "no JSON object"; 0271 return Data::EntryPtr(); 0272 } 0273 0274 Data::CollPtr coll(new Data::BookCollection(true)); 0275 Data::EntryPtr entry(new Data::Entry(coll)); 0276 0277 // as genre, take the last breadcrumb 0278 QString genre = mapValue(objectMap, "breadcrumb"); 0279 genre = genre.section(QStringLiteral(">"), -1); 0280 entry->setField(QStringLiteral("genre"), genre); 0281 0282 // the title in the embedded loses it's identifier? "La..." 0283 entry->setField(QStringLiteral("title"), mapValue(resultMap, "name")); 0284 entry->setField(QStringLiteral("author"), mapValue(resultMap, "author")); 0285 0286 const QString bookFormat = mapValue(resultMap, "bookFormat"); 0287 if(bookFormat == QLatin1String("https://schema.org/Paperback")) { 0288 entry->setField(QStringLiteral("binding"), i18n("Paperback")); 0289 } else if(bookFormat == QLatin1String("https://schema.org/Hardcover")) { 0290 entry->setField(QStringLiteral("binding"), i18n("Hardback")); 0291 } else if(bookFormat == QLatin1String("https://schema.org/EBook")) { 0292 entry->setField(QStringLiteral("binding"), i18n("E-Book")); 0293 } 0294 0295 entry->setField(QStringLiteral("pub_year"), mapValue(resultMap, "datePublished")); 0296 entry->setField(QStringLiteral("isbn"), mapValue(resultMap, "isbn")); 0297 0298 const QString id = ImageFactory::addImage(QUrl::fromUserInput(mapValue(resultMap, "image")), 0299 true /* quiet */); 0300 if(id.isEmpty()) { 0301 message(i18n("The cover image could not be loaded."), MessageHandler::Warning); 0302 } 0303 // empty image ID is ok 0304 entry->setField(QStringLiteral("cover"), id); 0305 0306 // inLanguage is upper-case language code 0307 const QString lang = mapValue(resultMap, "inLanguage"); 0308 entry->setField(QStringLiteral("language"), QLocale(lang.toLower()).nativeLanguageName()); 0309 0310 entry->setField(QStringLiteral("plot"), mapValue(resultMap, "description")); 0311 entry->setField(QStringLiteral("pages"), mapValue(resultMap, "numberOfPages")); 0312 entry->setField(QStringLiteral("publisher"), mapValue(resultMap, "publisher")); 0313 0314 // multiple authors do not show up in the embedded JSON 0315 QRegExp titleDivRx(QLatin1String("<div id=\"title\">(.*)</div>")); 0316 titleDivRx.setMinimal(true); 0317 if(str_.contains(titleDivRx)) { 0318 const QString titleDiv = titleDivRx.cap(1); 0319 QRegExp authorRx(QLatin1String("<a href=\"/libri/autori/[^>]+>(.*)</a>")); 0320 authorRx.setMinimal(true); 0321 QStringList authors; 0322 for(int pos = authorRx.indexIn(titleDiv); pos > -1; pos = authorRx.indexIn(titleDiv, pos+authorRx.matchedLength())) { 0323 authors << authorRx.cap(1).simplified(); 0324 } 0325 if(!authors.isEmpty()) { 0326 entry->setField(QStringLiteral("author"), authors.join(FieldFormat::delimiterString())); 0327 } 0328 // the title in the embedded loses its identifier? "La..." 0329 QRegExp labelRx(QLatin1String("<label>(.*)</label>")); 0330 if(titleDiv.contains(labelRx)) { 0331 entry->setField(QStringLiteral("title"), labelRx.cap(1).simplified()); 0332 } 0333 } 0334 0335 QRegExp tagRx(QLatin1String("<.*>")); 0336 tagRx.setMinimal(true); 0337 0338 // editor is not in embedded json 0339 QRegExp editorRx(QLatin1String("Curatore:.*>(.*)</a")); 0340 editorRx.setMinimal(true); 0341 if(str_.contains(editorRx)) { 0342 entry->setField(QStringLiteral("editor"), editorRx.cap(1).remove(tagRx).simplified()); 0343 } 0344 0345 // translator is not in embedded json 0346 QRegExp translatorRx(QLatin1String("Traduttore:.*>(.*)</a")); 0347 translatorRx.setMinimal(true); 0348 if(str_.contains(translatorRx)) { 0349 entry->setField(QStringLiteral("translator"), translatorRx.cap(1).remove(tagRx).simplified()); 0350 } 0351 0352 // edition is not in embedded json 0353 QRegExp editionRx(QLatin1String("Editore:.*>(.*)</a")); 0354 editionRx.setMinimal(true); 0355 if(str_.contains(editionRx)) { 0356 entry->setField(QStringLiteral("edition"), editionRx.cap(1).remove(tagRx).simplified()); 0357 } 0358 0359 // series is not in embedded json 0360 QRegExp seriesRx(QLatin1String("Collana:.*>(.*)</a")); 0361 seriesRx.setMinimal(true); 0362 if(str_.contains(seriesRx)) { 0363 entry->setField(QStringLiteral("series"), seriesRx.cap(1).remove(tagRx).simplified()); 0364 } 0365 0366 return entry; 0367 } 0368 0369 Tellico::Fetch::FetchRequest IBSFetcher::updateRequest(Data::EntryPtr entry_) { 0370 QString isbn = entry_->field(QStringLiteral("isbn")); 0371 if(!isbn.isEmpty()) { 0372 return FetchRequest(Fetch::ISBN, isbn); 0373 } 0374 QString t = entry_->field(QStringLiteral("title")); 0375 if(!t.isEmpty()) { 0376 return FetchRequest(Fetch::Title, t); 0377 } 0378 return FetchRequest(); 0379 } 0380 0381 Tellico::Fetch::ConfigWidget* IBSFetcher::configWidget(QWidget* parent_) const { 0382 return new IBSFetcher::ConfigWidget(parent_); 0383 } 0384 0385 QString IBSFetcher::defaultName() { 0386 return i18n("Internet Bookshop (ibs.it)"); 0387 } 0388 0389 QString IBSFetcher::defaultIcon() { 0390 return favIcon("http://www.ibs.it"); 0391 } 0392 0393 IBSFetcher::ConfigWidget::ConfigWidget(QWidget* parent_) 0394 : Fetch::ConfigWidget(parent_) { 0395 QVBoxLayout* l = new QVBoxLayout(optionsWidget()); 0396 l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget())); 0397 l->addStretch(); 0398 } 0399 0400 QString IBSFetcher::ConfigWidget::preferredName() const { 0401 return IBSFetcher::defaultName(); 0402 }