File indexing completed on 2024-05-12 05:09:34

0001 /***************************************************************************
0002     Copyright (C) 2008-2009 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "googlescholarfetcher.h"
0026 #include "../core/filehandler.h"
0027 #include "../translators/bibteximporter.h"
0028 #include "../collections/bibtexcollection.h"
0029 #include "../entry.h"
0030 #include "../utils/guiproxy.h"
0031 #include "../tellico_debug.h"
0032 
0033 #include <KLocalizedString>
0034 #include <KConfigGroup>
0035 #include <KIO/Job>
0036 #include <KIO/JobUiDelegate>
0037 #include <KJobWidgets/KJobWidgets>
0038 
0039 #include <QLabel>
0040 #include <QVBoxLayout>
0041 #include <QFile>
0042 #include <QTextCodec>
0043 #include <QUrlQuery>
0044 
namespace {
  // Initial value of the fetcher's m_limit, which caps how many results a
  // single search reports.
  constexpr int GOOGLE_MAX_RETURNS_TOTAL = 20;
  // Endpoint that receives the search query.
  constexpr const char* SCHOLAR_BASE_URL = "https://scholar.google.com/scholar";
  // Settings page that is read to collect its hidden form inputs.
  constexpr const char* SCHOLAR_SET_CONFIG_URL = "https://scholar.google.com/scholar_settings?sciifh=1&hl=en&as_sdt=0,47";
  // Preference-submission URL; the hidden form inputs are appended to it so
  // that the BibTeX output preference gets stored in a cookie.
  constexpr const char* SCHOLAR_SET_BIBTEX_URL = "https://scholar.google.com/scholar_setprefs?hl=en&num=100&scis=yes&scisf=4&submit=";
}
0051 
0052 using namespace Tellico;
0053 using Tellico::Fetch::GoogleScholarFetcher;
0054 
0055 GoogleScholarFetcher::GoogleScholarFetcher(QObject* parent_)
0056     : Fetcher(parent_),
0057       m_limit(GOOGLE_MAX_RETURNS_TOTAL), m_start(0), m_total(0), m_job(nullptr), m_started(false)
0058     , m_cookieIsSet(false) {
0059 }
0060 
0061 GoogleScholarFetcher::~GoogleScholarFetcher() {
0062 }
0063 
0064 QString GoogleScholarFetcher::source() const {
0065   return m_name.isEmpty() ? defaultName() : m_name;
0066 }
0067 
0068 bool GoogleScholarFetcher::canSearch(Fetch::FetchKey k) const {
0069   return k == Title || k == Keyword;
0070 }
0071 
0072 bool GoogleScholarFetcher::canFetch(int type) const {
0073   return type == Data::Collection::Bibtex;
0074 }
0075 
0076 void GoogleScholarFetcher::readConfigHook(const KConfigGroup& config_) {
0077   Q_UNUSED(config_);
0078 }
0079 
0080 void GoogleScholarFetcher::search() {
0081   m_started = true;
0082   m_start = 0;
0083   m_total = -1;
0084   doSearch();
0085 }
0086 
0087 void GoogleScholarFetcher::continueSearch() {
0088   m_started = true;
0089   doSearch();
0090 }
0091 
0092 void GoogleScholarFetcher::doSearch() {
0093   QUrl u(QString::fromLatin1(SCHOLAR_BASE_URL));
0094   QUrlQuery q;
0095   q.addQueryItem(QStringLiteral("start"), QString::number(m_start));
0096 
0097   QString value = request().value();
0098   if(!value.startsWith(QLatin1Char('"'))) {
0099     value = QLatin1Char('"') + value;
0100   }
0101   if(!value.endsWith(QLatin1Char('"'))) {
0102     value += QLatin1Char('"');
0103   }
0104   switch(request().key()) {
0105     case Title:
0106       q.addQueryItem(QStringLiteral("q"), QStringLiteral("allintitle:%1").arg(request().value()));
0107       break;
0108 
0109     case Keyword:
0110       q.addQueryItem(QStringLiteral("q"), request().value());
0111       break;
0112 
0113     default:
0114       myWarning() << source() << "- key not recognized:" << request().key();
0115       stop();
0116       return;
0117   }
0118   u.setQuery(q);
0119 //  myDebug() << "url: " << u.url();
0120 
0121   if(!m_cookieIsSet) {
0122     setBibtexCookie();
0123   }
0124 
0125   m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo);
0126   KJobWidgets::setWindow(m_job, GUI::Proxy::widget());
0127   connect(m_job.data(), &KJob::result,
0128           this, &GoogleScholarFetcher::slotComplete);
0129 }
0130 
0131 void GoogleScholarFetcher::stop() {
0132   if(!m_started) {
0133     return;
0134   }
0135   if(m_job) {
0136     m_job->kill();
0137     m_job = nullptr;
0138   }
0139   m_started = false;
0140   emit signalDone(this);
0141 }
0142 
0143 void GoogleScholarFetcher::slotComplete(KJob*) {
0144   if(m_job->error()) {
0145     m_job->uiDelegate()->showErrorMessage();
0146     stop();
0147     return;
0148   }
0149 
0150   QByteArray data = m_job->data();
0151   if(data.isEmpty()) {
0152     myDebug() << "no data";
0153     stop();
0154     return;
0155   }
0156   // see bug 319662. If fetcher is cancelled, job is killed
0157   // if the pointer is retained, it gets double-deleted
0158   m_job = nullptr;
0159 
0160   const QString text = QString::fromUtf8(data.constData(), data.size());
0161 
0162 #if 0
0163   myWarning() << "Remove debug from googlescholarfetcher.cpp";
0164   QFile f(QString::fromLatin1("/tmp/test.html"));
0165   if(f.open(QIODevice::WriteOnly)) {
0166     QTextStream t(&f);
0167     t.setCodec("UTF-8");
0168     t << text;
0169   }
0170   f.close();
0171 #endif
0172 
0173   static const QRegularExpression bibtexRx(QStringLiteral("<a\\s.*?href\\s*=\\s*\"([^>]*scholar\\.bib[^>]*?)\""));
0174   QString bibtex;
0175   int count = 0;
0176   for(QRegularExpressionMatchIterator i = bibtexRx.globalMatch(text); count < m_limit && i.hasNext(); ++count) {
0177     QRegularExpressionMatch match = i.next();
0178     // for some reason, KIO and google don't return bibtex when '&' is escaped
0179     QString url = match.captured(1).replace(QLatin1String("&amp;"), QLatin1String("&"));
0180     QUrl bibtexUrl = QUrl(QString::fromLatin1(SCHOLAR_BASE_URL)).resolved(QUrl(url));
0181 //    myDebug() << bibtexUrl;
0182     bibtex += FileHandler::readTextFile(bibtexUrl, true);
0183   }
0184 
0185   Import::BibtexImporter imp(bibtex);
0186   // quiet warnings...
0187   imp.setCurrentCollection(Data::CollPtr(new Data::BibtexCollection(true)));
0188   Data::CollPtr coll = imp.collection();
0189   if(!coll) {
0190     myDebug() << "no collection pointer";
0191     stop();
0192     return;
0193   }
0194 
0195   count = 0;
0196   Data::EntryList entries = coll->entries();
0197   foreach(Data::EntryPtr entry, entries) {
0198     if(count >= m_limit) {
0199       break;
0200     }
0201     if(!m_started) {
0202       // might get aborted
0203       break;
0204     }
0205 
0206     FetchResult* r = new FetchResult(this, entry);
0207     m_entries.insert(r->uid, Data::EntryPtr(entry));
0208     emit signalResultFound(r);
0209     ++count;
0210   }
0211   m_start = m_entries.count();
0212 //  m_hasMoreResults = m_start <= m_total;
0213   m_hasMoreResults = false; // for now, no continued searches
0214 
0215   stop(); // required
0216 }
0217 
0218 Tellico::Data::EntryPtr GoogleScholarFetcher::fetchEntryHook(uint uid_) {
0219   return m_entries[uid_];
0220 }
0221 
0222 Tellico::Fetch::FetchRequest GoogleScholarFetcher::updateRequest(Data::EntryPtr entry_) {
0223   QString title = entry_->field(QStringLiteral("title"));
0224   if(!title.isEmpty()) {
0225     return FetchRequest(Title, title);
0226   }
0227   return FetchRequest();
0228 }
0229 
0230 Tellico::Fetch::ConfigWidget* GoogleScholarFetcher::configWidget(QWidget* parent_) const {
0231   return new GoogleScholarFetcher::ConfigWidget(parent_, this);
0232 }
0233 
0234 QString GoogleScholarFetcher::defaultName() {
0235   // no i18n
0236   return QStringLiteral("Google Scholar");
0237 }
0238 
0239 QString GoogleScholarFetcher::defaultIcon() {
0240   return favIcon("http://scholar.google.com");
0241 }
0242 
0243 void GoogleScholarFetcher::setBibtexCookie() {
0244   // it appears that the series of url reads are necessary to get the correct cookie set
0245   // have to set preferences to have bibtex output
0246   const QString text = FileHandler::readTextFile(QUrl(QString::fromLatin1(SCHOLAR_SET_CONFIG_URL)), true);
0247   // find hidden input variables
0248   static const QRegularExpression inputRx(QLatin1String("<input\\s+[^>]*?\\s*?type\\s*?=\\s*?\"?hidden\"?\\s+?[^>]+?>"));
0249   static const QRegularExpression pairRx(QLatin1String("([^=\\s<]+?)\\s*=\\s*\"?([^=\\s\">]+?)\"?"));
0250   QHash<QString, QString> nameValues;
0251   auto i = inputRx.globalMatch(text);
0252   while(i.hasNext()) {
0253     auto match = i.next();
0254     const auto input = match.capturedRef(0);
0255     QString name, value;
0256     auto i2 = pairRx.globalMatch(input);
0257     while(i2.hasNext()) {
0258       const auto match2 = i2.next();
0259       if(match2.captured(1).toLower() == QLatin1String("name")) {
0260         name = match2.captured(2);
0261       } else if(match2.captured(1).toLower() == QLatin1String("value")) {
0262         value = match2.captured(2);
0263       }
0264     }
0265     if(!name.isEmpty() && !value.isEmpty()) {
0266       nameValues.insert(name, value);
0267     }
0268   }
0269   QString newUrl = QLatin1String(SCHOLAR_SET_BIBTEX_URL);
0270   for(QHash<QString, QString>::const_iterator i = nameValues.constBegin(); i != nameValues.constEnd(); ++i) {
0271     newUrl += QLatin1Char('&') + i.key() + QLatin1Char('=') + i.value();
0272   }
0273   FileHandler::readTextFile(QUrl(newUrl), true);
0274   m_cookieIsSet = true;
0275 }
0276 
0277 GoogleScholarFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const GoogleScholarFetcher* /*=0*/)
0278     : Fetch::ConfigWidget(parent_) {
0279   QVBoxLayout* l = new QVBoxLayout(optionsWidget());
0280   l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget()));
0281   l->addStretch();
0282 }
0283 
0284 QString GoogleScholarFetcher::ConfigWidget::preferredName() const {
0285   return GoogleScholarFetcher::defaultName();
0286 }