File indexing completed on 2024-05-12 16:45:51

0001 /***************************************************************************
0002     Copyright (C) 2008-2009 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "googlescholarfetcher.h"
0026 #include "../core/filehandler.h"
0027 #include "../translators/bibteximporter.h"
0028 #include "../collections/bibtexcollection.h"
0029 #include "../entry.h"
0030 #include "../utils/guiproxy.h"
0031 #include "../tellico_debug.h"
0032 
0033 #include <KLocalizedString>
0034 #include <KConfigGroup>
0035 #include <KIO/Job>
0036 #include <KIO/JobUiDelegate>
0037 #include <KJobWidgets/KJobWidgets>
0038 
0039 #include <QLabel>
0040 #include <QVBoxLayout>
0041 #include <QFile>
0042 #include <QTextCodec>
0043 #include <QUrlQuery>
0044 
// File-local constants used when talking to Google Scholar.
namespace {
  // Initial value of the fetcher's per-search result limit (m_limit).
  const int GOOGLE_MAX_RETURNS_TOTAL = 20;
  // Search endpoint; also the base against which BibTeX links are resolved.
  const char* const SCHOLAR_BASE_URL = "http://scholar.google.com/scholar";
  // Settings page that is scraped for hidden form inputs.
  const char* const SCHOLAR_SET_CONFIG_URL = "http://scholar.google.com/scholar_settings?hl=en&as_sdt=0,5";
  // Preference-setting URL prefix; the scraped name/value pairs are appended to it.
  const char* const SCHOLAR_SET_BIBTEX_URL = "http://scholar.google.com/scholar_setprefs?hl=en&num=100&scis=yes&scisf=4&submit=";
}
0051 
0052 using namespace Tellico;
0053 using Tellico::Fetch::GoogleScholarFetcher;
0054 
0055 GoogleScholarFetcher::GoogleScholarFetcher(QObject* parent_)
0056     : Fetcher(parent_),
0057       m_limit(GOOGLE_MAX_RETURNS_TOTAL), m_start(0), m_total(0), m_job(nullptr), m_started(false)
0058     , m_bibtexRx(QLatin1String("<a\\s.*?href\\s*=\\s*\"([^>]*scholar\\.bib[^>]*?)\""))
0059     , m_cookieIsSet(false) {
0060 }
0061 
0062 GoogleScholarFetcher::~GoogleScholarFetcher() {
0063 }
0064 
0065 QString GoogleScholarFetcher::source() const {
0066   return m_name.isEmpty() ? defaultName() : m_name;
0067 }
0068 
0069 bool GoogleScholarFetcher::canSearch(Fetch::FetchKey k) const {
0070   return k == Title || k == Person || k == Keyword;
0071 }
0072 
0073 bool GoogleScholarFetcher::canFetch(int type) const {
0074   return type == Data::Collection::Bibtex;
0075 }
0076 
0077 void GoogleScholarFetcher::readConfigHook(const KConfigGroup& config_) {
0078   Q_UNUSED(config_);
0079 }
0080 
0081 void GoogleScholarFetcher::search() {
0082   if(!m_cookieIsSet) {
0083     setBibtexCookie();
0084   }
0085   m_started = true;
0086   m_start = 0;
0087   m_total = -1;
0088   doSearch();
0089 }
0090 
0091 void GoogleScholarFetcher::continueSearch() {
0092   m_started = true;
0093   doSearch();
0094 }
0095 
0096 void GoogleScholarFetcher::doSearch() {
0097 //  myDebug() << "value = " << value_;
0098 
0099   QUrl u(QString::fromLatin1(SCHOLAR_BASE_URL));
0100   QUrlQuery q;
0101   q.addQueryItem(QStringLiteral("start"), QString::number(m_start));
0102 
0103   QString value = request().value();
0104   if(!value.startsWith(QLatin1Char('"'))) {
0105     value = QLatin1Char('"') + value;
0106   }
0107   if(!value.endsWith(QLatin1Char('"'))) {
0108     value += QLatin1Char('"');
0109   }
0110   switch(request().key()) {
0111     case Title:
0112       q.addQueryItem(QStringLiteral("q"), QStringLiteral("allintitle:%1").arg(request().value()));
0113       break;
0114 
0115     case Keyword:
0116       q.addQueryItem(QStringLiteral("q"), request().value());
0117       break;
0118 
0119     case Person:
0120       q.addQueryItem(QStringLiteral("q"), QStringLiteral("author:%1").arg(request().value()));
0121       break;
0122 
0123     default:
0124       myWarning() << "key not recognized: " << request().key();
0125       stop();
0126       return;
0127   }
0128   u.setQuery(q);
0129 //  myDebug() << "url: " << u.url();
0130 
0131   m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo);
0132   KJobWidgets::setWindow(m_job, GUI::Proxy::widget());
0133   connect(m_job.data(), &KJob::result,
0134           this, &GoogleScholarFetcher::slotComplete);
0135 }
0136 
0137 void GoogleScholarFetcher::stop() {
0138   if(!m_started) {
0139     return;
0140   }
0141   if(m_job) {
0142     m_job->kill();
0143     m_job = nullptr;
0144   }
0145   m_started = false;
0146   emit signalDone(this);
0147 }
0148 
0149 void GoogleScholarFetcher::slotComplete(KJob*) {
0150 //  myDebug();
0151 
0152   if(m_job->error()) {
0153     m_job->uiDelegate()->showErrorMessage();
0154     stop();
0155     return;
0156   }
0157 
0158   QByteArray data = m_job->data();
0159   if(data.isEmpty()) {
0160     myDebug() << "no data";
0161     stop();
0162     return;
0163   }
0164   // see bug 319662. If fetcher is cancelled, job is killed
0165   // if the pointer is retained, it gets double-deleted
0166   m_job = nullptr;
0167 
0168   const QString text = QString::fromUtf8(data.constData(), data.size());
0169 
0170 #if 0
0171   myWarning() << "Remove debug from googlescholarfetcher.cpp";
0172   QFile f(QString::fromLatin1("/tmp/test.html"));
0173   if(f.open(QIODevice::WriteOnly)) {
0174     QTextStream t(&f);
0175     t.setCodec("UTF-8");
0176     t << text;
0177   }
0178   f.close();
0179 #endif
0180 
0181   QString bibtex;
0182   int count = 0;
0183   for(QRegularExpressionMatchIterator i = m_bibtexRx.globalMatch(text); count < m_limit && i.hasNext(); ++count) {
0184     QRegularExpressionMatch match = i.next();
0185     // for some reason, KIO and google don't return bibtex when '&' is escaped
0186     QString url = match.captured(1).replace(QLatin1String("&amp;"), QLatin1String("&"));
0187     QUrl bibtexUrl = QUrl(QString::fromLatin1(SCHOLAR_BASE_URL)).resolved(QUrl(url));
0188 //    myDebug() << bibtexUrl;
0189     bibtex += FileHandler::readTextFile(bibtexUrl, true);
0190   }
0191 
0192   Import::BibtexImporter imp(bibtex);
0193   // quiet warnings...
0194   imp.setCurrentCollection(Data::CollPtr(new Data::BibtexCollection(true)));
0195   Data::CollPtr coll = imp.collection();
0196   if(!coll) {
0197     myDebug() << "no collection pointer";
0198     stop();
0199     return;
0200   }
0201 
0202   count = 0;
0203   Data::EntryList entries = coll->entries();
0204   foreach(Data::EntryPtr entry, entries) {
0205     if(count >= m_limit) {
0206       break;
0207     }
0208     if(!m_started) {
0209       // might get aborted
0210       break;
0211     }
0212 
0213     FetchResult* r = new FetchResult(this, entry);
0214     m_entries.insert(r->uid, Data::EntryPtr(entry));
0215     emit signalResultFound(r);
0216     ++count;
0217   }
0218   m_start = m_entries.count();
0219 //  m_hasMoreResults = m_start <= m_total;
0220   m_hasMoreResults = false; // for now, no continued searches
0221 
0222   stop(); // required
0223 }
0224 
0225 Tellico::Data::EntryPtr GoogleScholarFetcher::fetchEntryHook(uint uid_) {
0226   return m_entries[uid_];
0227 }
0228 
0229 Tellico::Fetch::FetchRequest GoogleScholarFetcher::updateRequest(Data::EntryPtr entry_) {
0230   QString title = entry_->field(QStringLiteral("title"));
0231   if(!title.isEmpty()) {
0232     return FetchRequest(Title, title);
0233   }
0234   return FetchRequest();
0235 }
0236 
0237 Tellico::Fetch::ConfigWidget* GoogleScholarFetcher::configWidget(QWidget* parent_) const {
0238   return new GoogleScholarFetcher::ConfigWidget(parent_, this);
0239 }
0240 
0241 QString GoogleScholarFetcher::defaultName() {
0242   // no i18n
0243   return QStringLiteral("Google Scholar");
0244 }
0245 
0246 QString GoogleScholarFetcher::defaultIcon() {
0247   return favIcon("http://scholar.google.com");
0248 }
0249 
0250 void GoogleScholarFetcher::setBibtexCookie() {
0251   // it appears that the series of url reads are necessary to get the correct cookie set
0252   // have to set preferences to have bibtex output
0253   const QString text = FileHandler::readTextFile(QUrl(QString::fromLatin1(SCHOLAR_SET_CONFIG_URL)), true);
0254   // find hidden input variables
0255   QRegExp inputRx(QLatin1String("<input\\s+[^>]*\\s*type\\s*=\\s*\"hidden\"\\s+[^>]+>"));
0256   inputRx.setMinimal(true);
0257   QRegExp pairRx(QLatin1String("([^=\\s<]+)\\s*=\\s*\"?([^=\\s\">]+)\"?"));
0258   QHash<QString, QString> nameValues;
0259   for(int pos = inputRx.indexIn(text); pos > -1; pos = inputRx.indexIn(text, pos+inputRx.matchedLength())) {
0260     const QString input = inputRx.cap(0);
0261     QString name, value;
0262     for(int pos2 = pairRx.indexIn(input); pos2 > -1; pos2 = pairRx.indexIn(input, pos2+pairRx.matchedLength())) {
0263       if(pairRx.cap(1).toLower() == QLatin1String("name")) {
0264         name = pairRx.cap(2);
0265       } else if(pairRx.cap(1).toLower() == QLatin1String("value")) {
0266         value = pairRx.cap(2);
0267       }
0268     }
0269     if(!name.isEmpty() && !value.isEmpty()) {
0270       nameValues.insert(name, value);
0271     }
0272   }
0273   QString newUrl = QLatin1String(SCHOLAR_SET_BIBTEX_URL);
0274   for(QHash<QString, QString>::const_iterator i = nameValues.constBegin(); i != nameValues.constEnd(); ++i) {
0275     newUrl += QLatin1Char('&') + i.key() + QLatin1Char('=') + i.value();
0276   }
0277   FileHandler::readTextFile(QUrl(newUrl), true);
0278   m_cookieIsSet = true;
0279 }
0280 
0281 GoogleScholarFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const GoogleScholarFetcher* /*=0*/)
0282     : Fetch::ConfigWidget(parent_) {
0283   QVBoxLayout* l = new QVBoxLayout(optionsWidget());
0284   l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget()));
0285   l->addStretch();
0286 }
0287 
0288 QString GoogleScholarFetcher::ConfigWidget::preferredName() const {
0289   return GoogleScholarFetcher::defaultName();
0290 }