// File indexing completed on 2024-05-12 05:09:34
0001 /*************************************************************************** 0002 Copyright (C) 2008-2009 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. 
* 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "googlescholarfetcher.h" 0026 #include "../core/filehandler.h" 0027 #include "../translators/bibteximporter.h" 0028 #include "../collections/bibtexcollection.h" 0029 #include "../entry.h" 0030 #include "../utils/guiproxy.h" 0031 #include "../tellico_debug.h" 0032 0033 #include <KLocalizedString> 0034 #include <KConfigGroup> 0035 #include <KIO/Job> 0036 #include <KIO/JobUiDelegate> 0037 #include <KJobWidgets/KJobWidgets> 0038 0039 #include <QLabel> 0040 #include <QVBoxLayout> 0041 #include <QFile> 0042 #include <QTextCodec> 0043 #include <QUrlQuery> 0044 0045 namespace { 0046 static const int GOOGLE_MAX_RETURNS_TOTAL = 20; 0047 static const char* SCHOLAR_BASE_URL = "https://scholar.google.com/scholar"; 0048 static const char* SCHOLAR_SET_CONFIG_URL = "https://scholar.google.com/scholar_settings?sciifh=1&hl=en&as_sdt=0,47"; 0049 static const char* SCHOLAR_SET_BIBTEX_URL = "https://scholar.google.com/scholar_setprefs?hl=en&num=100&scis=yes&scisf=4&submit="; 0050 } 0051 0052 using namespace Tellico; 0053 using Tellico::Fetch::GoogleScholarFetcher; 0054 0055 GoogleScholarFetcher::GoogleScholarFetcher(QObject* parent_) 0056 : Fetcher(parent_), 0057 m_limit(GOOGLE_MAX_RETURNS_TOTAL), m_start(0), m_total(0), m_job(nullptr), m_started(false) 0058 , m_cookieIsSet(false) { 0059 } 0060 0061 GoogleScholarFetcher::~GoogleScholarFetcher() { 0062 } 0063 0064 QString GoogleScholarFetcher::source() const { 0065 return m_name.isEmpty() ? 
defaultName() : m_name; 0066 } 0067 0068 bool GoogleScholarFetcher::canSearch(Fetch::FetchKey k) const { 0069 return k == Title || k == Keyword; 0070 } 0071 0072 bool GoogleScholarFetcher::canFetch(int type) const { 0073 return type == Data::Collection::Bibtex; 0074 } 0075 0076 void GoogleScholarFetcher::readConfigHook(const KConfigGroup& config_) { 0077 Q_UNUSED(config_); 0078 } 0079 0080 void GoogleScholarFetcher::search() { 0081 m_started = true; 0082 m_start = 0; 0083 m_total = -1; 0084 doSearch(); 0085 } 0086 0087 void GoogleScholarFetcher::continueSearch() { 0088 m_started = true; 0089 doSearch(); 0090 } 0091 0092 void GoogleScholarFetcher::doSearch() { 0093 QUrl u(QString::fromLatin1(SCHOLAR_BASE_URL)); 0094 QUrlQuery q; 0095 q.addQueryItem(QStringLiteral("start"), QString::number(m_start)); 0096 0097 QString value = request().value(); 0098 if(!value.startsWith(QLatin1Char('"'))) { 0099 value = QLatin1Char('"') + value; 0100 } 0101 if(!value.endsWith(QLatin1Char('"'))) { 0102 value += QLatin1Char('"'); 0103 } 0104 switch(request().key()) { 0105 case Title: 0106 q.addQueryItem(QStringLiteral("q"), QStringLiteral("allintitle:%1").arg(request().value())); 0107 break; 0108 0109 case Keyword: 0110 q.addQueryItem(QStringLiteral("q"), request().value()); 0111 break; 0112 0113 default: 0114 myWarning() << source() << "- key not recognized:" << request().key(); 0115 stop(); 0116 return; 0117 } 0118 u.setQuery(q); 0119 // myDebug() << "url: " << u.url(); 0120 0121 if(!m_cookieIsSet) { 0122 setBibtexCookie(); 0123 } 0124 0125 m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo); 0126 KJobWidgets::setWindow(m_job, GUI::Proxy::widget()); 0127 connect(m_job.data(), &KJob::result, 0128 this, &GoogleScholarFetcher::slotComplete); 0129 } 0130 0131 void GoogleScholarFetcher::stop() { 0132 if(!m_started) { 0133 return; 0134 } 0135 if(m_job) { 0136 m_job->kill(); 0137 m_job = nullptr; 0138 } 0139 m_started = false; 0140 emit signalDone(this); 0141 } 0142 0143 void 
GoogleScholarFetcher::slotComplete(KJob*) { 0144 if(m_job->error()) { 0145 m_job->uiDelegate()->showErrorMessage(); 0146 stop(); 0147 return; 0148 } 0149 0150 QByteArray data = m_job->data(); 0151 if(data.isEmpty()) { 0152 myDebug() << "no data"; 0153 stop(); 0154 return; 0155 } 0156 // see bug 319662. If fetcher is cancelled, job is killed 0157 // if the pointer is retained, it gets double-deleted 0158 m_job = nullptr; 0159 0160 const QString text = QString::fromUtf8(data.constData(), data.size()); 0161 0162 #if 0 0163 myWarning() << "Remove debug from googlescholarfetcher.cpp"; 0164 QFile f(QString::fromLatin1("/tmp/test.html")); 0165 if(f.open(QIODevice::WriteOnly)) { 0166 QTextStream t(&f); 0167 t.setCodec("UTF-8"); 0168 t << text; 0169 } 0170 f.close(); 0171 #endif 0172 0173 static const QRegularExpression bibtexRx(QStringLiteral("<a\\s.*?href\\s*=\\s*\"([^>]*scholar\\.bib[^>]*?)\"")); 0174 QString bibtex; 0175 int count = 0; 0176 for(QRegularExpressionMatchIterator i = bibtexRx.globalMatch(text); count < m_limit && i.hasNext(); ++count) { 0177 QRegularExpressionMatch match = i.next(); 0178 // for some reason, KIO and google don't return bibtex when '&' is escaped 0179 QString url = match.captured(1).replace(QLatin1String("&"), QLatin1String("&")); 0180 QUrl bibtexUrl = QUrl(QString::fromLatin1(SCHOLAR_BASE_URL)).resolved(QUrl(url)); 0181 // myDebug() << bibtexUrl; 0182 bibtex += FileHandler::readTextFile(bibtexUrl, true); 0183 } 0184 0185 Import::BibtexImporter imp(bibtex); 0186 // quiet warnings... 
0187 imp.setCurrentCollection(Data::CollPtr(new Data::BibtexCollection(true))); 0188 Data::CollPtr coll = imp.collection(); 0189 if(!coll) { 0190 myDebug() << "no collection pointer"; 0191 stop(); 0192 return; 0193 } 0194 0195 count = 0; 0196 Data::EntryList entries = coll->entries(); 0197 foreach(Data::EntryPtr entry, entries) { 0198 if(count >= m_limit) { 0199 break; 0200 } 0201 if(!m_started) { 0202 // might get aborted 0203 break; 0204 } 0205 0206 FetchResult* r = new FetchResult(this, entry); 0207 m_entries.insert(r->uid, Data::EntryPtr(entry)); 0208 emit signalResultFound(r); 0209 ++count; 0210 } 0211 m_start = m_entries.count(); 0212 // m_hasMoreResults = m_start <= m_total; 0213 m_hasMoreResults = false; // for now, no continued searches 0214 0215 stop(); // required 0216 } 0217 0218 Tellico::Data::EntryPtr GoogleScholarFetcher::fetchEntryHook(uint uid_) { 0219 return m_entries[uid_]; 0220 } 0221 0222 Tellico::Fetch::FetchRequest GoogleScholarFetcher::updateRequest(Data::EntryPtr entry_) { 0223 QString title = entry_->field(QStringLiteral("title")); 0224 if(!title.isEmpty()) { 0225 return FetchRequest(Title, title); 0226 } 0227 return FetchRequest(); 0228 } 0229 0230 Tellico::Fetch::ConfigWidget* GoogleScholarFetcher::configWidget(QWidget* parent_) const { 0231 return new GoogleScholarFetcher::ConfigWidget(parent_, this); 0232 } 0233 0234 QString GoogleScholarFetcher::defaultName() { 0235 // no i18n 0236 return QStringLiteral("Google Scholar"); 0237 } 0238 0239 QString GoogleScholarFetcher::defaultIcon() { 0240 return favIcon("http://scholar.google.com"); 0241 } 0242 0243 void GoogleScholarFetcher::setBibtexCookie() { 0244 // it appears that the series of url reads are necessary to get the correct cookie set 0245 // have to set preferences to have bibtex output 0246 const QString text = FileHandler::readTextFile(QUrl(QString::fromLatin1(SCHOLAR_SET_CONFIG_URL)), true); 0247 // find hidden input variables 0248 static const QRegularExpression 
inputRx(QLatin1String("<input\\s+[^>]*?\\s*?type\\s*?=\\s*?\"?hidden\"?\\s+?[^>]+?>")); 0249 static const QRegularExpression pairRx(QLatin1String("([^=\\s<]+?)\\s*=\\s*\"?([^=\\s\">]+?)\"?")); 0250 QHash<QString, QString> nameValues; 0251 auto i = inputRx.globalMatch(text); 0252 while(i.hasNext()) { 0253 auto match = i.next(); 0254 const auto input = match.capturedRef(0); 0255 QString name, value; 0256 auto i2 = pairRx.globalMatch(input); 0257 while(i2.hasNext()) { 0258 const auto match2 = i2.next(); 0259 if(match2.captured(1).toLower() == QLatin1String("name")) { 0260 name = match2.captured(2); 0261 } else if(match2.captured(1).toLower() == QLatin1String("value")) { 0262 value = match2.captured(2); 0263 } 0264 } 0265 if(!name.isEmpty() && !value.isEmpty()) { 0266 nameValues.insert(name, value); 0267 } 0268 } 0269 QString newUrl = QLatin1String(SCHOLAR_SET_BIBTEX_URL); 0270 for(QHash<QString, QString>::const_iterator i = nameValues.constBegin(); i != nameValues.constEnd(); ++i) { 0271 newUrl += QLatin1Char('&') + i.key() + QLatin1Char('=') + i.value(); 0272 } 0273 FileHandler::readTextFile(QUrl(newUrl), true); 0274 m_cookieIsSet = true; 0275 } 0276 0277 GoogleScholarFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const GoogleScholarFetcher* /*=0*/) 0278 : Fetch::ConfigWidget(parent_) { 0279 QVBoxLayout* l = new QVBoxLayout(optionsWidget()); 0280 l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget())); 0281 l->addStretch(); 0282 } 0283 0284 QString GoogleScholarFetcher::ConfigWidget::preferredName() const { 0285 return GoogleScholarFetcher::defaultName(); 0286 }