File indexing completed on 2024-05-12 16:45:51
0001 /*************************************************************************** 0002 Copyright (C) 2008-2009 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "googlescholarfetcher.h" 0026 #include "../core/filehandler.h" 0027 #include "../translators/bibteximporter.h" 0028 #include "../collections/bibtexcollection.h" 0029 #include "../entry.h" 0030 #include "../utils/guiproxy.h" 0031 #include "../tellico_debug.h" 0032 0033 #include <KLocalizedString> 0034 #include <KConfigGroup> 0035 #include <KIO/Job> 0036 #include <KIO/JobUiDelegate> 0037 #include <KJobWidgets/KJobWidgets> 0038 0039 #include <QLabel> 0040 #include <QVBoxLayout> 0041 #include <QFile> 0042 #include <QTextCodec> 0043 #include <QUrlQuery> 0044 0045 namespace { 0046 static const int GOOGLE_MAX_RETURNS_TOTAL = 20; 0047 static const char* SCHOLAR_BASE_URL = "http://scholar.google.com/scholar"; 0048 static const char* SCHOLAR_SET_CONFIG_URL = "http://scholar.google.com/scholar_settings?hl=en&as_sdt=0,5"; 0049 static const char* SCHOLAR_SET_BIBTEX_URL = "http://scholar.google.com/scholar_setprefs?hl=en&num=100&scis=yes&scisf=4&submit="; 0050 } 0051 0052 using namespace Tellico; 0053 using Tellico::Fetch::GoogleScholarFetcher; 0054 0055 GoogleScholarFetcher::GoogleScholarFetcher(QObject* parent_) 0056 : Fetcher(parent_), 0057 m_limit(GOOGLE_MAX_RETURNS_TOTAL), m_start(0), m_total(0), m_job(nullptr), m_started(false) 0058 , m_bibtexRx(QLatin1String("<a\\s.*?href\\s*=\\s*\"([^>]*scholar\\.bib[^>]*?)\"")) 0059 , m_cookieIsSet(false) { 0060 } 0061 0062 GoogleScholarFetcher::~GoogleScholarFetcher() { 0063 } 0064 0065 QString GoogleScholarFetcher::source() const { 0066 return m_name.isEmpty() ? defaultName() : m_name; 0067 } 0068 0069 bool GoogleScholarFetcher::canSearch(Fetch::FetchKey k) const { 0070 return k == Title || k == Person || k == Keyword; 0071 } 0072 0073 bool GoogleScholarFetcher::canFetch(int type) const { 0074 return type == Data::Collection::Bibtex; 0075 } 0076 0077 void GoogleScholarFetcher::readConfigHook(const KConfigGroup& config_) { 0078 Q_UNUSED(config_); 0079 } 0080 0081 void GoogleScholarFetcher::search() { 0082 if(!m_cookieIsSet) { 0083 setBibtexCookie(); 0084 } 0085 m_started = true; 0086 m_start = 0; 0087 m_total = -1; 0088 doSearch(); 0089 } 0090 0091 void GoogleScholarFetcher::continueSearch() { 0092 m_started = true; 0093 doSearch(); 0094 } 0095 0096 void GoogleScholarFetcher::doSearch() { 0097 // myDebug() << "value = " << value_; 0098 0099 QUrl u(QString::fromLatin1(SCHOLAR_BASE_URL)); 0100 QUrlQuery q; 0101 q.addQueryItem(QStringLiteral("start"), QString::number(m_start)); 0102 0103 QString value = request().value(); 0104 if(!value.startsWith(QLatin1Char('"'))) { 0105 value = QLatin1Char('"') + value; 0106 } 0107 if(!value.endsWith(QLatin1Char('"'))) { 0108 value += QLatin1Char('"'); 0109 } 0110 switch(request().key()) { 0111 case Title: 0112 q.addQueryItem(QStringLiteral("q"), QStringLiteral("allintitle:%1").arg(request().value())); 0113 break; 0114 0115 case Keyword: 0116 q.addQueryItem(QStringLiteral("q"), request().value()); 0117 break; 0118 0119 case Person: 0120 q.addQueryItem(QStringLiteral("q"), QStringLiteral("author:%1").arg(request().value())); 0121 break; 0122 0123 default: 0124 myWarning() << "key not recognized: " << request().key(); 0125 stop(); 0126 return; 0127 } 0128 u.setQuery(q); 0129 // myDebug() << "url: " << u.url(); 0130 0131 m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo); 0132 KJobWidgets::setWindow(m_job, GUI::Proxy::widget()); 0133 connect(m_job.data(), &KJob::result, 0134 this, &GoogleScholarFetcher::slotComplete); 0135 } 0136 0137 void GoogleScholarFetcher::stop() { 0138 if(!m_started) { 0139 return; 0140 } 0141 if(m_job) { 0142 m_job->kill(); 0143 m_job = nullptr; 0144 } 0145 m_started = false; 0146 emit signalDone(this); 0147 } 0148 0149 void GoogleScholarFetcher::slotComplete(KJob*) { 0150 // myDebug(); 0151 0152 if(m_job->error()) { 0153 m_job->uiDelegate()->showErrorMessage(); 0154 stop(); 0155 return; 0156 } 0157 0158 QByteArray data = m_job->data(); 0159 if(data.isEmpty()) { 0160 myDebug() << "no data"; 0161 stop(); 0162 return; 0163 } 0164 // see bug 319662. If fetcher is cancelled, job is killed 0165 // if the pointer is retained, it gets double-deleted 0166 m_job = nullptr; 0167 0168 const QString text = QString::fromUtf8(data.constData(), data.size()); 0169 0170 #if 0 0171 myWarning() << "Remove debug from googlescholarfetcher.cpp"; 0172 QFile f(QString::fromLatin1("/tmp/test.html")); 0173 if(f.open(QIODevice::WriteOnly)) { 0174 QTextStream t(&f); 0175 t.setCodec("UTF-8"); 0176 t << text; 0177 } 0178 f.close(); 0179 #endif 0180 0181 QString bibtex; 0182 int count = 0; 0183 for(QRegularExpressionMatchIterator i = m_bibtexRx.globalMatch(text); count < m_limit && i.hasNext(); ++count) { 0184 QRegularExpressionMatch match = i.next(); 0185 // for some reason, KIO and google don't return bibtex when '&' is escaped 0186 QString url = match.captured(1).replace(QLatin1String("&"), QLatin1String("&")); 0187 QUrl bibtexUrl = QUrl(QString::fromLatin1(SCHOLAR_BASE_URL)).resolved(QUrl(url)); 0188 // myDebug() << bibtexUrl; 0189 bibtex += FileHandler::readTextFile(bibtexUrl, true); 0190 } 0191 0192 Import::BibtexImporter imp(bibtex); 0193 // quiet warnings... 0194 imp.setCurrentCollection(Data::CollPtr(new Data::BibtexCollection(true))); 0195 Data::CollPtr coll = imp.collection(); 0196 if(!coll) { 0197 myDebug() << "no collection pointer"; 0198 stop(); 0199 return; 0200 } 0201 0202 count = 0; 0203 Data::EntryList entries = coll->entries(); 0204 foreach(Data::EntryPtr entry, entries) { 0205 if(count >= m_limit) { 0206 break; 0207 } 0208 if(!m_started) { 0209 // might get aborted 0210 break; 0211 } 0212 0213 FetchResult* r = new FetchResult(this, entry); 0214 m_entries.insert(r->uid, Data::EntryPtr(entry)); 0215 emit signalResultFound(r); 0216 ++count; 0217 } 0218 m_start = m_entries.count(); 0219 // m_hasMoreResults = m_start <= m_total; 0220 m_hasMoreResults = false; // for now, no continued searches 0221 0222 stop(); // required 0223 } 0224 0225 Tellico::Data::EntryPtr GoogleScholarFetcher::fetchEntryHook(uint uid_) { 0226 return m_entries[uid_]; 0227 } 0228 0229 Tellico::Fetch::FetchRequest GoogleScholarFetcher::updateRequest(Data::EntryPtr entry_) { 0230 QString title = entry_->field(QStringLiteral("title")); 0231 if(!title.isEmpty()) { 0232 return FetchRequest(Title, title); 0233 } 0234 return FetchRequest(); 0235 } 0236 0237 Tellico::Fetch::ConfigWidget* GoogleScholarFetcher::configWidget(QWidget* parent_) const { 0238 return new GoogleScholarFetcher::ConfigWidget(parent_, this); 0239 } 0240 0241 QString GoogleScholarFetcher::defaultName() { 0242 // no i18n 0243 return QStringLiteral("Google Scholar"); 0244 } 0245 0246 QString GoogleScholarFetcher::defaultIcon() { 0247 return favIcon("http://scholar.google.com"); 0248 } 0249 0250 void GoogleScholarFetcher::setBibtexCookie() { 0251 // it appears that the series of url reads are necessary to get the correct cookie set 0252 // have to set preferences to have bibtex output 0253 const QString text = FileHandler::readTextFile(QUrl(QString::fromLatin1(SCHOLAR_SET_CONFIG_URL)), true); 0254 // find hidden input variables 0255 QRegExp inputRx(QLatin1String("<input\\s+[^>]*\\s*type\\s*=\\s*\"hidden\"\\s+[^>]+>")); 0256 inputRx.setMinimal(true); 0257 QRegExp pairRx(QLatin1String("([^=\\s<]+)\\s*=\\s*\"?([^=\\s\">]+)\"?")); 0258 QHash<QString, QString> nameValues; 0259 for(int pos = inputRx.indexIn(text); pos > -1; pos = inputRx.indexIn(text, pos+inputRx.matchedLength())) { 0260 const QString input = inputRx.cap(0); 0261 QString name, value; 0262 for(int pos2 = pairRx.indexIn(input); pos2 > -1; pos2 = pairRx.indexIn(input, pos2+pairRx.matchedLength())) { 0263 if(pairRx.cap(1).toLower() == QLatin1String("name")) { 0264 name = pairRx.cap(2); 0265 } else if(pairRx.cap(1).toLower() == QLatin1String("value")) { 0266 value = pairRx.cap(2); 0267 } 0268 } 0269 if(!name.isEmpty() && !value.isEmpty()) { 0270 nameValues.insert(name, value); 0271 } 0272 } 0273 QString newUrl = QLatin1String(SCHOLAR_SET_BIBTEX_URL); 0274 for(QHash<QString, QString>::const_iterator i = nameValues.constBegin(); i != nameValues.constEnd(); ++i) { 0275 newUrl += QLatin1Char('&') + i.key() + QLatin1Char('=') + i.value(); 0276 } 0277 FileHandler::readTextFile(QUrl(newUrl), true); 0278 m_cookieIsSet = true; 0279 } 0280 0281 GoogleScholarFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const GoogleScholarFetcher* /*=0*/) 0282 : Fetch::ConfigWidget(parent_) { 0283 QVBoxLayout* l = new QVBoxLayout(optionsWidget()); 0284 l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget())); 0285 l->addStretch(); 0286 } 0287 0288 QString GoogleScholarFetcher::ConfigWidget::preferredName() const { 0289 return GoogleScholarFetcher::defaultName(); 0290 }