File indexing completed on 2024-11-24 04:34:30
0001 /*************************************************************************** 0002 * SPDX-License-Identifier: GPL-2.0-or-later 0003 * * 0004 * SPDX-FileCopyrightText: 2016-2020 Thomas Fischer <fischer@unix-ag.uni-kl.de> 0005 * * 0006 * This program is free software; you can redistribute it and/or modify * 0007 * it under the terms of the GNU General Public License as published by * 0008 * the Free Software Foundation; either version 2 of the License, or * 0009 * (at your option) any later version. * 0010 * * 0011 * This program is distributed in the hope that it will be useful, * 0012 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0014 * GNU General Public License for more details. * 0015 * * 0016 * You should have received a copy of the GNU General Public License * 0017 * along with this program; if not, see <https://www.gnu.org/licenses/>. * 0018 ***************************************************************************/ 0019 0020 #include "onlinesearchbiorxiv.h" 0021 0022 #include <QNetworkRequest> 0023 #include <QNetworkReply> 0024 #include <QRegularExpression> 0025 0026 #include <KLocalizedString> 0027 0028 #include <FileImporterBibTeX> 0029 #include "internalnetworkaccessmanager.h" 0030 #include "logging_networking.h" 0031 0032 class OnlineSearchBioRxiv::Private 0033 { 0034 public: 0035 QSet<QUrl> resultPageUrls; 0036 0037 explicit Private(OnlineSearchBioRxiv *) 0038 { 0039 /// nothing 0040 } 0041 }; 0042 0043 OnlineSearchBioRxiv::OnlineSearchBioRxiv(QObject *parent) 0044 : OnlineSearchAbstract(parent), d(new OnlineSearchBioRxiv::Private(this)) 0045 { 0046 /// nothing 0047 } 0048 0049 OnlineSearchBioRxiv::~OnlineSearchBioRxiv() { 0050 delete d; 0051 } 0052 0053 void OnlineSearchBioRxiv::startSearch(const QMap<QueryKey, QString> &query, int numResults) { 0054 m_hasBeenCanceled = false; 0055 d->resultPageUrls.clear(); 0056 Q_EMIT progress(curStep = 0, numSteps = numResults * 2 + 1); 0057 0058 QString urlText(QString(QStringLiteral("https://www.biorxiv.org/search/numresults:%1 sort:relevance-rank title_flags:match-phrase format_result:standard ")).arg(numResults)); 0059 urlText.append(query[QueryKey::FreeText]); 0060 0061 bool ok = false; 0062 int year = query[QueryKey::Year].toInt(&ok); 0063 if (ok && year >= 1800 && year < 2100) 0064 urlText.append(QString(QStringLiteral(" limit_from:%1-01-01 limit_to:%1-12-31")).arg(year)); 0065 0066 const QStringList authors = splitRespectingQuotationMarks(query[QueryKey::Author]); 0067 int authorIndex = 1; 0068 for (QStringList::ConstIterator it = authors.constBegin(); it != authors.constEnd(); ++it, ++authorIndex) 0069 urlText.append(QString(QStringLiteral(" author%1:%2")).arg(authorIndex).arg(QString(*it).replace(QStringLiteral(" "), QStringLiteral("+")))); 0070 0071 const QString title = QString(query[QueryKey::Title]).replace(QStringLiteral(" "), QStringLiteral("+")); 0072 if (!title.isEmpty()) 0073 urlText.append(QString(QStringLiteral(" title:%1")).arg(title)); 0074 0075 QNetworkRequest request(QUrl::fromUserInput(urlText)); 0076 QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request); 0077 InternalNetworkAccessManager::instance().setNetworkReplyTimeout(reply); 0078 connect(reply, &QNetworkReply::finished, this, &OnlineSearchBioRxiv::resultsPageDone); 0079 0080 refreshBusyProperty(); 0081 } 0082 0083 QString OnlineSearchBioRxiv::label() const { 0084 return i18n("bioRxiv"); 0085 } 0086 0087 QUrl OnlineSearchBioRxiv::homepage() const { 0088 return QUrl(QStringLiteral("https://www.biorxiv.org/")); 0089 } 0090 0091 void OnlineSearchBioRxiv::resultsPageDone() { 0092 Q_EMIT progress(++curStep, numSteps); 0093 QNetworkReply *reply = static_cast<QNetworkReply *>(sender()); 0094 0095 if (handleErrors(reply)) { 0096 /// ensure proper treatment of UTF-8 characters 0097 const QString htmlCode = QString::fromUtf8(reply->readAll().constData()); 0098 0099 static const QRegularExpression contentRegExp(QStringLiteral("[^\"]*/content/(early/[12]\\d{3}/[01]\\d/\\d{2}/\\d+|(") + KBibTeX::doiRegExp.pattern() + QStringLiteral("))")); 0100 QRegularExpressionMatchIterator contentRegExpMatchIt = contentRegExp.globalMatch(htmlCode); 0101 while (contentRegExpMatchIt.hasNext()) { 0102 const QRegularExpressionMatch contentRegExpMatch = contentRegExpMatchIt.next(); 0103 const QUrl url = QUrl(QStringLiteral("https://www.biorxiv.org") + contentRegExpMatch.captured(0)); 0104 d->resultPageUrls.insert(url); 0105 } 0106 0107 if (d->resultPageUrls.isEmpty()) 0108 stopSearch(resultNoError); 0109 else { 0110 const QUrl firstUrl = *d->resultPageUrls.constBegin(); 0111 d->resultPageUrls.remove(firstUrl); 0112 QNetworkRequest request(firstUrl); 0113 QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request); 0114 InternalNetworkAccessManager::instance().setNetworkReplyTimeout(reply); 0115 connect(reply, &QNetworkReply::finished, this, &OnlineSearchBioRxiv::resultPageDone); 0116 } 0117 } 0118 0119 refreshBusyProperty(); 0120 } 0121 0122 void OnlineSearchBioRxiv::resultPageDone() { 0123 Q_EMIT progress(++curStep, numSteps); 0124 QNetworkReply *reply = static_cast<QNetworkReply *>(sender()); 0125 0126 if (handleErrors(reply)) { 0127 /// ensure proper treatment of UTF-8 characters 0128 const QString htmlCode = QString::fromUtf8(reply->readAll().constData()); 0129 0130 static const QRegularExpression highwireRegExp(QStringLiteral("/highwire/citation/\\d+/bibtext")); 0131 const QRegularExpressionMatch highwireRegExpMatch = highwireRegExp.match(htmlCode); 0132 if (highwireRegExpMatch.hasMatch()) { 0133 const QUrl url = QUrl(QStringLiteral("https://www.biorxiv.org") + highwireRegExpMatch.captured(0)); 0134 QNetworkRequest request(url); 0135 QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request); 0136 InternalNetworkAccessManager::instance().setNetworkReplyTimeout(reply); 0137 connect(reply, &QNetworkReply::finished, this, &OnlineSearchBioRxiv::bibTeXDownloadDone); 0138 } else if (!d->resultPageUrls.isEmpty()) { 0139 const QUrl firstUrl = *d->resultPageUrls.constBegin(); 0140 d->resultPageUrls.remove(firstUrl); 0141 QNetworkRequest request(firstUrl); 0142 QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request); 0143 InternalNetworkAccessManager::instance().setNetworkReplyTimeout(reply); 0144 connect(reply, &QNetworkReply::finished, this, &OnlineSearchBioRxiv::resultPageDone); 0145 } else 0146 stopSearch(resultNoError); 0147 } 0148 0149 refreshBusyProperty(); 0150 } 0151 0152 0153 void OnlineSearchBioRxiv::bibTeXDownloadDone() { 0154 Q_EMIT progress(++curStep, numSteps); 0155 QNetworkReply *reply = static_cast<QNetworkReply *>(sender()); 0156 0157 if (handleErrors(reply)) { 0158 /// ensure proper treatment of UTF-8 characters 0159 const QString bibTeXcode = QString::fromUtf8(reply->readAll().constData()); 0160 0161 if (!bibTeXcode.isEmpty()) { 0162 FileImporterBibTeX importer(this); 0163 File *bibtexFile = importer.fromString(bibTeXcode); 0164 0165 if (bibtexFile != nullptr) { 0166 for (const auto &element : const_cast<const File &>(*bibtexFile)) { 0167 QSharedPointer<Entry> entry = element.dynamicCast<Entry>(); 0168 publishEntry(entry); 0169 } 0170 0171 delete bibtexFile; 0172 } else { 0173 qCWarning(LOG_KBIBTEX_NETWORKING) << "No valid BibTeX file results returned on request on" << InternalNetworkAccessManager::removeApiKey(reply->url()).toDisplayString(); 0174 } 0175 } 0176 } 0177 0178 if (d->resultPageUrls.isEmpty()) 0179 stopSearch(resultNoError); 0180 else { 0181 const QUrl firstUrl = *d->resultPageUrls.constBegin(); 0182 d->resultPageUrls.remove(firstUrl); 0183 QNetworkRequest request(firstUrl); 0184 QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request); 0185 InternalNetworkAccessManager::instance().setNetworkReplyTimeout(reply); 0186 connect(reply, &QNetworkReply::finished, this, &OnlineSearchBioRxiv::resultPageDone); 0187 } 0188 0189 refreshBusyProperty(); 0190 }