File indexing completed on 2024-11-24 04:34:30

0001 /***************************************************************************
0002  *   SPDX-License-Identifier: GPL-2.0-or-later
0003  *                                                                         *
0004  *   SPDX-FileCopyrightText: 2016-2020 Thomas Fischer <fischer@unix-ag.uni-kl.de>
0005  *                                                                         *
0006  *   This program is free software; you can redistribute it and/or modify  *
0007  *   it under the terms of the GNU General Public License as published by  *
0008  *   the Free Software Foundation; either version 2 of the License, or     *
0009  *   (at your option) any later version.                                   *
0010  *                                                                         *
0011  *   This program is distributed in the hope that it will be useful,       *
0012  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0013  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0014  *   GNU General Public License for more details.                          *
0015  *                                                                         *
0016  *   You should have received a copy of the GNU General Public License     *
0017  *   along with this program; if not, see <https://www.gnu.org/licenses/>. *
0018  ***************************************************************************/
0019 
0020 #include "onlinesearchbiorxiv.h"
0021 
0022 #include <QNetworkRequest>
0023 #include <QNetworkReply>
0024 #include <QRegularExpression>
0025 
0026 #include <KLocalizedString>
0027 
0028 #include <FileImporterBibTeX>
0029 #include "internalnetworkaccessmanager.h"
0030 #include "logging_networking.h"
0031 
0032 class OnlineSearchBioRxiv::Private
0033 {
0034 public:
0035     QSet<QUrl> resultPageUrls;
0036 
0037     explicit Private(OnlineSearchBioRxiv *)
0038     {
0039         /// nothing
0040     }
0041 };
0042 
0043 OnlineSearchBioRxiv::OnlineSearchBioRxiv(QObject *parent)
0044         : OnlineSearchAbstract(parent), d(new OnlineSearchBioRxiv::Private(this))
0045 {
0046     /// nothing
0047 }
0048 
0049 OnlineSearchBioRxiv::~OnlineSearchBioRxiv() {
0050     delete d;
0051 }
0052 
0053 void OnlineSearchBioRxiv::startSearch(const QMap<QueryKey, QString> &query, int numResults) {
0054     m_hasBeenCanceled = false;
0055     d->resultPageUrls.clear();
0056     Q_EMIT progress(curStep = 0, numSteps = numResults * 2 + 1);
0057 
0058     QString urlText(QString(QStringLiteral("https://www.biorxiv.org/search/numresults:%1 sort:relevance-rank title_flags:match-phrase format_result:standard ")).arg(numResults));
0059     urlText.append(query[QueryKey::FreeText]);
0060 
0061     bool ok = false;
0062     int year = query[QueryKey::Year].toInt(&ok);
0063     if (ok && year >= 1800 && year < 2100)
0064         urlText.append(QString(QStringLiteral(" limit_from:%1-01-01 limit_to:%1-12-31")).arg(year));
0065 
0066     const QStringList authors = splitRespectingQuotationMarks(query[QueryKey::Author]);
0067     int authorIndex = 1;
0068     for (QStringList::ConstIterator it = authors.constBegin(); it != authors.constEnd(); ++it, ++authorIndex)
0069         urlText.append(QString(QStringLiteral(" author%1:%2")).arg(authorIndex).arg(QString(*it).replace(QStringLiteral(" "), QStringLiteral("+"))));
0070 
0071     const QString title = QString(query[QueryKey::Title]).replace(QStringLiteral(" "), QStringLiteral("+"));
0072     if (!title.isEmpty())
0073         urlText.append(QString(QStringLiteral(" title:%1")).arg(title));
0074 
0075     QNetworkRequest request(QUrl::fromUserInput(urlText));
0076     QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request);
0077     InternalNetworkAccessManager::instance().setNetworkReplyTimeout(reply);
0078     connect(reply, &QNetworkReply::finished, this, &OnlineSearchBioRxiv::resultsPageDone);
0079 
0080     refreshBusyProperty();
0081 }
0082 
0083 QString OnlineSearchBioRxiv::label() const {
0084     return i18n("bioRxiv");
0085 }
0086 
0087 QUrl OnlineSearchBioRxiv::homepage() const {
0088     return QUrl(QStringLiteral("https://www.biorxiv.org/"));
0089 }
0090 
0091 void OnlineSearchBioRxiv::resultsPageDone() {
0092     Q_EMIT progress(++curStep, numSteps);
0093     QNetworkReply *reply = static_cast<QNetworkReply *>(sender());
0094 
0095     if (handleErrors(reply)) {
0096         /// ensure proper treatment of UTF-8 characters
0097         const QString htmlCode = QString::fromUtf8(reply->readAll().constData());
0098 
0099         static const QRegularExpression contentRegExp(QStringLiteral("[^\"]*/content/(early/[12]\\d{3}/[01]\\d/\\d{2}/\\d+|(") + KBibTeX::doiRegExp.pattern() + QStringLiteral("))"));
0100         QRegularExpressionMatchIterator contentRegExpMatchIt = contentRegExp.globalMatch(htmlCode);
0101         while (contentRegExpMatchIt.hasNext()) {
0102             const QRegularExpressionMatch contentRegExpMatch = contentRegExpMatchIt.next();
0103             const QUrl url = QUrl(QStringLiteral("https://www.biorxiv.org") + contentRegExpMatch.captured(0));
0104             d->resultPageUrls.insert(url);
0105         }
0106 
0107         if (d->resultPageUrls.isEmpty())
0108             stopSearch(resultNoError);
0109         else {
0110             const QUrl firstUrl = *d->resultPageUrls.constBegin();
0111             d->resultPageUrls.remove(firstUrl);
0112             QNetworkRequest request(firstUrl);
0113             QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request);
0114             InternalNetworkAccessManager::instance().setNetworkReplyTimeout(reply);
0115             connect(reply, &QNetworkReply::finished, this, &OnlineSearchBioRxiv::resultPageDone);
0116         }
0117     }
0118 
0119     refreshBusyProperty();
0120 }
0121 
0122 void OnlineSearchBioRxiv::resultPageDone() {
0123     Q_EMIT progress(++curStep, numSteps);
0124     QNetworkReply *reply = static_cast<QNetworkReply *>(sender());
0125 
0126     if (handleErrors(reply)) {
0127         /// ensure proper treatment of UTF-8 characters
0128         const QString htmlCode = QString::fromUtf8(reply->readAll().constData());
0129 
0130         static const QRegularExpression highwireRegExp(QStringLiteral("/highwire/citation/\\d+/bibtext"));
0131         const QRegularExpressionMatch highwireRegExpMatch = highwireRegExp.match(htmlCode);
0132         if (highwireRegExpMatch.hasMatch()) {
0133             const QUrl url = QUrl(QStringLiteral("https://www.biorxiv.org") + highwireRegExpMatch.captured(0));
0134             QNetworkRequest request(url);
0135             QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request);
0136             InternalNetworkAccessManager::instance().setNetworkReplyTimeout(reply);
0137             connect(reply, &QNetworkReply::finished, this, &OnlineSearchBioRxiv::bibTeXDownloadDone);
0138         } else if (!d->resultPageUrls.isEmpty()) {
0139             const QUrl firstUrl = *d->resultPageUrls.constBegin();
0140             d->resultPageUrls.remove(firstUrl);
0141             QNetworkRequest request(firstUrl);
0142             QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request);
0143             InternalNetworkAccessManager::instance().setNetworkReplyTimeout(reply);
0144             connect(reply, &QNetworkReply::finished, this, &OnlineSearchBioRxiv::resultPageDone);
0145         } else
0146             stopSearch(resultNoError);
0147     }
0148 
0149     refreshBusyProperty();
0150 }
0151 
0152 
0153 void OnlineSearchBioRxiv::bibTeXDownloadDone() {
0154     Q_EMIT progress(++curStep, numSteps);
0155     QNetworkReply *reply = static_cast<QNetworkReply *>(sender());
0156 
0157     if (handleErrors(reply)) {
0158         /// ensure proper treatment of UTF-8 characters
0159         const QString bibTeXcode = QString::fromUtf8(reply->readAll().constData());
0160 
0161         if (!bibTeXcode.isEmpty()) {
0162             FileImporterBibTeX importer(this);
0163             File *bibtexFile = importer.fromString(bibTeXcode);
0164 
0165             if (bibtexFile != nullptr) {
0166                 for (const auto &element : const_cast<const File &>(*bibtexFile)) {
0167                     QSharedPointer<Entry> entry = element.dynamicCast<Entry>();
0168                     publishEntry(entry);
0169                 }
0170 
0171                 delete bibtexFile;
0172             } else {
0173                 qCWarning(LOG_KBIBTEX_NETWORKING) << "No valid BibTeX file results returned on request on" << InternalNetworkAccessManager::removeApiKey(reply->url()).toDisplayString();
0174             }
0175         }
0176     }
0177 
0178     if (d->resultPageUrls.isEmpty())
0179         stopSearch(resultNoError);
0180     else {
0181         const QUrl firstUrl = *d->resultPageUrls.constBegin();
0182         d->resultPageUrls.remove(firstUrl);
0183         QNetworkRequest request(firstUrl);
0184         QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request);
0185         InternalNetworkAccessManager::instance().setNetworkReplyTimeout(reply);
0186         connect(reply, &QNetworkReply::finished, this, &OnlineSearchBioRxiv::resultPageDone);
0187     }
0188 
0189     refreshBusyProperty();
0190 }