File indexing completed on 2024-05-19 05:05:45

0001 /***************************************************************************
0002  *   SPDX-License-Identifier: GPL-2.0-or-later
0003  *                                                                         *
0004  *   SPDX-FileCopyrightText: 2004-2023 Thomas Fischer <fischer@unix-ag.uni-kl.de>
0005  *                                                                         *
0006  *   This program is free software; you can redistribute it and/or modify  *
0007  *   it under the terms of the GNU General Public License as published by  *
0008  *   the Free Software Foundation; either version 2 of the License, or     *
0009  *   (at your option) any later version.                                   *
0010  *                                                                         *
0011  *   This program is distributed in the hope that it will be useful,       *
0012  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0013  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0014  *   GNU General Public License for more details.                          *
0015  *                                                                         *
0016  *   You should have received a copy of the GNU General Public License     *
0017  *   along with this program; if not, see <https://www.gnu.org/licenses/>. *
0018  ***************************************************************************/
0019 
0020 #include "urlchecker.h"
0021 
0022 #include <QTimer>
0023 #include <QSharedPointer>
0024 #include <QNetworkReply>
0025 #include <QRegularExpression>
0026 #include <QAtomicInteger>
0027 
0028 #include <Entry>
0029 #include <FileInfo>
0030 #include "internalnetworkaccessmanager.h"
0031 #include "logging_networking.h"
0032 
0033 class UrlChecker::Private
0034 {
0035 private:
0036     UrlChecker *p;
0037 
0038 public:
0039     QAtomicInteger<int> busyCounter;
0040     QSet<QUrl> urlsToCheck;
0041 
0042     Private(UrlChecker *parent)
0043             : p(parent)
0044     {
0045         /// nothing
0046     }
0047 
0048     void queueMoreOrFinish()
0049     {
0050         if (
0051 #if QT_VERSION >= 0x050e00
0052             busyCounter.loadRelaxed() <= 0 ///< This function was introduced in Qt 5.14.
0053 #else // QT_VERSION < 0x050e00
0054             busyCounter.load() <= 0
0055 #endif // QT_VERSION >= 0x050e00
0056             && urlsToCheck.isEmpty()) {
0057             /// In case there are no running checks and the queue of URLs to check is empty,
0058             /// wait for a brief moment of time, then fire a 'finished' signal.
0059             QTimer::singleShot(100, p, [this]() {
0060                 if (
0061 #if QT_VERSION >= 0x050e00
0062                     busyCounter.loadRelaxed() <= 0 ///< This function was introduced in Qt 5.14.
0063 #else // QT_VERSION < 0x050e00
0064                     busyCounter.load() <= 0
0065 #endif // QT_VERSION >= 0x050e00
0066                     && urlsToCheck.isEmpty())
0067 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0068                     QMetaObject::invokeMethod(p, "finished", Qt::DirectConnection, QGenericReturnArgument());
0069 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0070                     QMetaObject::invokeMethod(p, "finished", Qt::DirectConnection, QMetaMethodReturnArgument());
0071 #endif
0072                 else
0073                     /// It should not happen that when this timer is triggered the original condition is violated
0074                     qCCritical(LOG_KBIBTEX_NETWORKING) << "This cannot happen:" <<
0075 #if QT_VERSION >= 0x050e00
0076                                                        busyCounter.loadRelaxed()
0077 #else // QT_VERSION < 0x050e00
0078                                                        busyCounter.load()
0079 #endif // QT_VERSION >= 0x050e00
0080                                                        << urlsToCheck.count();
0081             });
0082         } else {
0083             /// Initiate as many checks as possible
0084             while (!urlsToCheck.isEmpty() &&
0085 #if QT_VERSION >= 0x050e00
0086                     busyCounter.loadRelaxed() <= 4 ///< This function was introduced in Qt 5.14.
0087 #else // QT_VERSION < 0x050e00
0088                     busyCounter.load() <= 4
0089 #endif // QT_VERSION >= 0x050e00
0090                   )
0091                 checkNextUrl();
0092         }
0093     }
0094 
0095     void checkNextUrl()
0096     {
0097         /// Immediately return if there are no URLs to check
0098         if (urlsToCheck.isEmpty()) return;
0099         /// Pop one URL from set of URLS to check
0100         auto firstUrlIt = urlsToCheck.begin();
0101         const QUrl url = *firstUrlIt;
0102         urlsToCheck.erase(firstUrlIt);
0103 
0104         QNetworkRequest request(url);
0105         request.setAttribute(QNetworkRequest::RedirectPolicyAttribute, QNetworkRequest::NoLessSafeRedirectPolicy);
0106         QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request);
0107         busyCounter.ref();
0108         QObject::connect(reply, &QNetworkReply::finished, p, [this, reply]() {
0109             const QUrl url = reply->url();
0110             if (reply->error() != QNetworkReply::NoError) {
0111                 /// Instead of an 'emit' ...
0112 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0113                 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::NetworkError), Q_ARG(QString, reply->errorString()));
0114 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0115                 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::NetworkError), Q_ARG(QString, reply->errorString()));
0116 #endif
0117                 qCWarning(LOG_KBIBTEX_NETWORKING) << "NetworkError:" << reply->errorString() << url.toDisplayString();
0118             } else {
0119                 const QByteArray data = reply->read(1024);
0120                 if (data.isEmpty()) {
0121                     /// Instead of an 'emit' ...
0122 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0123                     QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnknownError), Q_ARG(QString, QStringLiteral("No data received")));
0124 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0125                     QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnknownError), Q_ARG(QString, QStringLiteral("No data received")));
0126 #endif
0127                     qCWarning(LOG_KBIBTEX_NETWORKING) << "UnknownError: No data received" << url.toDisplayString();
0128                 } else {
0129                     const QString filename = url.fileName().toLower();
0130                     const bool filenameSuggestsHTML = filename.isEmpty() || filename.endsWith(QStringLiteral(".html")) || filename.endsWith(QStringLiteral(".htm"));
0131                     const bool filenameSuggestsPDF =  filename.endsWith(QStringLiteral(".pdf"));
0132                     const bool filenameSuggestsPostScript =  filename.endsWith(QStringLiteral(".ps"));
0133                     const bool containsHTML = data.contains("<!DOCTYPE HTML") || data.contains("<html") || data.contains("<HTML") || data.contains("<body") || data.contains("<BODY");
0134                     const bool containsPDF = data.startsWith("%PDF");
0135                     const bool containsPostScript = data.startsWith("%!");
0136                     if (filenameSuggestsPDF && containsPDF) {
0137                         /// Instead of an 'emit' ...
0138 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0139                         QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString()));
0140 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0141                         QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString()));
0142 #endif
0143                         qCWarning(LOG_KBIBTEX_NETWORKING) << "UrlValid: Looks and smells like a PDF" << url.toDisplayString();
0144                     } else if (filenameSuggestsPostScript && containsPostScript) {
0145                         /// Instead of an 'emit' ...
0146 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0147                         QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString()));
0148 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0149                         QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString()));
0150 #endif
0151                         qCWarning(LOG_KBIBTEX_NETWORKING) << "UrlValid: Looks and smells like a PostScript" << url.toDisplayString();
0152                     } else if (containsHTML) {
0153                         static const QRegularExpression error40X(QStringLiteral("\\b(40\\d)\\b"));
0154                         const QRegularExpressionMatch error40Xmatch = error40X.match(QString::fromUtf8(data));
0155                         if (error40Xmatch.hasMatch()) {
0156                             /// Instead of an 'emit' ...
0157 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0158                             QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::Error40X), Q_ARG(QString, QString(QStringLiteral("Got error %1")).arg(error40Xmatch.captured(1))));
0159 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0160                             QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::Error40X), Q_ARG(QString, QString(QStringLiteral("Got error %1")).arg(error40Xmatch.captured(1))));
0161 #endif
0162                             qCWarning(LOG_KBIBTEX_NETWORKING) << "Error" << error40Xmatch.captured(1) << "in" << url.toDisplayString();
0163                         } else if (filenameSuggestsHTML) {
0164                             /// Instead of an 'emit' ...
0165 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0166                             QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString()));
0167 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0168                             QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString()));
0169 #endif
0170                             qCWarning(LOG_KBIBTEX_NETWORKING) << "UrlValid: Looks and smells like a HTML" << url.toDisplayString();
0171                         } else {
0172                             /// Instead of an 'emit' ...
0173 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0174                             QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnexpectedFileType), Q_ARG(QString, QStringLiteral("Filename's extension does not match content")));
0175 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0176                             QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnexpectedFileType), Q_ARG(QString, QStringLiteral("Filename's extension does not match content")));
0177 #endif
0178                             qCWarning(LOG_KBIBTEX_NETWORKING) << "NotExpectedFileType (HTML): Filename's extension does not match content" << url.toDisplayString();
0179                         }
0180                     } else if (filenameSuggestsPDF != containsPDF) {
0181                         /// Instead of an 'emit' ...
0182 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0183                         QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnexpectedFileType), Q_ARG(QString, QStringLiteral("Filename's extension does not match content")));
0184 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0185                         QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnexpectedFileType), Q_ARG(QString, QStringLiteral("Filename's extension does not match content")));
0186 #endif
0187                         qCWarning(LOG_KBIBTEX_NETWORKING) << "NotExpectedFileType (PDF): Filename's extension does not match content" << url.toDisplayString();
0188                     } else if (filenameSuggestsPostScript != containsPostScript) {
0189                         /// Instead of an 'emit' ...
0190 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0191                         QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnexpectedFileType), Q_ARG(QString, QStringLiteral("Filename's extension does not match content")));
0192 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0193                         QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnexpectedFileType), Q_ARG(QString, QStringLiteral("Filename's extension does not match content")));
0194 #endif
0195                         qCWarning(LOG_KBIBTEX_NETWORKING) << "NotExpectedFileType (PostScript): Filename's extension does not match content" << url.toDisplayString();
0196                     } else {
0197                         /// Instead of an 'emit' ...
0198 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0199                         QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString()));
0200 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0201                         QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString()));
0202 #endif
0203                         qCWarning(LOG_KBIBTEX_NETWORKING) << "UrlValid: Cannot see any issued with this URL" << url.toDisplayString();
0204                     }
0205                 }
0206             }
0207             busyCounter.deref();
0208             queueMoreOrFinish();
0209         });
0210     }
0211 };
0212 
0213 UrlChecker::UrlChecker(QObject *parent)
0214         : QObject(parent), d(new Private(this))
0215 {
0216     /// nothing
0217 }
0218 
0219 UrlChecker::~UrlChecker()
0220 {
0221     delete d;
0222 }
0223 
0224 void UrlChecker::startChecking(const File &bibtexFile)
0225 {
0226     if (bibtexFile.count() < 1) {
0227         /// Nothing to do for empty bibliographies
0228         QTimer::singleShot(100, this, [this]() {
0229             Q_EMIT finished();
0230         });
0231         return;
0232     }
0233 
0234     for (const QSharedPointer<Element> &element : bibtexFile) {
0235         /// Process only entries, not comments, preambles or macros
0236         const QSharedPointer<Entry> entry = element.dynamicCast<Entry>();
0237         if (entry.isNull()) continue;
0238 
0239         /// Retrieve set of URLs per entry and add to set of URLS to be checked
0240         const QSet<QUrl> thisEntryUrls = FileInfo::entryUrls(entry, bibtexFile.property(File::Url).toUrl(), FileInfo::TestExistence::No);
0241         for (const QUrl &u : thisEntryUrls)
0242             d->urlsToCheck.insert(u); ///< better?
0243     }
0244 
0245     if (d->urlsToCheck.isEmpty()) {
0246         /// No URLs identified in bibliography, so nothing to do
0247         QTimer::singleShot(100, this, [this]() {
0248             Q_EMIT finished();
0249         });
0250         return;
0251     }
0252 
0253     d->queueMoreOrFinish();
0254 }