File indexing completed on 2024-05-19 05:05:45
0001 /*************************************************************************** 0002 * SPDX-License-Identifier: GPL-2.0-or-later 0003 * * 0004 * SPDX-FileCopyrightText: 2004-2023 Thomas Fischer <fischer@unix-ag.uni-kl.de> 0005 * * 0006 * This program is free software; you can redistribute it and/or modify * 0007 * it under the terms of the GNU General Public License as published by * 0008 * the Free Software Foundation; either version 2 of the License, or * 0009 * (at your option) any later version. * 0010 * * 0011 * This program is distributed in the hope that it will be useful, * 0012 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0014 * GNU General Public License for more details. * 0015 * * 0016 * You should have received a copy of the GNU General Public License * 0017 * along with this program; if not, see <https://www.gnu.org/licenses/>. * 0018 ***************************************************************************/ 0019 0020 #include "urlchecker.h" 0021 0022 #include <QTimer> 0023 #include <QSharedPointer> 0024 #include <QNetworkReply> 0025 #include <QRegularExpression> 0026 #include <QAtomicInteger> 0027 0028 #include <Entry> 0029 #include <FileInfo> 0030 #include "internalnetworkaccessmanager.h" 0031 #include "logging_networking.h" 0032 0033 class UrlChecker::Private 0034 { 0035 private: 0036 UrlChecker *p; 0037 0038 public: 0039 QAtomicInteger<int> busyCounter; 0040 QSet<QUrl> urlsToCheck; 0041 0042 Private(UrlChecker *parent) 0043 : p(parent) 0044 { 0045 /// nothing 0046 } 0047 0048 void queueMoreOrFinish() 0049 { 0050 if ( 0051 #if QT_VERSION >= 0x050e00 0052 busyCounter.loadRelaxed() <= 0 ///< This function was introduced in Qt 5.14. 0053 #else // QT_VERSION < 0x050e00 0054 busyCounter.load() <= 0 0055 #endif // QT_VERSION >= 0x050e00 0056 && urlsToCheck.isEmpty()) { 0057 /// In case there are no running checks and the queue of URLs to check is empty, 0058 /// wait for a brief moment of time, then fire a 'finished' signal. 0059 QTimer::singleShot(100, p, [this]() { 0060 if ( 0061 #if QT_VERSION >= 0x050e00 0062 busyCounter.loadRelaxed() <= 0 ///< This function was introduced in Qt 5.14. 0063 #else // QT_VERSION < 0x050e00 0064 busyCounter.load() <= 0 0065 #endif // QT_VERSION >= 0x050e00 0066 && urlsToCheck.isEmpty()) 0067 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0068 QMetaObject::invokeMethod(p, "finished", Qt::DirectConnection, QGenericReturnArgument()); 0069 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0070 QMetaObject::invokeMethod(p, "finished", Qt::DirectConnection, QMetaMethodReturnArgument()); 0071 #endif 0072 else 0073 /// It should not happen that when this timer is triggered the original condition is violated 0074 qCCritical(LOG_KBIBTEX_NETWORKING) << "This cannot happen:" << 0075 #if QT_VERSION >= 0x050e00 0076 busyCounter.loadRelaxed() 0077 #else // QT_VERSION < 0x050e00 0078 busyCounter.load() 0079 #endif // QT_VERSION >= 0x050e00 0080 << urlsToCheck.count(); 0081 }); 0082 } else { 0083 /// Initiate as many checks as possible 0084 while (!urlsToCheck.isEmpty() && 0085 #if QT_VERSION >= 0x050e00 0086 busyCounter.loadRelaxed() <= 4 ///< This function was introduced in Qt 5.14. 0087 #else // QT_VERSION < 0x050e00 0088 busyCounter.load() <= 4 0089 #endif // QT_VERSION >= 0x050e00 0090 ) 0091 checkNextUrl(); 0092 } 0093 } 0094 0095 void checkNextUrl() 0096 { 0097 /// Immediately return if there are no URLs to check 0098 if (urlsToCheck.isEmpty()) return; 0099 /// Pop one URL from set of URLS to check 0100 auto firstUrlIt = urlsToCheck.begin(); 0101 const QUrl url = *firstUrlIt; 0102 urlsToCheck.erase(firstUrlIt); 0103 0104 QNetworkRequest request(url); 0105 request.setAttribute(QNetworkRequest::RedirectPolicyAttribute, QNetworkRequest::NoLessSafeRedirectPolicy); 0106 QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request); 0107 busyCounter.ref(); 0108 QObject::connect(reply, &QNetworkReply::finished, p, [this, reply]() { 0109 const QUrl url = reply->url(); 0110 if (reply->error() != QNetworkReply::NoError) { 0111 /// Instead of an 'emit' ... 0112 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0113 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::NetworkError), Q_ARG(QString, reply->errorString())); 0114 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0115 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::NetworkError), Q_ARG(QString, reply->errorString())); 0116 #endif 0117 qCWarning(LOG_KBIBTEX_NETWORKING) << "NetworkError:" << reply->errorString() << url.toDisplayString(); 0118 } else { 0119 const QByteArray data = reply->read(1024); 0120 if (data.isEmpty()) { 0121 /// Instead of an 'emit' ... 0122 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0123 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnknownError), Q_ARG(QString, QStringLiteral("No data received"))); 0124 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0125 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnknownError), Q_ARG(QString, QStringLiteral("No data received"))); 0126 #endif 0127 qCWarning(LOG_KBIBTEX_NETWORKING) << "UnknownError: No data received" << url.toDisplayString(); 0128 } else { 0129 const QString filename = url.fileName().toLower(); 0130 const bool filenameSuggestsHTML = filename.isEmpty() || filename.endsWith(QStringLiteral(".html")) || filename.endsWith(QStringLiteral(".htm")); 0131 const bool filenameSuggestsPDF = filename.endsWith(QStringLiteral(".pdf")); 0132 const bool filenameSuggestsPostScript = filename.endsWith(QStringLiteral(".ps")); 0133 const bool containsHTML = data.contains("<!DOCTYPE HTML") || data.contains("<html") || data.contains("<HTML") || data.contains("<body") || data.contains("<BODY"); 0134 const bool containsPDF = data.startsWith("%PDF"); 0135 const bool containsPostScript = data.startsWith("%!"); 0136 if (filenameSuggestsPDF && containsPDF) { 0137 /// Instead of an 'emit' ... 0138 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0139 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString())); 0140 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0141 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString())); 0142 #endif 0143 qCWarning(LOG_KBIBTEX_NETWORKING) << "UrlValid: Looks and smells like a PDF" << url.toDisplayString(); 0144 } else if (filenameSuggestsPostScript && containsPostScript) { 0145 /// Instead of an 'emit' ... 0146 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0147 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString())); 0148 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0149 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString())); 0150 #endif 0151 qCWarning(LOG_KBIBTEX_NETWORKING) << "UrlValid: Looks and smells like a PostScript" << url.toDisplayString(); 0152 } else if (containsHTML) { 0153 static const QRegularExpression error40X(QStringLiteral("\\b(40\\d)\\b")); 0154 const QRegularExpressionMatch error40Xmatch = error40X.match(QString::fromUtf8(data)); 0155 if (error40Xmatch.hasMatch()) { 0156 /// Instead of an 'emit' ... 0157 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0158 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::Error40X), Q_ARG(QString, QString(QStringLiteral("Got error %1")).arg(error40Xmatch.captured(1)))); 0159 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0160 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::Error40X), Q_ARG(QString, QString(QStringLiteral("Got error %1")).arg(error40Xmatch.captured(1)))); 0161 #endif 0162 qCWarning(LOG_KBIBTEX_NETWORKING) << "Error" << error40Xmatch.captured(1) << "in" << url.toDisplayString(); 0163 } else if (filenameSuggestsHTML) { 0164 /// Instead of an 'emit' ... 0165 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0166 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString())); 0167 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0168 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString())); 0169 #endif 0170 qCWarning(LOG_KBIBTEX_NETWORKING) << "UrlValid: Looks and smells like a HTML" << url.toDisplayString(); 0171 } else { 0172 /// Instead of an 'emit' ... 0173 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0174 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnexpectedFileType), Q_ARG(QString, QStringLiteral("Filename's extension does not match content"))); 0175 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0176 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnexpectedFileType), Q_ARG(QString, QStringLiteral("Filename's extension does not match content"))); 0177 #endif 0178 qCWarning(LOG_KBIBTEX_NETWORKING) << "NotExpectedFileType (HTML): Filename's extension does not match content" << url.toDisplayString(); 0179 } 0180 } else if (filenameSuggestsPDF != containsPDF) { 0181 /// Instead of an 'emit' ... 0182 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0183 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnexpectedFileType), Q_ARG(QString, QStringLiteral("Filename's extension does not match content"))); 0184 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0185 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnexpectedFileType), Q_ARG(QString, QStringLiteral("Filename's extension does not match content"))); 0186 #endif 0187 qCWarning(LOG_KBIBTEX_NETWORKING) << "NotExpectedFileType (PDF): Filename's extension does not match content" << url.toDisplayString(); 0188 } else if (filenameSuggestsPostScript != containsPostScript) { 0189 /// Instead of an 'emit' ... 0190 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0191 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnexpectedFileType), Q_ARG(QString, QStringLiteral("Filename's extension does not match content"))); 0192 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0193 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UnexpectedFileType), Q_ARG(QString, QStringLiteral("Filename's extension does not match content"))); 0194 #endif 0195 qCWarning(LOG_KBIBTEX_NETWORKING) << "NotExpectedFileType (PostScript): Filename's extension does not match content" << url.toDisplayString(); 0196 } else { 0197 /// Instead of an 'emit' ... 0198 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0199 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString())); 0200 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0201 QMetaObject::invokeMethod(p, "urlChecked", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(QUrl, url), Q_ARG(UrlChecker::Status, UrlChecker::Status::UrlValid), Q_ARG(QString, QString())); 0202 #endif 0203 qCWarning(LOG_KBIBTEX_NETWORKING) << "UrlValid: Cannot see any issued with this URL" << url.toDisplayString(); 0204 } 0205 } 0206 } 0207 busyCounter.deref(); 0208 queueMoreOrFinish(); 0209 }); 0210 } 0211 }; 0212 0213 UrlChecker::UrlChecker(QObject *parent) 0214 : QObject(parent), d(new Private(this)) 0215 { 0216 /// nothing 0217 } 0218 0219 UrlChecker::~UrlChecker() 0220 { 0221 delete d; 0222 } 0223 0224 void UrlChecker::startChecking(const File &bibtexFile) 0225 { 0226 if (bibtexFile.count() < 1) { 0227 /// Nothing to do for empty bibliographies 0228 QTimer::singleShot(100, this, [this]() { 0229 Q_EMIT finished(); 0230 }); 0231 return; 0232 } 0233 0234 for (const QSharedPointer<Element> &element : bibtexFile) { 0235 /// Process only entries, not comments, preambles or macros 0236 const QSharedPointer<Entry> entry = element.dynamicCast<Entry>(); 0237 if (entry.isNull()) continue; 0238 0239 /// Retrieve set of URLs per entry and add to set of URLS to be checked 0240 const QSet<QUrl> thisEntryUrls = FileInfo::entryUrls(entry, bibtexFile.property(File::Url).toUrl(), FileInfo::TestExistence::No); 0241 for (const QUrl &u : thisEntryUrls) 0242 d->urlsToCheck.insert(u); ///< better? 0243 } 0244 0245 if (d->urlsToCheck.isEmpty()) { 0246 /// No URLs identified in bibliography, so nothing to do 0247 QTimer::singleShot(100, this, [this]() { 0248 Q_EMIT finished(); 0249 }); 0250 return; 0251 } 0252 0253 d->queueMoreOrFinish(); 0254 }