src/networking/findpdf.cpp

0001 /***************************************************************************
0002  *   SPDX-License-Identifier: GPL-2.0-or-later
0003  *                                                                         *
0004  *   SPDX-FileCopyrightText: 2004-2023 Thomas Fischer <fischer@unix-ag.uni-kl.de>
0005  *                                                                         *
0006  *   This program is free software; you can redistribute it and/or modify  *
0007  *   it under the terms of the GNU General Public License as published by  *
0008  *   the Free Software Foundation; either version 2 of the License, or     *
0009  *   (at your option) any later version.                                   *
0010  *                                                                         *
0011  *   This program is distributed in the hope that it will be useful,       *
0012  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0013  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0014  *   GNU General Public License for more details.                          *
0015  *                                                                         *
0016  *   You should have received a copy of the GNU General Public License     *
0017  *   along with this program; if not, see <https://www.gnu.org/licenses/>. *
0018  ***************************************************************************/
0019
0020 #include "findpdf.h"
0021
0022 #include <QNetworkReply>
0023 #include <QNetworkRequest>
0024 #include <QRegularExpression>
0025 #include <QApplication>
0026 #include <QTemporaryFile>
0027 #include <QUrlQuery>
0028 #include <QStandardPaths>
0029 #include <QDir>
0030 #ifdef HAVE_SCIHUB
0031 #include <QRandomGenerator>
0032 #endif // HAVE_SCIHUB
0033
0034 #ifdef HAVE_POPPLERQT5
0035 #include <poppler-qt5.h>
0036 #else // not HAVE_POPPLERQT5
0037 #ifdef HAVE_POPPLERQT6
0038 #include <poppler-qt6.h>
0039 #endif // HAVE_POPPLERQT6
0040 #endif // HAVE_POPPLERQT5
0041
0042 #include <KBibTeX>
0043 #include <Value>
0044 #include <FileInfo>
0045 #include "internalnetworkaccessmanager.h"
0046 #include "logging_networking.h"
0047
/// Depth budget handed to every URL queued directly by a search; each
/// followed link or redirect is queued with the budget reduced by one
static constexpr int maxDepth = 5;
/// Names of the dynamic properties attached to each network reply
static const char *const depthProperty = "depth";
static const char *const termProperty = "term";
static const char *const originProperty = "origin";
0052
0053
0054 class FindPDF::Private
0055 {
0056 private:
0057     FindPDF *p;
0058
0059     /**
0060      * @brief Remove the fragment part of an URL (i.e. everything starting from '#')
0061      * @param u QUrl which may contain a fragment part or not
0062      * @return QUrl with the fragment part removed if there was any
0063      */
0064     inline QUrl removeFragment(const QUrl &u) {
0065         if (u.hasFragment()) {
0066             QUrl _u{u};
0067             _u.setFragment(QString());
0068             return _u;
0069         } else
0070             return u;
0071     }
0072
0073 public:
0074     int aliveCounter;
0075     QList<ResultItem> result;
0076     Entry currentEntry;
0077     QSet<QUrl> knownUrls;
0078     QSet<QNetworkReply *> runningDownloads;
0079
0080     Private(FindPDF *parent)
0081             : p(parent), aliveCounter(0)
0082     {
0083         /// nothing
0084     }
0085
0086     bool queueUrl(const QUrl &url, const QString &term, const QString &origin, int depth)
0087     {
0088         const QUrl sanitizedUrl{removeFragment(url)};
0089
0090         if (!knownUrls.contains(sanitizedUrl) && depth > 0) {
0091             knownUrls.insert(sanitizedUrl);
0092             QNetworkRequest request = QNetworkRequest(sanitizedUrl);
0093             QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request);
0094             InternalNetworkAccessManager::instance().setNetworkReplyTimeout(reply, 15); ///< set a timeout on network connections
0095             reply->setProperty(depthProperty, QVariant::fromValue<int>(depth));
0096             reply->setProperty(termProperty, term);
0097             reply->setProperty(originProperty, origin);
0098             runningDownloads.insert(reply);
0099             connect(reply, &QNetworkReply::finished, p, &FindPDF::downloadFinished);
0100             ++aliveCounter;
0101             return true;
0102         } else
0103             return false;
0104     }
0105
0106     void processGeneralHTML(QNetworkReply *reply, const QString &text)
0107     {
0108         /// fetch some properties from Reply object
0109         const QString term = reply->property(termProperty).toString();
0110         const QString origin = reply->property(originProperty).toString();
0111         const QUrl baseUrl{reply->url()};
0112         bool ok = false;
0113         int depth = reply->property(depthProperty).toInt(&ok);
0114         if (!ok) depth = 0;
0115
0116         /// regular expressions to guess links to follow
0117         const QVector<QRegularExpression> specificAnchorRegExp = {
0118             QRegularExpression(QString(QStringLiteral("<a[^>]*href=\"([^\"]*%1[^\"]*[.]pdf)\"")).arg(QRegularExpression::escape(term)), QRegularExpression::CaseInsensitiveOption),
0119             QRegularExpression(QString(QStringLiteral("<a[^>]*href=\"([^\"]+)\"[^>]*>[^<]*%1[^<]*[.]pdf")).arg(QRegularExpression::escape(term)), QRegularExpression::CaseInsensitiveOption),
0120             QRegularExpression(QString(QStringLiteral("<a[^>]*href=\"([^\"]*%1[^\"]*)\"")).arg(QRegularExpression::escape(term)), QRegularExpression::CaseInsensitiveOption),
0121             QRegularExpression(QString(QStringLiteral("<a[^>]*href=\"([^\"]+)\"[^>]*>[^<]*%1[^<]*\\b")).arg(QRegularExpression::escape(term)), QRegularExpression::CaseInsensitiveOption)
0122         };
0123         static const QRegularExpression genericAnchorRegExp = QRegularExpression(QStringLiteral("<a[^>]*href=\"([^\"]+)\""), QRegularExpression::CaseInsensitiveOption);
0124
0125         bool gotLink = false;
0126         for (const QRegularExpression &anchorRegExp : specificAnchorRegExp) {
0127             const QRegularExpressionMatch match = anchorRegExp.match(text);
0128             if (match.hasMatch()) {
0129                 const QUrl url = QUrl::fromEncoded(match.captured(1).toLatin1());
0130                 queueUrl(baseUrl.resolved(url), term, origin, depth - 1);
0131                 gotLink = true;
0132                 break;
0133             }
0134         }
0135
0136         if (!gotLink) {
0137             /// this is only the last resort:
0138             /// to follow the first link found in the HTML document
0139             const QRegularExpressionMatch match = genericAnchorRegExp.match(text);
0140             if (match.hasMatch()) {
0141                 const QUrl url = QUrl::fromEncoded(match.captured(1).toLatin1());
0142                 queueUrl(baseUrl.resolved(url), term, origin, depth - 1);
0143             }
0144         }
0145
0146         if (baseUrl.isValid()) {
0147             int p1 = text.indexOf(QStringLiteral("<embed"));
0148             while (p1 > 0) {
0149                 const int p2 = text.indexOf(QStringLiteral(">"), p1 + 5);
0150                 if (p2 > p1) {
0151                     const int p3 = text.indexOf(QStringLiteral("type=\"application/pdf\""), p1 + 5);
0152                     if (p3 > p1 && p3 < p2) {
0153                         const int p4 = text.indexOf(QStringLiteral("src=\""), p1 + 5);
0154                         if (p4 > p1 && p4 < p2) {
0155                             const int p5 = text.indexOf(QStringLiteral("\""), p4 + 5);
0156                             if (p5 > p4 && p5 < p2) {
0157                                 const QString src{text.mid(p4 + 5, p5 - p4)};
0158                                 QUrl nextUrl;
0159                                 if (src.startsWith(QStringLiteral("http")))
0160                                     nextUrl = QUrl::fromUserInput(src);
0161                                 else if (src.startsWith(QStringLiteral("//")))
0162                                     nextUrl = QUrl::fromUserInput(baseUrl.scheme() + QStringLiteral(":") + src);
0163                                 else if (src.startsWith(QStringLiteral("/")))
0164                                     nextUrl = QUrl::fromUserInput(baseUrl.scheme() + QStringLiteral("://") + baseUrl.host() + src);
0165                                 else
0166                                     nextUrl = baseUrl.resolved(QUrl(src));
0167                                 queueUrl(nextUrl, term, origin, depth - 1);
0168                             }
0169                         }
0170                     }
0171                     p1 = text.indexOf(QStringLiteral("<embed"), p2);
0172                 } else
0173                     break;
0174             }
0175         }
0176     }
0177
0178     void processGoogleResult(QNetworkReply *reply, const QString &text)
0179     {
0180         static const QString h3Tag(QStringLiteral("<h3"));
0181         static const QString aTag(QStringLiteral("<a"));
0182         static const QString hrefAttrib(QStringLiteral("href=\""));
0183
0184         const QString term = reply->property(termProperty).toString();
0185         bool ok = false;
0186         int depth = reply->property(depthProperty).toInt(&ok);
0187         if (!ok) depth = 0;
0188
0189         /// extract the first numHitsToFollow-many hits found by Google Scholar
0190         const int numHitsToFollow = 10;
0191         int p = -1;
0192         for (int i = 0; i < numHitsToFollow; ++i) {
0193             if ((p = text.indexOf(h3Tag, p + 1)) >= 0 && (p = text.indexOf(aTag, p + 1)) >= 0 && (p = text.indexOf(hrefAttrib, p + 1)) >= 0) {
0194                 int p1 = p + 6;
0195                 int p2 = text.indexOf(QLatin1Char('"'), p1 + 1);
0196                 QUrl url(text.mid(p1, p2 - p1));
0197                 const QString googleService = reply->url().host().contains(QStringLiteral("scholar.google")) ? QStringLiteral("scholar.google") : QStringLiteral("www.google");
0198                 queueUrl(reply->url().resolved(url), term, googleService, depth - 1);
0199             }
0200         }
0201     }
0202
0203     void processSpringerLink(QNetworkReply *reply, const QString &text)
0204     {
0205         static const QRegularExpression fulltextPDFlink(QStringLiteral("href=\"([^\"]+/fulltext.pdf)\""));
0206         const QRegularExpressionMatch match = fulltextPDFlink.match(text);
0207         if (match.hasMatch()) {
0208             bool ok = false;
0209             int depth = reply->property(depthProperty).toInt(&ok);
0210             if (!ok) depth = 0;
0211
0212             const QUrl url(match.captured(1));
0213             queueUrl(reply->url().resolved(url), QString(), QStringLiteral("springerlink"), depth - 1);
0214         }
0215     }
0216
0217     void processCiteSeerX(QNetworkReply *reply, const QString &text)
0218     {
0219         static const QRegularExpression downloadPDFlink(QStringLiteral("href=\"(/viewdoc/download[^\"]+type=pdf)\""));
0220         const QRegularExpressionMatch match = downloadPDFlink.match(text);
0221         if (match.hasMatch()) {
0222             bool ok = false;
0223             int depth = reply->property(depthProperty).toInt(&ok);
0224             if (!ok) depth = 0;
0225
0226             const QUrl url = QUrl::fromEncoded(match.captured(1).toLatin1());
0227             queueUrl(reply->url().resolved(url), QString(), QStringLiteral("citeseerx"), depth - 1);
0228         }
0229     }
0230
0231     void processACMDigitalLibrary(QNetworkReply *reply, const QString &text)
0232     {
0233         static const QRegularExpression downloadPDFlink(QStringLiteral("href=\"(ft_gateway.cfm\\?id=\\d+&ftid=\\d+&dwn=1&CFID=\\d+&CFTOKEN=\\d+)\""));
0234         const QRegularExpressionMatch match = downloadPDFlink.match(text);
0235         if (match.hasMatch()) {
0236             bool ok = false;
0237             int depth = reply->property(depthProperty).toInt(&ok);
0238             if (!ok) depth = 0;
0239
0240             const QUrl url = QUrl::fromEncoded(match.captured(1).toLatin1());
0241             queueUrl(reply->url().resolved(url), QString(), QStringLiteral("acmdl"), depth - 1);
0242         }
0243     }
0244
0245     bool processPDF(QNetworkReply *reply, const QByteArray &data)
0246     {
0247         bool progress = false;
0248         const QString origin = reply->property(originProperty).toString();
0249         const QUrl url = reply->url();
0250
0251         /// Search for duplicate URLs
0252         bool containsUrl = false;
0253         for (const ResultItem &ri :  const_cast<const QList<ResultItem> &>(result)) {
0254             containsUrl |= ri.url == url;
0255             /// Skip already visited URLs
0256             if (containsUrl) break;
0257         }
0258
0259         if (!containsUrl) {
0260 #ifdef HAVE_POPPLERQT5
0261             QScopedPointer<Poppler::Document> doc(Poppler::Document::loadFromData(data));
0262 #else // not HAVE_POPPLERQT5
0263 #ifdef HAVE_POPPLERQT6
0264             std::unique_ptr<Poppler::Document> doc = Poppler::Document::loadFromData(data);
0265 #endif // HAVE_POPPLERQT6
0266 #endif // HAVE_POPPLERQT5
0267
0268             ResultItem resultItem;
0269             resultItem.tempFilename = new QTemporaryFile(QStandardPaths::writableLocation(QStandardPaths::TempLocation) + QDir::separator() + QStringLiteral("kbibtex_findpdf_XXXXXX.pdf"));
0270             resultItem.tempFilename->setAutoRemove(true);
0271             if (resultItem.tempFilename->open()) {
0272                 const int lenDataWritten = static_cast<int>(resultItem.tempFilename->write(data));
0273                 resultItem.tempFilename->close();
0274                 if (lenDataWritten != data.length()) {
0275                     /// Failed to write to temporary file
0276                     qCWarning(LOG_KBIBTEX_NETWORKING) << "Failed to write to temporary file for filename" << resultItem.tempFilename->fileName();
0277                     delete resultItem.tempFilename;
0278                     resultItem.tempFilename = nullptr;
0279                 }
0280             } else {
0281                 /// Failed to create temporary file
0282                 qCWarning(LOG_KBIBTEX_NETWORKING) << "Failed to create temporary file for templaet" << resultItem.tempFilename->fileTemplate();
0283                 delete resultItem.tempFilename;
0284                 resultItem.tempFilename = nullptr;
0285             }
0286             resultItem.url = url;
0287             resultItem.textPreview = doc->info(QStringLiteral("Title")).simplified();
0288             static const int maxTextLen = 1024;
0289             for (int i = 0; i < doc->numPages() && resultItem.textPreview.length() < maxTextLen; ++i) {
0290 #ifdef HAVE_POPPLERQT5
0291                 QScopedPointer<Poppler::Page> page(doc->page(i));
0292 #else // not HAVE_POPPLERQT5
0293 #ifdef HAVE_POPPLERQT6
0294                 std::unique_ptr<Poppler::Page> page = doc->page(i);
0295 #endif // HAVE_POPPLERQT6
0296 #endif // HAVE_POPPLERQT5
0297                 if (!resultItem.textPreview.isEmpty()) resultItem.textPreview += QLatin1Char(' ');
0298                 resultItem.textPreview += QStringView{page->text(QRect()).simplified()}.left(maxTextLen);
0299             }
0300             resultItem.textPreview.remove(QStringLiteral("Microsoft Word - ")); ///< Some word processors need to put their name everywhere ...
0301             resultItem.downloadMode = DownloadMode::No;
0302             resultItem.relevance = origin == Entry::ftDOI ? 1.0 : (origin == QStringLiteral("eprint") ? 0.75 : 0.5);
0303             result << resultItem;
0304             progress = true;
0305         }
0306
0307         return progress;
0308     }
0309
0310     QUrl ieeeDocumentUrlToDownloadUrl(const QUrl &url) {
0311         /// Basic checking if provided URL is from IEEE Xplore
0312         if (!url.host().contains(QStringLiteral("ieeexplore.ieee.org")))
0313             return url;
0314
0315         /// Assuming URL looks like this:
0316         ///    https://ieeexplore.ieee.org/document/8092651
0317         static const QRegularExpression documentIdRegExp(QStringLiteral("/(\\d{6,})[/]?$"));
0318         const QRegularExpressionMatch documentIdRegExpMatch = documentIdRegExp.match(url.path());
0319         if (!documentIdRegExpMatch.hasMatch())
0320             return url;
0321
0322         /// Use document id extracted above to build URL to PDF file
0323         return QUrl(QStringLiteral("https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=") + documentIdRegExpMatch.captured(1));
0324     }
0325 };
0326
0327 FindPDF::FindPDF(QObject *parent)
0328         : QObject(parent), d(new Private(this))
0329 {
0330     /// nothing
0331 }
0332
0333 FindPDF::~FindPDF()
0334 {
0335     abort();
0336     delete d;
0337 }
0338
0339 bool FindPDF::search(const Entry &entry)
0340 {
0341     if (d->aliveCounter > 0) return false;
0342
0343     d->knownUrls.clear();
0344     d->result.clear();
0345     d->currentEntry = entry;
0346
0347     Q_EMIT progress(0, d->aliveCounter, 0);
0348
0349     /// Generate a string which contains the title's beginning
0350     QString searchWords;
0351     if (entry.contains(Entry::ftTitle)) {
0352 #if QT_VERSION >= 0x050e00
0353         const QStringList titleChunks = PlainTextValue::text(entry.value(Entry::ftTitle)).split(QStringLiteral(" "), Qt::SkipEmptyParts);
0354 #else // QT_VERSION < 0x050e00
0355         const QStringList titleChunks = PlainTextValue::text(entry.value(Entry::ftTitle)).split(QStringLiteral(" "), QString::SkipEmptyParts);
0356 #endif // QT_VERSION >= 0x050e00
0357         if (!titleChunks.isEmpty()) {
0358             searchWords = titleChunks[0];
0359             for (int i = 1; i < titleChunks.count() && searchWords.length() < 64; ++i)
0360                 searchWords += QLatin1Char(' ') + titleChunks[i];
0361         }
0362     }
0363     const QStringList authors = entry.authorsLastName();
0364     for (int i = 0; i < authors.count() && searchWords.length() < 96; ++i)
0365         searchWords += QLatin1Char(' ') + authors[i];
0366
0367     searchWords.remove(QLatin1Char('{')).remove(QLatin1Char('}'));
0368
0369     QStringList urlFields {Entry::ftDOI, Entry::ftUrl, QStringLiteral("ee")};
0370     for (int i = 2; i < 256; ++i)
0371         urlFields << QString(QStringLiteral("%1%2")).arg(Entry::ftDOI).arg(i) << QString(QStringLiteral("%1%2")).arg(Entry::ftUrl).arg(i);
0372     for (const QString &field : const_cast<const QStringList &>(urlFields)) {
0373         if (entry.contains(field)) {
0374             const QString fieldText = PlainTextValue::text(entry.value(field));
0375             QRegularExpressionMatchIterator doiRegExpMatchIt = KBibTeX::doiRegExp.globalMatch(fieldText);
0376             while (doiRegExpMatchIt.hasNext()) {
0377                 const QRegularExpressionMatch doiRegExpMatch = doiRegExpMatchIt.next();
0378                 const QString doiNumber{doiRegExpMatch.captured(QStringLiteral("doi"))};
0379                 d->queueUrl(QUrl(KBibTeX::doiUrlPrefix + doiNumber), fieldText, Entry::ftDOI, maxDepth);
0380 #ifdef HAVE_SCIHUB
0381                 static const QVector<QString> sciHubUrlPrefixes {{QStringLiteral("https://sci-hub.se/")}, {QStringLiteral("https://sci-hub.st/")}, {QStringLiteral("https://sci-hub.ru/")}};
0382                 d->queueUrl(QUrl::fromUserInput(sciHubUrlPrefixes[QRandomGenerator::global()->bounded(sciHubUrlPrefixes.length())] + doiNumber), fieldText, Entry::ftDOI, maxDepth);
0383 #endif // HAVE_SCIHUB
0384             }
0385
0386             QRegularExpressionMatchIterator urlRegExpMatchIt = KBibTeX::urlRegExp.globalMatch(fieldText);
0387             while (urlRegExpMatchIt.hasNext()) {
0388                 QRegularExpressionMatch urlRegExpMatch = urlRegExpMatchIt.next();
0389                 d->queueUrl(QUrl(urlRegExpMatch.captured(0)), searchWords, Entry::ftUrl, maxDepth);
0390             }
0391         }
0392     }
0393
0394     if (entry.contains(QStringLiteral("eprint"))) {
0395         /// check eprint fields as used for arXiv
0396         const QString eprintId = PlainTextValue::text(entry.value(QStringLiteral("eprint")));
0397         if (!eprintId.isEmpty()) {
0398             const QUrl arxivUrl = QUrl::fromUserInput(QStringLiteral("https://arxiv.org/search/advanced?terms-0-term=") + eprintId + QStringLiteral("&terms-0-field=report_num&size=50&order=-announced_date_first"));
0399             d->queueUrl(arxivUrl, eprintId, QStringLiteral("eprint"), maxDepth);
0400         }
0401     }
0402
0403     if (!searchWords.isEmpty()) {
0404         /// Search in Google
0405         const QUrl googleUrl = QUrl::fromUserInput(QStringLiteral("https://www.google.com/search?hl=en&sa=G&q=filetype:pdf ") + searchWords);
0406         d->queueUrl(googleUrl, searchWords, QStringLiteral("www.google"), maxDepth);
0407
0408         /// Search in Google Scholar
0409         const QUrl googleScholarUrl = QUrl::fromUserInput(QStringLiteral("https://scholar.google.com/scholar?hl=en&btnG=Search&as_sdt=1&q=filetype:pdf ") + searchWords);
0410         d->queueUrl(googleScholarUrl, searchWords, QStringLiteral("scholar.google"), maxDepth);
0411
0412         /// Search in Bing
0413         const QUrl bingUrl = QUrl::fromUserInput(QStringLiteral("https://www.bing.com/search?setlang=en-US&q=filetype:pdf ") + searchWords);
0414         d->queueUrl(bingUrl, searchWords, QStringLiteral("bing"), maxDepth);
0415
0416         /// Search in CiteSeerX
0417         const QUrl citeseerXurl = QUrl::fromUserInput(QStringLiteral("https://citeseerx.ist.psu.edu/search?submit=Search&sort=rlv&t=doc&q=") + searchWords);
0418         d->queueUrl(citeseerXurl, searchWords, QStringLiteral("citeseerx"), maxDepth);
0419
0420         /// Search in StartPage
0421         const QUrl startPageUrl = QUrl::fromUserInput(QStringLiteral("https://www.startpage.com/do/asearch?cat=web&cmd=process_search&language=english&engine0=v1all&abp=-1&t=white&nj=1&prf=23ad6aab054a88d3da5c443280cee596&suggestOn=0&query=filetype:pdf ") + searchWords);
0422         d->queueUrl(startPageUrl, searchWords, QStringLiteral("startpage"), maxDepth);
0423     }
0424
0425     if (d->aliveCounter == 0) {
0426         qCWarning(LOG_KBIBTEX_NETWORKING) << "Directly at start, no URLs are queue for a search -> this should never happen";
0427         Q_EMIT finished();
0428     }
0429
0430     return true;
0431 }
0432
0433 QList<FindPDF::ResultItem> FindPDF::results()
0434 {
0435     if (d->aliveCounter == 0)
0436         return d->result;
0437     else {
0438         /// Return empty list while search is running
0439         return QList<FindPDF::ResultItem>();
0440     }
0441 }
0442
0443 void FindPDF::abort() {
0444     QSet<QNetworkReply *>::Iterator it = d->runningDownloads.begin();
0445     while (it != d->runningDownloads.end()) {
0446         QNetworkReply *reply = *it;
0447         it = d->runningDownloads.erase(it);
0448         reply->abort();
0449     }
0450 }
0451
0452 void FindPDF::downloadFinished()
0453 {
0454     static const char *htmlHead1 = "<html", *htmlHead2 = "<HTML", *htmlHead3 = "<!doctype html>" /** ACM Digital Library */;
0455     static const char *pdfHead = "%PDF-";
0456
0457     --d->aliveCounter;
0458     Q_EMIT progress(d->knownUrls.count(), d->aliveCounter, d->result.count());
0459
0460     QNetworkReply *reply = static_cast<QNetworkReply *>(sender());
0461     d->runningDownloads.remove(reply);
0462     const QString term = reply->property(termProperty).toString();
0463     const QString origin = reply->property(originProperty).toString();
0464     bool depthOk = false;
0465     int depth = reply->property(depthProperty).toInt(&depthOk);
0466     if (!depthOk) depth = 0;
0467
0468     if (reply->error() == QNetworkReply::NoError) {
0469         const QByteArray data = reply->readAll();
0470
0471         QUrl redirUrl = reply->attribute(QNetworkRequest::RedirectionTargetAttribute).toUrl();
0472         redirUrl = redirUrl.isValid() ? reply->url().resolved(redirUrl) : QUrl();
0473         qCDebug(LOG_KBIBTEX_NETWORKING) << "finished Downloading " << reply->url().toDisplayString() << "   depth=" << depth  << "  d->aliveCounter=" << d->aliveCounter << "  data.size=" << data.size() << "  redirUrl=" << redirUrl.toDisplayString() << "   origin=" << origin;
0474
0475         if (redirUrl.isValid()) {
0476             redirUrl = d->ieeeDocumentUrlToDownloadUrl(redirUrl);
0477             d->queueUrl(redirUrl, term, origin, depth - 1);
0478         } else if (data.contains(htmlHead1) || data.contains(htmlHead2) || data.contains(htmlHead3)) {
0479             /// returned data is a HTML file, i.e. contains "<html"
0480
0481             /// check for limited depth before continuing
0482             if (depthOk && depth > 0) {
0483                 /// Get webpage as plain text
0484                 /// Assume UTF-8 data
0485                 const QString text = QString::fromUtf8(data.constData());
0486
0487                 /// regular expression to check if this is a Google Scholar result page
0488                 static const QRegularExpression googleScholarTitleRegExp(QStringLiteral("<title>[^>]* - Google Scholar</title>"));
0489                 /// regular expression to check if this is a SpringerLink page
0490                 static const QRegularExpression springerLinkTitleRegExp(QStringLiteral("<title>[^>]* - Springer - [^>]*</title>"));
0491                 /// regular expression to check if this is a CiteSeerX page
0492                 static const QRegularExpression citeseerxTitleRegExp(QStringLiteral("<title>CiteSeerX &mdash; [^>]*</title>"));
0493                 /// regular expression to check if this is a ACM Digital Library page
0494                 static const QString acmDigitalLibraryString(QStringLiteral("The ACM Digital Library is published by the Association for Computing Machinery"));
0495
0496                 if (googleScholarTitleRegExp.match(text).hasMatch())
0497                     d->processGoogleResult(reply, text);
0498                 else if (springerLinkTitleRegExp.match(text).hasMatch())
0499                     d->processSpringerLink(reply, text);
0500                 else if (citeseerxTitleRegExp.match(text).hasMatch())
0501                     d->processCiteSeerX(reply, text);
0502                 else if (text.contains(acmDigitalLibraryString))
0503                     d->processACMDigitalLibrary(reply, text);
0504                 else {
0505                     /// regular expression to extract title
0506                     static const QRegularExpression titleRegExp(QStringLiteral("<title>(.*?)</title>"));
0507                     const QRegularExpressionMatch match = titleRegExp.match(text);
0508                     if (match.hasMatch())
0509                         qCDebug(LOG_KBIBTEX_NETWORKING) << "Using general HTML processor for page" << match.captured(1) << " URL=" << reply->url().toDisplayString();
0510                     else
0511                         qCDebug(LOG_KBIBTEX_NETWORKING) << "Using general HTML processor for URL=" << reply->url().toDisplayString();
0512                     d->processGeneralHTML(reply, text);
0513                 }
0514             }
0515         } else if (data.contains(pdfHead)) {
0516             /// looks like a PDF file -> grab it
0517             const bool gotPDFfile = d->processPDF(reply, data);
0518             if (gotPDFfile)
0519                 Q_EMIT progress(d->knownUrls.count(), d->aliveCounter, d->result.count());
0520         } else {
0521             /// Assume UTF-8 data
0522             const QString text = QString::fromUtf8(data.constData());
0523             qCWarning(LOG_KBIBTEX_NETWORKING) << "don't know how to handle " << text.left(256);
0524         }
0525     } else
0526         qCWarning(LOG_KBIBTEX_NETWORKING) << "error from reply: " << reply->errorString() << "(" << reply->url().toDisplayString() << ")" << "  term=" << term << "  origin=" << origin << "  depth=" << depth;
0527
0528     if (d->aliveCounter == 0) {
0529         /// no more running downloads left
0530         Q_EMIT finished();
0531     }
0532 }