File indexing completed on 2024-05-19 05:05:45
0001 /*************************************************************************** 0002 * SPDX-License-Identifier: GPL-2.0-or-later 0003 * * 0004 * SPDX-FileCopyrightText: 2004-2023 Thomas Fischer <fischer@unix-ag.uni-kl.de> 0005 * * 0006 * This program is free software; you can redistribute it and/or modify * 0007 * it under the terms of the GNU General Public License as published by * 0008 * the Free Software Foundation; either version 2 of the License, or * 0009 * (at your option) any later version. * 0010 * * 0011 * This program is distributed in the hope that it will be useful, * 0012 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0014 * GNU General Public License for more details. * 0015 * * 0016 * You should have received a copy of the GNU General Public License * 0017 * along with this program; if not, see <https://www.gnu.org/licenses/>. * 0018 ***************************************************************************/ 0019 0020 #include "findpdf.h" 0021 0022 #include <QNetworkReply> 0023 #include <QNetworkRequest> 0024 #include <QRegularExpression> 0025 #include <QApplication> 0026 #include <QTemporaryFile> 0027 #include <QUrlQuery> 0028 #include <QStandardPaths> 0029 #include <QDir> 0030 #ifdef HAVE_SCIHUB 0031 #include <QRandomGenerator> 0032 #endif // HAVE_SCIHUB 0033 0034 #ifdef HAVE_POPPLERQT5 0035 #include <poppler-qt5.h> 0036 #else // not HAVE_POPPLERQT5 0037 #ifdef HAVE_POPPLERQT6 0038 #include <poppler-qt6.h> 0039 #endif // HAVE_POPPLERQT6 0040 #endif // HAVE_POPPLERQT5 0041 0042 #include <KBibTeX> 0043 #include <Value> 0044 #include <FileInfo> 0045 #include "internalnetworkaccessmanager.h" 0046 #include "logging_networking.h" 0047 0048 static const int maxDepth = 5; 0049 static const char *depthProperty = "depth"; 0050 static const char *termProperty = "term"; 0051 static const char *originProperty = "origin"; 0052 0053 0054 class FindPDF::Private 0055 { 0056 private: 0057 FindPDF *p; 0058 0059 /** 0060 * @brief Remove the fragment part of an URL (i.e. everything starting from '#') 0061 * @param u QUrl which may contain a fragment part or not 0062 * @return QUrl with the fragment part removed if there was any 0063 */ 0064 inline QUrl removeFragment(const QUrl &u) { 0065 if (u.hasFragment()) { 0066 QUrl _u{u}; 0067 _u.setFragment(QString()); 0068 return _u; 0069 } else 0070 return u; 0071 } 0072 0073 public: 0074 int aliveCounter; 0075 QList<ResultItem> result; 0076 Entry currentEntry; 0077 QSet<QUrl> knownUrls; 0078 QSet<QNetworkReply *> runningDownloads; 0079 0080 Private(FindPDF *parent) 0081 : p(parent), aliveCounter(0) 0082 { 0083 /// nothing 0084 } 0085 0086 bool queueUrl(const QUrl &url, const QString &term, const QString &origin, int depth) 0087 { 0088 const QUrl sanitizedUrl{removeFragment(url)}; 0089 0090 if (!knownUrls.contains(sanitizedUrl) && depth > 0) { 0091 knownUrls.insert(sanitizedUrl); 0092 QNetworkRequest request = QNetworkRequest(sanitizedUrl); 0093 QNetworkReply *reply = InternalNetworkAccessManager::instance().get(request); 0094 InternalNetworkAccessManager::instance().setNetworkReplyTimeout(reply, 15); ///< set a timeout on network connections 0095 reply->setProperty(depthProperty, QVariant::fromValue<int>(depth)); 0096 reply->setProperty(termProperty, term); 0097 reply->setProperty(originProperty, origin); 0098 runningDownloads.insert(reply); 0099 connect(reply, &QNetworkReply::finished, p, &FindPDF::downloadFinished); 0100 ++aliveCounter; 0101 return true; 0102 } else 0103 return false; 0104 } 0105 0106 void processGeneralHTML(QNetworkReply *reply, const QString &text) 0107 { 0108 /// fetch some properties from Reply object 0109 const QString term = reply->property(termProperty).toString(); 0110 const QString origin = reply->property(originProperty).toString(); 0111 const QUrl baseUrl{reply->url()}; 0112 bool ok = false; 0113 int depth = reply->property(depthProperty).toInt(&ok); 0114 if (!ok) depth = 0; 0115 0116 /// regular expressions to guess links to follow 0117 const QVector<QRegularExpression> specificAnchorRegExp = { 0118 QRegularExpression(QString(QStringLiteral("<a[^>]*href=\"([^\"]*%1[^\"]*[.]pdf)\"")).arg(QRegularExpression::escape(term)), QRegularExpression::CaseInsensitiveOption), 0119 QRegularExpression(QString(QStringLiteral("<a[^>]*href=\"([^\"]+)\"[^>]*>[^<]*%1[^<]*[.]pdf")).arg(QRegularExpression::escape(term)), QRegularExpression::CaseInsensitiveOption), 0120 QRegularExpression(QString(QStringLiteral("<a[^>]*href=\"([^\"]*%1[^\"]*)\"")).arg(QRegularExpression::escape(term)), QRegularExpression::CaseInsensitiveOption), 0121 QRegularExpression(QString(QStringLiteral("<a[^>]*href=\"([^\"]+)\"[^>]*>[^<]*%1[^<]*\\b")).arg(QRegularExpression::escape(term)), QRegularExpression::CaseInsensitiveOption) 0122 }; 0123 static const QRegularExpression genericAnchorRegExp = QRegularExpression(QStringLiteral("<a[^>]*href=\"([^\"]+)\""), QRegularExpression::CaseInsensitiveOption); 0124 0125 bool gotLink = false; 0126 for (const QRegularExpression &anchorRegExp : specificAnchorRegExp) { 0127 const QRegularExpressionMatch match = anchorRegExp.match(text); 0128 if (match.hasMatch()) { 0129 const QUrl url = QUrl::fromEncoded(match.captured(1).toLatin1()); 0130 queueUrl(baseUrl.resolved(url), term, origin, depth - 1); 0131 gotLink = true; 0132 break; 0133 } 0134 } 0135 0136 if (!gotLink) { 0137 /// this is only the last resort: 0138 /// to follow the first link found in the HTML document 0139 const QRegularExpressionMatch match = genericAnchorRegExp.match(text); 0140 if (match.hasMatch()) { 0141 const QUrl url = QUrl::fromEncoded(match.captured(1).toLatin1()); 0142 queueUrl(baseUrl.resolved(url), term, origin, depth - 1); 0143 } 0144 } 0145 0146 if (baseUrl.isValid()) { 0147 int p1 = text.indexOf(QStringLiteral("<embed")); 0148 while (p1 > 0) { 0149 const int p2 = text.indexOf(QStringLiteral(">"), p1 + 5); 0150 if (p2 > p1) { 0151 const int p3 = text.indexOf(QStringLiteral("type=\"application/pdf\""), p1 + 5); 0152 if (p3 > p1 && p3 < p2) { 0153 const int p4 = text.indexOf(QStringLiteral("src=\""), p1 + 5); 0154 if (p4 > p1 && p4 < p2) { 0155 const int p5 = text.indexOf(QStringLiteral("\""), p4 + 5); 0156 if (p5 > p4 && p5 < p2) { 0157 const QString src{text.mid(p4 + 5, p5 - p4)}; 0158 QUrl nextUrl; 0159 if (src.startsWith(QStringLiteral("http"))) 0160 nextUrl = QUrl::fromUserInput(src); 0161 else if (src.startsWith(QStringLiteral("//"))) 0162 nextUrl = QUrl::fromUserInput(baseUrl.scheme() + QStringLiteral(":") + src); 0163 else if (src.startsWith(QStringLiteral("/"))) 0164 nextUrl = QUrl::fromUserInput(baseUrl.scheme() + QStringLiteral("://") + baseUrl.host() + src); 0165 else 0166 nextUrl = baseUrl.resolved(QUrl(src)); 0167 queueUrl(nextUrl, term, origin, depth - 1); 0168 } 0169 } 0170 } 0171 p1 = text.indexOf(QStringLiteral("<embed"), p2); 0172 } else 0173 break; 0174 } 0175 } 0176 } 0177 0178 void processGoogleResult(QNetworkReply *reply, const QString &text) 0179 { 0180 static const QString h3Tag(QStringLiteral("<h3")); 0181 static const QString aTag(QStringLiteral("<a")); 0182 static const QString hrefAttrib(QStringLiteral("href=\"")); 0183 0184 const QString term = reply->property(termProperty).toString(); 0185 bool ok = false; 0186 int depth = reply->property(depthProperty).toInt(&ok); 0187 if (!ok) depth = 0; 0188 0189 /// extract the first numHitsToFollow-many hits found by Google Scholar 0190 const int numHitsToFollow = 10; 0191 int p = -1; 0192 for (int i = 0; i < numHitsToFollow; ++i) { 0193 if ((p = text.indexOf(h3Tag, p + 1)) >= 0 && (p = text.indexOf(aTag, p + 1)) >= 0 && (p = text.indexOf(hrefAttrib, p + 1)) >= 0) { 0194 int p1 = p + 6; 0195 int p2 = text.indexOf(QLatin1Char('"'), p1 + 1); 0196 QUrl url(text.mid(p1, p2 - p1)); 0197 const QString googleService = reply->url().host().contains(QStringLiteral("scholar.google")) ? QStringLiteral("scholar.google") : QStringLiteral("www.google"); 0198 queueUrl(reply->url().resolved(url), term, googleService, depth - 1); 0199 } 0200 } 0201 } 0202 0203 void processSpringerLink(QNetworkReply *reply, const QString &text) 0204 { 0205 static const QRegularExpression fulltextPDFlink(QStringLiteral("href=\"([^\"]+/fulltext.pdf)\"")); 0206 const QRegularExpressionMatch match = fulltextPDFlink.match(text); 0207 if (match.hasMatch()) { 0208 bool ok = false; 0209 int depth = reply->property(depthProperty).toInt(&ok); 0210 if (!ok) depth = 0; 0211 0212 const QUrl url(match.captured(1)); 0213 queueUrl(reply->url().resolved(url), QString(), QStringLiteral("springerlink"), depth - 1); 0214 } 0215 } 0216 0217 void processCiteSeerX(QNetworkReply *reply, const QString &text) 0218 { 0219 static const QRegularExpression downloadPDFlink(QStringLiteral("href=\"(/viewdoc/download[^\"]+type=pdf)\"")); 0220 const QRegularExpressionMatch match = downloadPDFlink.match(text); 0221 if (match.hasMatch()) { 0222 bool ok = false; 0223 int depth = reply->property(depthProperty).toInt(&ok); 0224 if (!ok) depth = 0; 0225 0226 const QUrl url = QUrl::fromEncoded(match.captured(1).toLatin1()); 0227 queueUrl(reply->url().resolved(url), QString(), QStringLiteral("citeseerx"), depth - 1); 0228 } 0229 } 0230 0231 void processACMDigitalLibrary(QNetworkReply *reply, const QString &text) 0232 { 0233 static const QRegularExpression downloadPDFlink(QStringLiteral("href=\"(ft_gateway.cfm\\?id=\\d+&ftid=\\d+&dwn=1&CFID=\\d+&CFTOKEN=\\d+)\"")); 0234 const QRegularExpressionMatch match = downloadPDFlink.match(text); 0235 if (match.hasMatch()) { 0236 bool ok = false; 0237 int depth = reply->property(depthProperty).toInt(&ok); 0238 if (!ok) depth = 0; 0239 0240 const QUrl url = QUrl::fromEncoded(match.captured(1).toLatin1()); 0241 queueUrl(reply->url().resolved(url), QString(), QStringLiteral("acmdl"), depth - 1); 0242 } 0243 } 0244 0245 bool processPDF(QNetworkReply *reply, const QByteArray &data) 0246 { 0247 bool progress = false; 0248 const QString origin = reply->property(originProperty).toString(); 0249 const QUrl url = reply->url(); 0250 0251 /// Search for duplicate URLs 0252 bool containsUrl = false; 0253 for (const ResultItem &ri : const_cast<const QList<ResultItem> &>(result)) { 0254 containsUrl |= ri.url == url; 0255 /// Skip already visited URLs 0256 if (containsUrl) break; 0257 } 0258 0259 if (!containsUrl) { 0260 #ifdef HAVE_POPPLERQT5 0261 QScopedPointer<Poppler::Document> doc(Poppler::Document::loadFromData(data)); 0262 #else // not HAVE_POPPLERQT5 0263 #ifdef HAVE_POPPLERQT6 0264 std::unique_ptr<Poppler::Document> doc = Poppler::Document::loadFromData(data); 0265 #endif // HAVE_POPPLERQT6 0266 #endif // HAVE_POPPLERQT5 0267 0268 ResultItem resultItem; 0269 resultItem.tempFilename = new QTemporaryFile(QStandardPaths::writableLocation(QStandardPaths::TempLocation) + QDir::separator() + QStringLiteral("kbibtex_findpdf_XXXXXX.pdf")); 0270 resultItem.tempFilename->setAutoRemove(true); 0271 if (resultItem.tempFilename->open()) { 0272 const int lenDataWritten = static_cast<int>(resultItem.tempFilename->write(data)); 0273 resultItem.tempFilename->close(); 0274 if (lenDataWritten != data.length()) { 0275 /// Failed to write to temporary file 0276 qCWarning(LOG_KBIBTEX_NETWORKING) << "Failed to write to temporary file for filename" << resultItem.tempFilename->fileName(); 0277 delete resultItem.tempFilename; 0278 resultItem.tempFilename = nullptr; 0279 } 0280 } else { 0281 /// Failed to create temporary file 0282 qCWarning(LOG_KBIBTEX_NETWORKING) << "Failed to create temporary file for templaet" << resultItem.tempFilename->fileTemplate(); 0283 delete resultItem.tempFilename; 0284 resultItem.tempFilename = nullptr; 0285 } 0286 resultItem.url = url; 0287 resultItem.textPreview = doc->info(QStringLiteral("Title")).simplified(); 0288 static const int maxTextLen = 1024; 0289 for (int i = 0; i < doc->numPages() && resultItem.textPreview.length() < maxTextLen; ++i) { 0290 #ifdef HAVE_POPPLERQT5 0291 QScopedPointer<Poppler::Page> page(doc->page(i)); 0292 #else // not HAVE_POPPLERQT5 0293 #ifdef HAVE_POPPLERQT6 0294 std::unique_ptr<Poppler::Page> page = doc->page(i); 0295 #endif // HAVE_POPPLERQT6 0296 #endif // HAVE_POPPLERQT5 0297 if (!resultItem.textPreview.isEmpty()) resultItem.textPreview += QLatin1Char(' '); 0298 resultItem.textPreview += QStringView{page->text(QRect()).simplified()}.left(maxTextLen); 0299 } 0300 resultItem.textPreview.remove(QStringLiteral("Microsoft Word - ")); ///< Some word processors need to put their name everywhere ... 0301 resultItem.downloadMode = DownloadMode::No; 0302 resultItem.relevance = origin == Entry::ftDOI ? 1.0 : (origin == QStringLiteral("eprint") ? 0.75 : 0.5); 0303 result << resultItem; 0304 progress = true; 0305 } 0306 0307 return progress; 0308 } 0309 0310 QUrl ieeeDocumentUrlToDownloadUrl(const QUrl &url) { 0311 /// Basic checking if provided URL is from IEEE Xplore 0312 if (!url.host().contains(QStringLiteral("ieeexplore.ieee.org"))) 0313 return url; 0314 0315 /// Assuming URL looks like this: 0316 /// https://ieeexplore.ieee.org/document/8092651 0317 static const QRegularExpression documentIdRegExp(QStringLiteral("/(\\d{6,})[/]?$")); 0318 const QRegularExpressionMatch documentIdRegExpMatch = documentIdRegExp.match(url.path()); 0319 if (!documentIdRegExpMatch.hasMatch()) 0320 return url; 0321 0322 /// Use document id extracted above to build URL to PDF file 0323 return QUrl(QStringLiteral("https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=") + documentIdRegExpMatch.captured(1)); 0324 } 0325 }; 0326 0327 FindPDF::FindPDF(QObject *parent) 0328 : QObject(parent), d(new Private(this)) 0329 { 0330 /// nothing 0331 } 0332 0333 FindPDF::~FindPDF() 0334 { 0335 abort(); 0336 delete d; 0337 } 0338 0339 bool FindPDF::search(const Entry &entry) 0340 { 0341 if (d->aliveCounter > 0) return false; 0342 0343 d->knownUrls.clear(); 0344 d->result.clear(); 0345 d->currentEntry = entry; 0346 0347 Q_EMIT progress(0, d->aliveCounter, 0); 0348 0349 /// Generate a string which contains the title's beginning 0350 QString searchWords; 0351 if (entry.contains(Entry::ftTitle)) { 0352 #if QT_VERSION >= 0x050e00 0353 const QStringList titleChunks = PlainTextValue::text(entry.value(Entry::ftTitle)).split(QStringLiteral(" "), Qt::SkipEmptyParts); 0354 #else // QT_VERSION < 0x050e00 0355 const QStringList titleChunks = PlainTextValue::text(entry.value(Entry::ftTitle)).split(QStringLiteral(" "), QString::SkipEmptyParts); 0356 #endif // QT_VERSION >= 0x050e00 0357 if (!titleChunks.isEmpty()) { 0358 searchWords = titleChunks[0]; 0359 for (int i = 1; i < titleChunks.count() && searchWords.length() < 64; ++i) 0360 searchWords += QLatin1Char(' ') + titleChunks[i]; 0361 } 0362 } 0363 const QStringList authors = entry.authorsLastName(); 0364 for (int i = 0; i < authors.count() && searchWords.length() < 96; ++i) 0365 searchWords += QLatin1Char(' ') + authors[i]; 0366 0367 searchWords.remove(QLatin1Char('{')).remove(QLatin1Char('}')); 0368 0369 QStringList urlFields {Entry::ftDOI, Entry::ftUrl, QStringLiteral("ee")}; 0370 for (int i = 2; i < 256; ++i) 0371 urlFields << QString(QStringLiteral("%1%2")).arg(Entry::ftDOI).arg(i) << QString(QStringLiteral("%1%2")).arg(Entry::ftUrl).arg(i); 0372 for (const QString &field : const_cast<const QStringList &>(urlFields)) { 0373 if (entry.contains(field)) { 0374 const QString fieldText = PlainTextValue::text(entry.value(field)); 0375 QRegularExpressionMatchIterator doiRegExpMatchIt = KBibTeX::doiRegExp.globalMatch(fieldText); 0376 while (doiRegExpMatchIt.hasNext()) { 0377 const QRegularExpressionMatch doiRegExpMatch = doiRegExpMatchIt.next(); 0378 const QString doiNumber{doiRegExpMatch.captured(QStringLiteral("doi"))}; 0379 d->queueUrl(QUrl(KBibTeX::doiUrlPrefix + doiNumber), fieldText, Entry::ftDOI, maxDepth); 0380 #ifdef HAVE_SCIHUB 0381 static const QVector<QString> sciHubUrlPrefixes {{QStringLiteral("https://sci-hub.se/")}, {QStringLiteral("https://sci-hub.st/")}, {QStringLiteral("https://sci-hub.ru/")}}; 0382 d->queueUrl(QUrl::fromUserInput(sciHubUrlPrefixes[QRandomGenerator::global()->bounded(sciHubUrlPrefixes.length())] + doiNumber), fieldText, Entry::ftDOI, maxDepth); 0383 #endif // HAVE_SCIHUB 0384 } 0385 0386 QRegularExpressionMatchIterator urlRegExpMatchIt = KBibTeX::urlRegExp.globalMatch(fieldText); 0387 while (urlRegExpMatchIt.hasNext()) { 0388 QRegularExpressionMatch urlRegExpMatch = urlRegExpMatchIt.next(); 0389 d->queueUrl(QUrl(urlRegExpMatch.captured(0)), searchWords, Entry::ftUrl, maxDepth); 0390 } 0391 } 0392 } 0393 0394 if (entry.contains(QStringLiteral("eprint"))) { 0395 /// check eprint fields as used for arXiv 0396 const QString eprintId = PlainTextValue::text(entry.value(QStringLiteral("eprint"))); 0397 if (!eprintId.isEmpty()) { 0398 const QUrl arxivUrl = QUrl::fromUserInput(QStringLiteral("https://arxiv.org/search/advanced?terms-0-term=") + eprintId + QStringLiteral("&terms-0-field=report_num&size=50&order=-announced_date_first")); 0399 d->queueUrl(arxivUrl, eprintId, QStringLiteral("eprint"), maxDepth); 0400 } 0401 } 0402 0403 if (!searchWords.isEmpty()) { 0404 /// Search in Google 0405 const QUrl googleUrl = QUrl::fromUserInput(QStringLiteral("https://www.google.com/search?hl=en&sa=G&q=filetype:pdf ") + searchWords); 0406 d->queueUrl(googleUrl, searchWords, QStringLiteral("www.google"), maxDepth); 0407 0408 /// Search in Google Scholar 0409 const QUrl googleScholarUrl = QUrl::fromUserInput(QStringLiteral("https://scholar.google.com/scholar?hl=en&btnG=Search&as_sdt=1&q=filetype:pdf ") + searchWords); 0410 d->queueUrl(googleScholarUrl, searchWords, QStringLiteral("scholar.google"), maxDepth); 0411 0412 /// Search in Bing 0413 const QUrl bingUrl = QUrl::fromUserInput(QStringLiteral("https://www.bing.com/search?setlang=en-US&q=filetype:pdf ") + searchWords); 0414 d->queueUrl(bingUrl, searchWords, QStringLiteral("bing"), maxDepth); 0415 0416 /// Search in CiteSeerX 0417 const QUrl citeseerXurl = QUrl::fromUserInput(QStringLiteral("https://citeseerx.ist.psu.edu/search?submit=Search&sort=rlv&t=doc&q=") + searchWords); 0418 d->queueUrl(citeseerXurl, searchWords, QStringLiteral("citeseerx"), maxDepth); 0419 0420 /// Search in StartPage 0421 const QUrl startPageUrl = QUrl::fromUserInput(QStringLiteral("https://www.startpage.com/do/asearch?cat=web&cmd=process_search&language=english&engine0=v1all&abp=-1&t=white&nj=1&prf=23ad6aab054a88d3da5c443280cee596&suggestOn=0&query=filetype:pdf ") + searchWords); 0422 d->queueUrl(startPageUrl, searchWords, QStringLiteral("startpage"), maxDepth); 0423 } 0424 0425 if (d->aliveCounter == 0) { 0426 qCWarning(LOG_KBIBTEX_NETWORKING) << "Directly at start, no URLs are queue for a search -> this should never happen"; 0427 Q_EMIT finished(); 0428 } 0429 0430 return true; 0431 } 0432 0433 QList<FindPDF::ResultItem> FindPDF::results() 0434 { 0435 if (d->aliveCounter == 0) 0436 return d->result; 0437 else { 0438 /// Return empty list while search is running 0439 return QList<FindPDF::ResultItem>(); 0440 } 0441 } 0442 0443 void FindPDF::abort() { 0444 QSet<QNetworkReply *>::Iterator it = d->runningDownloads.begin(); 0445 while (it != d->runningDownloads.end()) { 0446 QNetworkReply *reply = *it; 0447 it = d->runningDownloads.erase(it); 0448 reply->abort(); 0449 } 0450 } 0451 0452 void FindPDF::downloadFinished() 0453 { 0454 static const char *htmlHead1 = "<html", *htmlHead2 = "<HTML", *htmlHead3 = "<!doctype html>" /** ACM Digital Library */; 0455 static const char *pdfHead = "%PDF-"; 0456 0457 --d->aliveCounter; 0458 Q_EMIT progress(d->knownUrls.count(), d->aliveCounter, d->result.count()); 0459 0460 QNetworkReply *reply = static_cast<QNetworkReply *>(sender()); 0461 d->runningDownloads.remove(reply); 0462 const QString term = reply->property(termProperty).toString(); 0463 const QString origin = reply->property(originProperty).toString(); 0464 bool depthOk = false; 0465 int depth = reply->property(depthProperty).toInt(&depthOk); 0466 if (!depthOk) depth = 0; 0467 0468 if (reply->error() == QNetworkReply::NoError) { 0469 const QByteArray data = reply->readAll(); 0470 0471 QUrl redirUrl = reply->attribute(QNetworkRequest::RedirectionTargetAttribute).toUrl(); 0472 redirUrl = redirUrl.isValid() ? reply->url().resolved(redirUrl) : QUrl(); 0473 qCDebug(LOG_KBIBTEX_NETWORKING) << "finished Downloading " << reply->url().toDisplayString() << " depth=" << depth << " d->aliveCounter=" << d->aliveCounter << " data.size=" << data.size() << " redirUrl=" << redirUrl.toDisplayString() << " origin=" << origin; 0474 0475 if (redirUrl.isValid()) { 0476 redirUrl = d->ieeeDocumentUrlToDownloadUrl(redirUrl); 0477 d->queueUrl(redirUrl, term, origin, depth - 1); 0478 } else if (data.contains(htmlHead1) || data.contains(htmlHead2) || data.contains(htmlHead3)) { 0479 /// returned data is a HTML file, i.e. contains "<html" 0480 0481 /// check for limited depth before continuing 0482 if (depthOk && depth > 0) { 0483 /// Get webpage as plain text 0484 /// Assume UTF-8 data 0485 const QString text = QString::fromUtf8(data.constData()); 0486 0487 /// regular expression to check if this is a Google Scholar result page 0488 static const QRegularExpression googleScholarTitleRegExp(QStringLiteral("<title>[^>]* - Google Scholar</title>")); 0489 /// regular expression to check if this is a SpringerLink page 0490 static const QRegularExpression springerLinkTitleRegExp(QStringLiteral("<title>[^>]* - Springer - [^>]*</title>")); 0491 /// regular expression to check if this is a CiteSeerX page 0492 static const QRegularExpression citeseerxTitleRegExp(QStringLiteral("<title>CiteSeerX — [^>]*</title>")); 0493 /// regular expression to check if this is a ACM Digital Library page 0494 static const QString acmDigitalLibraryString(QStringLiteral("The ACM Digital Library is published by the Association for Computing Machinery")); 0495 0496 if (googleScholarTitleRegExp.match(text).hasMatch()) 0497 d->processGoogleResult(reply, text); 0498 else if (springerLinkTitleRegExp.match(text).hasMatch()) 0499 d->processSpringerLink(reply, text); 0500 else if (citeseerxTitleRegExp.match(text).hasMatch()) 0501 d->processCiteSeerX(reply, text); 0502 else if (text.contains(acmDigitalLibraryString)) 0503 d->processACMDigitalLibrary(reply, text); 0504 else { 0505 /// regular expression to extract title 0506 static const QRegularExpression titleRegExp(QStringLiteral("<title>(.*?)</title>")); 0507 const QRegularExpressionMatch match = titleRegExp.match(text); 0508 if (match.hasMatch()) 0509 qCDebug(LOG_KBIBTEX_NETWORKING) << "Using general HTML processor for page" << match.captured(1) << " URL=" << reply->url().toDisplayString(); 0510 else 0511 qCDebug(LOG_KBIBTEX_NETWORKING) << "Using general HTML processor for URL=" << reply->url().toDisplayString(); 0512 d->processGeneralHTML(reply, text); 0513 } 0514 } 0515 } else if (data.contains(pdfHead)) { 0516 /// looks like a PDF file -> grab it 0517 const bool gotPDFfile = d->processPDF(reply, data); 0518 if (gotPDFfile) 0519 Q_EMIT progress(d->knownUrls.count(), d->aliveCounter, d->result.count()); 0520 } else { 0521 /// Assume UTF-8 data 0522 const QString text = QString::fromUtf8(data.constData()); 0523 qCWarning(LOG_KBIBTEX_NETWORKING) << "don't know how to handle " << text.left(256); 0524 } 0525 } else 0526 qCWarning(LOG_KBIBTEX_NETWORKING) << "error from reply: " << reply->errorString() << "(" << reply->url().toDisplayString() << ")" << " term=" << term << " origin=" << origin << " depth=" << depth; 0527 0528 if (d->aliveCounter == 0) { 0529 /// no more running downloads left 0530 Q_EMIT finished(); 0531 } 0532 }