File indexing completed on 2024-05-19 05:05:37
0001 /*************************************************************************** 0002 * SPDX-License-Identifier: GPL-2.0-or-later 0003 * * 0004 * SPDX-FileCopyrightText: 2004-2023 Thomas Fischer <fischer@unix-ag.uni-kl.de> 0005 * * 0006 * This program is free software; you can redistribute it and/or modify * 0007 * it under the terms of the GNU General Public License as published by * 0008 * the Free Software Foundation; either version 2 of the License, or * 0009 * (at your option) any later version. * 0010 * * 0011 * This program is distributed in the hope that it will be useful, * 0012 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0014 * GNU General Public License for more details. * 0015 * * 0016 * You should have received a copy of the GNU General Public License * 0017 * along with this program; if not, see <https://www.gnu.org/licenses/>. * 0018 ***************************************************************************/ 0019 0020 #include "fileinfo.h" 0021 0022 #ifdef HAVE_POPPLERQT5 0023 #include <poppler-qt5.h> 0024 #endif // HAVE_POPPLERQT5 0025 0026 #include <QFileInfo> 0027 #include <QMimeDatabase> 0028 #include <QDir> 0029 #include <QTextStream> 0030 #include <QStandardPaths> 0031 #include <QRegularExpression> 0032 #ifdef HAVE_QTCONCURRENT 0033 #include <QtConcurrentRun> 0034 #endif // HAVE_QTCONCURRENT 0035 0036 #include <KBibTeX> 0037 #include <Entry> 0038 #include "logging_io.h" 0039 0040 FileInfo::FileInfo() 0041 { 0042 /// nothing 0043 } 0044 0045 const QString FileInfo::mimetypeOctetStream = QStringLiteral("application/octet-stream"); 0046 const QString FileInfo::mimetypeHTML = QStringLiteral("text/html"); 0047 const QString FileInfo::mimetypeBibTeX = QStringLiteral("text/x-bibtex"); 0048 const QString FileInfo::mimetypeRIS = QStringLiteral("application/x-research-info-systems"); 0049 const QString FileInfo::mimetypePDF = QStringLiteral("application/pdf"); 0050 0051 QMimeType FileInfo::mimeTypeForUrl(const QUrl &url) 0052 { 0053 if (!url.isValid()) { 0054 qCWarning(LOG_KBIBTEX_IO) << "Cannot determine mime type for empty or invalid QUrl"; 0055 return QMimeType(); ///< invalid input gives invalid mime type 0056 } 0057 0058 static const QMimeDatabase db; 0059 static const QMimeType mtHTML(db.mimeTypeForName(mimetypeHTML)); 0060 static const QMimeType mtOctetStream(db.mimeTypeForName(mimetypeOctetStream)); 0061 static const QMimeType mtBibTeX(db.mimeTypeForName(mimetypeBibTeX)); 0062 static const QMimeType mtPDF(db.mimeTypeForName(mimetypePDF)); 0063 static const QMimeType mtRIS(db.mimeTypeForName(mimetypeRIS)); 0064 0065 /// Test if mime type for BibTeX is registered before determining file extension 0066 static const QString mimetypeBibTeXExt = mtBibTeX.preferredSuffix(); 0067 /// Test if mime type for RIS is registered before determining file extension 0068 static const QString mimetypeRISExt = mtRIS.preferredSuffix(); 0069 /// Test if mime type for PDF is registered before determining file extension 0070 static const QString mimetypePDFExt = mtPDF.preferredSuffix(); 0071 0072 const QString extension = db.suffixForFileName(url.fileName()).toLower(); 0073 /// First, check preferred suffixes 0074 if (mtBibTeX.isValid() && extension == mimetypeBibTeXExt) 0075 return mtBibTeX; 0076 else if (mtRIS.isValid() && extension == mimetypeRISExt) 0077 return mtRIS; 0078 else if (extension == mimetypePDFExt) 0079 return mtPDF; 0080 /// Second, check any other suffixes 0081 else if (mtBibTeX.isValid() && mtBibTeX.suffixes().contains(extension)) 0082 return mtBibTeX; 0083 else if (mtRIS.isValid() && mtRIS.suffixes().contains(extension)) 0084 return mtRIS; 0085 else if (mtPDF.suffixes().contains(extension)) 0086 return mtPDF; 0087 0088 /// Let the KDE subsystem guess the mime type 0089 QMimeType result = db.mimeTypeForUrl(url); 0090 /// Fall back to application/octet-stream if something goes wrong 0091 if (!result.isValid()) 0092 result = mtOctetStream; 0093 0094 /// In case that KDE could not determine mime type, 0095 /// do some educated guesses on our own 0096 if (result.name() == mimetypeOctetStream) { 0097 if (url.scheme().startsWith(QStringLiteral("http"))) 0098 result = mtHTML; 0099 // TODO more tests? 0100 } 0101 0102 return result; 0103 } 0104 0105 void FileInfo::urlsInText(const QString &text, const TestExistence testExistence, const QString &baseDirectory, QSet<QUrl> &result) 0106 { 0107 if (text.isEmpty()) 0108 return; 0109 0110 /// DOI identifiers have to extracted first as KBibTeX::fileListSeparatorRegExp 0111 /// contains characters that can be part of a DOI (e.g. ';') and thus could split 0112 /// a DOI in between. 0113 QString internalText = text; 0114 int pos = 0; 0115 QRegularExpressionMatch doiRegExpMatch; 0116 while ((doiRegExpMatch = KBibTeX::doiRegExp.match(internalText, pos)).hasMatch()) { 0117 pos = doiRegExpMatch.capturedStart(QStringLiteral("doi")); 0118 QString doiMatch = doiRegExpMatch.captured(QStringLiteral("doi")); 0119 const int semicolonHttpPos = doiMatch.indexOf(QStringLiteral(";http")); 0120 if (semicolonHttpPos > 0) doiMatch = doiMatch.left(semicolonHttpPos); 0121 const QUrl url(KBibTeX::doiUrlPrefix + QString(doiMatch).remove(QStringLiteral("\\"))); 0122 if (url.isValid() && !result.contains(url)) 0123 result << url; 0124 /// remove match from internal text to avoid duplicates 0125 0126 /// Cut away any URL that may be right before found DOI number: 0127 /// For example, if DOI '10.1000/38-abc' was found in 0128 /// 'Lore ipsum http://doi.example.org/10.1000/38-abc Lore ipsum' 0129 /// also remove 'http://doi.example.org/' from the text, keeping only 0130 /// 'Lore ipsum Lore ipsum' 0131 static const QRegularExpression genericDoiUrlPrefix(QStringLiteral("http[s]?://[a-z0-9./-]+/$")); ///< looks like an URL 0132 const QRegularExpressionMatch genericDoiUrlPrefixMatch = genericDoiUrlPrefix.match(internalText.left(pos)); 0133 if (genericDoiUrlPrefixMatch.hasMatch()) 0134 /// genericDoiUrlPrefixMatch.captured(0) may contain (parts of) DOI 0135 internalText = internalText.left(genericDoiUrlPrefixMatch.capturedStart(0)) + internalText.mid(pos + doiMatch.length()); 0136 else 0137 internalText = internalText.left(pos) + internalText.mid(pos + doiMatch.length()); 0138 } 0139 0140 #if QT_VERSION >= 0x050e00 0141 const QStringList fileList = internalText.split(KBibTeX::fileListSeparatorRegExp, Qt::SkipEmptyParts); 0142 #else // QT_VERSION < 0x050e00 0143 const QStringList fileList = internalText.split(KBibTeX::fileListSeparatorRegExp, QString::SkipEmptyParts); 0144 #endif // QT_VERSION >= 0x050e00 0145 for (const QString &text : fileList) { 0146 internalText = text; 0147 0148 /// If testing for the actual existence of a filename found in the text ... 0149 if (testExistence == TestExistence::Yes) { 0150 /// If a base directory (e.g. the location of the parent .bib file) is given 0151 /// and the potential filename fragment is NOT an absolute path, ... 0152 if (internalText.startsWith(QStringLiteral("~") + QDir::separator())) { 0153 const QString fullFilename = QDir::homePath() + internalText.mid(1); 0154 const QFileInfo fileInfo(fullFilename); 0155 const QUrl url = QUrl::fromLocalFile(fileInfo.absoluteFilePath()); 0156 if (fileInfo.exists() && fileInfo.isFile() && url.isValid() && !result.contains(url)) { 0157 result << url; 0158 /// Stop searching for URLs or filenames in current internal text 0159 continue; 0160 } 0161 } else if (!baseDirectory.isEmpty() && 0162 // TODO the following test assumes that absolute paths start 0163 // with a dir separator, which may only be true on Unix/Linux, 0164 // but not Windows. May be a test for 'first character is a letter, 0165 // second is ":", third is "\"' may be necessary. 0166 !internalText.startsWith(QDir::separator())) { 0167 /// To get the absolute path, prepend filename fragment with base directory 0168 const QString fullFilename = baseDirectory + QDir::separator() + internalText; 0169 const QFileInfo fileInfo(fullFilename); 0170 const QUrl url = QUrl::fromLocalFile(fileInfo.absoluteFilePath()); 0171 if (fileInfo.exists() && fileInfo.isFile() && url.isValid() && !result.contains(url)) { 0172 result << url; 0173 /// Stop searching for URLs or filenames in current internal text 0174 continue; 0175 } 0176 } else { 0177 /// Either the filename fragment is an absolute path OR no base directory 0178 /// was given (current working directory is assumed), ... 0179 const QFileInfo fileInfo(internalText); 0180 const QUrl url = QUrl::fromLocalFile(fileInfo.absoluteFilePath()); 0181 if (fileInfo.exists() && fileInfo.isFile() && url.isValid() && !result.contains(url)) { 0182 result << url; 0183 /// stop searching for URLs or filenames in current internal text 0184 continue; 0185 } 0186 } 0187 } 0188 0189 /// extract URL from current field 0190 pos = 0; 0191 QRegularExpressionMatch urlRegExpMatch; 0192 while ((urlRegExpMatch = KBibTeX::urlRegExp.match(internalText, pos)).hasMatch()) { 0193 pos = urlRegExpMatch.capturedStart(0); 0194 const QString match = urlRegExpMatch.captured(0); 0195 QUrl url(match); 0196 if (url.isValid() && (testExistence == TestExistence::No || !url.isLocalFile() || QFileInfo::exists(url.toLocalFile())) && !result.contains(url)) 0197 result << url; 0198 /// remove match from internal text to avoid duplicates 0199 internalText = internalText.left(pos) + internalText.mid(pos + match.length()); 0200 } 0201 0202 /// explicitly check URL entry, may be an URL even if http:// or alike is missing 0203 pos = 0; 0204 QRegularExpressionMatch domainNameRegExpMatch; 0205 while ((domainNameRegExpMatch = KBibTeX::domainNameRegExp.match(internalText, pos)).hasMatch()) { 0206 pos = domainNameRegExpMatch.capturedStart(0); 0207 /// URL ends either at space or at string's end 0208 int pos2 = internalText.indexOf(QStringLiteral(" "), pos + 1); 0209 if (pos2 < 0) pos2 = internalText.length(); 0210 QString match = internalText.mid(pos, pos2 - pos); 0211 const QUrl url(QStringLiteral("https://") + match); 0212 if (url.isValid() && !result.contains(url)) 0213 result << url; 0214 /// remove match from internal text to avoid duplicates 0215 internalText = internalText.left(pos) + internalText.mid(pos + match.length()); 0216 } 0217 0218 /// extract general file-like patterns 0219 pos = 0; 0220 QRegularExpressionMatch fileRegExpMatch; 0221 while ((fileRegExpMatch = KBibTeX::fileRegExp.match(internalText, pos)).hasMatch()) { 0222 pos = fileRegExpMatch.capturedStart(0); 0223 const QString match = fileRegExpMatch.captured(0); 0224 const QFileInfo fi(match); 0225 const QUrl url = QUrl::fromLocalFile(!match.startsWith(QStringLiteral("/")) && !match.startsWith(QStringLiteral("http")) && fi.isRelative() && !baseDirectory.isEmpty() ? baseDirectory + QStringLiteral("/") + match : match); 0226 if (url.isValid() && (testExistence == TestExistence::No || QFileInfo::exists(url.toLocalFile())) && !result.contains(url)) 0227 result << url; 0228 /// remove match from internal text to avoid duplicates 0229 internalText = internalText.left(pos) + internalText.mid(pos + match.length()); 0230 } 0231 } 0232 } 0233 0234 QSet<QUrl> FileInfo::entryUrls(const QSharedPointer<const Entry> &entry, const QUrl &bibTeXUrl, TestExistence testExistence) 0235 { 0236 QSet<QUrl> result; 0237 if (entry.isNull() || entry->isEmpty()) 0238 return result; 0239 0240 const QString id = entry->id(); 0241 if (id.length() > 4) { 0242 /// Sometimes the entry id contains or is actually a DOI number 0243 const QRegularExpressionMatch doiRegExpMatch = KBibTeX::doiRegExp.match(id); 0244 if (doiRegExpMatch.hasMatch()) { 0245 const QString match = doiRegExpMatch.captured(QStringLiteral("doi")).remove(QStringLiteral("\\")); 0246 QUrl url(KBibTeX::doiUrlPrefix + match); 0247 result.insert(url); 0248 } 0249 } 0250 if (entry->contains(Entry::ftDOI)) { 0251 const QString doi = PlainTextValue::text(entry->value(Entry::ftDOI)); 0252 QRegularExpressionMatch doiRegExpMatch; 0253 if (!doi.isEmpty() && (doiRegExpMatch = KBibTeX::doiRegExp.match(doi)).hasMatch()) { 0254 const QString match = doiRegExpMatch.captured(QStringLiteral("doi")).remove(QStringLiteral("\\")); 0255 QUrl url(KBibTeX::doiUrlPrefix + match); 0256 result.insert(url); 0257 } 0258 } 0259 static const QString etPMID = QStringLiteral("pmid"); 0260 if (entry->contains(etPMID)) { 0261 const QString pmid = PlainTextValue::text(entry->value(etPMID)); 0262 bool ok = false; 0263 ok &= pmid.toInt(&ok) > 0; 0264 if (ok) { 0265 QUrl url(QStringLiteral("https://www.ncbi.nlm.nih.gov/pubmed/") + pmid); 0266 result.insert(url); 0267 } 0268 } 0269 static const QString etEPrint = QStringLiteral("eprint"); 0270 if (entry->contains(etEPrint)) { 0271 const QString eprint = PlainTextValue::text(entry->value(etEPrint)); 0272 if (!eprint.isEmpty()) { 0273 QUrl url(QStringLiteral("https://arxiv.org/search?query=") + eprint); 0274 result.insert(url); 0275 } 0276 } 0277 0278 const QString baseDirectory = bibTeXUrl.isValid() ? bibTeXUrl.adjusted(QUrl::RemoveFilename | QUrl::StripTrailingSlash).path() : QString(); 0279 0280 for (Entry::ConstIterator it = entry->constBegin(); it != entry->constEnd(); ++it) { 0281 /// skip abstracts, they contain sometimes strange text fragments 0282 /// that are mistaken for URLs 0283 if (it.key().toLower() == Entry::ftAbstract) continue; 0284 0285 const Value v = it.value(); 0286 for (const auto &valueItem : v) { 0287 QString plainText = PlainTextValue::text(*valueItem); 0288 0289 static const QRegularExpression regExpEscapedChars = QRegularExpression(QStringLiteral("\\\\+([&_~])")); 0290 plainText.replace(regExpEscapedChars, QStringLiteral("\\1")); 0291 0292 urlsInText(plainText, testExistence, baseDirectory, result); 0293 } 0294 } 0295 0296 if (!baseDirectory.isEmpty()) { 0297 /// File types supported by "document preview" 0298 static const QStringList documentFileExtensions {QStringLiteral(".pdf"), QStringLiteral(".pdf.gz"), QStringLiteral(".pdf.bz2"), QStringLiteral(".ps"), QStringLiteral(".ps.gz"), QStringLiteral(".ps.bz2"), QStringLiteral(".eps"), QStringLiteral(".eps.gz"), QStringLiteral(".eps.bz2"), QStringLiteral(".html"), QStringLiteral(".xhtml"), QStringLiteral(".htm"), QStringLiteral(".dvi"), QStringLiteral(".djvu"), QStringLiteral(".wwf"), QStringLiteral(".jpeg"), QStringLiteral(".jpg"), QStringLiteral(".png"), QStringLiteral(".gif"), QStringLiteral(".tif"), QStringLiteral(".tiff")}; 0299 result.reserve(result.size() + documentFileExtensions.size() * 2); 0300 0301 /// check if in the same directory as the BibTeX file 0302 /// a PDF file exists which filename is based on the entry's id 0303 for (const QString &extension : documentFileExtensions) { 0304 const QFileInfo fi(baseDirectory + QDir::separator() + entry->id() + extension); 0305 if (fi.exists()) { 0306 const QUrl url = QUrl::fromLocalFile(fi.absoluteFilePath()); 0307 if (!result.contains(url)) 0308 result << url; 0309 } 0310 } 0311 0312 /// Check if in the same directory as the BibTeX file there is a subdirectory 0313 /// similar to the BibTeX file's name and which contains a PDF file exists 0314 /// which filename is based on the entry's id 0315 const QFileInfo filenameInfo(bibTeXUrl.fileName()); 0316 const QString ending = filenameInfo.completeSuffix(); 0317 QString directory = baseDirectory + QDir::separator() + bibTeXUrl.fileName(); 0318 directory.chop(ending.length() + 1); 0319 const QFileInfo fi(directory); 0320 if (fi.isDir()) 0321 for (const QString &extension : documentFileExtensions) { 0322 const QFileInfo fi(directory + QDir::separator() + entry->id() + extension); 0323 if (fi.exists()) { 0324 const QUrl url = QUrl::fromLocalFile(fi.absoluteFilePath()); 0325 if (!result.contains(url)) 0326 result << url; 0327 } 0328 } 0329 } 0330 0331 return result; 0332 } 0333 0334 #ifdef HAVE_POPPLERQT5 0335 QString FileInfo::pdfToText(const QString &pdfFilename) 0336 { 0337 /// Build filename for text file where PDF file's plain text is cached 0338 const QString cacheDirectory = QStandardPaths::writableLocation(QStandardPaths::CacheLocation) + QStringLiteral("/pdftotext"); 0339 if (!QDir(cacheDirectory).exists() && !QDir::home().mkdir(cacheDirectory)) 0340 /// Could not create cache directory 0341 return QString(); 0342 static const QRegularExpression invalidChars(QStringLiteral("[^-a-z0-9_]"), QRegularExpression::CaseInsensitiveOption); 0343 const QString textFilename = QString(pdfFilename).remove(invalidChars).append(QStringLiteral(".txt")).prepend(QStringLiteral("/")).prepend(cacheDirectory); 0344 0345 /// First, check if there is a cache text file 0346 if (QFileInfo::exists(textFilename)) { 0347 /// Load text from cache file 0348 QFile f(textFilename); 0349 if (f.open(QFile::ReadOnly)) { 0350 const QString text = QString::fromUtf8(f.readAll()); 0351 f.close(); 0352 return text; 0353 } 0354 } else { 0355 #ifdef HAVE_QTCONCURRENT 0356 /// No cache file exists, so run text extraction in another thread 0357 QtConcurrent::run(extractPDFTextToCache, pdfFilename, textFilename); 0358 #else // HAVE_QTCONCURRENT 0359 extractPDFTextToCache(pdfFilename, textFilename); 0360 #endif // HAVE_QTCONCURRENT 0361 } 0362 0363 return QString(); 0364 } 0365 0366 void FileInfo::extractPDFTextToCache(const QString &pdfFilename, const QString &cacheFilename) { 0367 /// In case of multiple calls, skip text extraction if cache file already exists 0368 if (QFile(cacheFilename).exists()) return; 0369 0370 QString text; 0371 QStringList msgList; 0372 0373 /// Load PDF file through Poppler 0374 Poppler::Document *doc = Poppler::Document::load(pdfFilename); 0375 if (doc != nullptr) { 0376 static const int maxPages = 64; 0377 /// Build text by appending each page's text 0378 for (int i = 0; i < qMin(maxPages, doc->numPages()); ++i) 0379 text.append(doc->page(i)->text(QRect())).append(QStringLiteral("\n\n")); 0380 if (doc->numPages() > maxPages) 0381 msgList << QString(QStringLiteral("### Skipped %1 pages as PDF file contained too many pages (limit is %2 pages) ###")).arg(doc->numPages() - maxPages).arg(maxPages); 0382 delete doc; 0383 } else 0384 msgList << QStringLiteral("### Skipped as file could not be opened as PDF file ###"); 0385 0386 /// Save text in cache file 0387 QFile f(cacheFilename); 0388 if (f.open(QFile::WriteOnly)) { 0389 static const int maxCharacters = 1 << 18; 0390 f.write(text.left(maxCharacters).toUtf8()); ///< keep only the first 2^18 many characters 0391 0392 if (text.length() > maxCharacters) 0393 msgList << QString(QStringLiteral("### Text too long, skipping %1 characters ###")).arg(text.length() - maxCharacters); 0394 /// Write all messages (warnings) to end of text file 0395 for (const QString &msg : const_cast<const QStringList &>(msgList)) { 0396 static const char linebreak = '\n'; 0397 f.write(&linebreak, 1); 0398 f.write(msg.toUtf8()); 0399 } 0400 0401 f.close(); 0402 } 0403 } 0404 #endif // HAVE_POPPLERQT5