File indexing completed on 2024-05-19 05:05:37

0001 /***************************************************************************
0002  *   SPDX-License-Identifier: GPL-2.0-or-later
0003  *                                                                         *
0004  *   SPDX-FileCopyrightText: 2004-2023 Thomas Fischer <fischer@unix-ag.uni-kl.de>
0005  *                                                                         *
0006  *   This program is free software; you can redistribute it and/or modify  *
0007  *   it under the terms of the GNU General Public License as published by  *
0008  *   the Free Software Foundation; either version 2 of the License, or     *
0009  *   (at your option) any later version.                                   *
0010  *                                                                         *
0011  *   This program is distributed in the hope that it will be useful,       *
0012  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0013  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0014  *   GNU General Public License for more details.                          *
0015  *                                                                         *
0016  *   You should have received a copy of the GNU General Public License     *
0017  *   along with this program; if not, see <https://www.gnu.org/licenses/>. *
0018  ***************************************************************************/
0019 
0020 #include "fileinfo.h"
0021 
0022 #ifdef HAVE_POPPLERQT5
0023 #include <poppler-qt5.h>
0024 #endif // HAVE_POPPLERQT5
0025 
0026 #include <QFileInfo>
0027 #include <QMimeDatabase>
0028 #include <QDir>
0029 #include <QTextStream>
0030 #include <QStandardPaths>
0031 #include <QRegularExpression>
0032 #ifdef HAVE_QTCONCURRENT
0033 #include <QtConcurrentRun>
0034 #endif // HAVE_QTCONCURRENT
0035 
0036 #include <KBibTeX>
0037 #include <Entry>
0038 #include "logging_io.h"
0039 
0040 FileInfo::FileInfo()
0041 {
0042     /// nothing
0043 }
0044 
0045 const QString FileInfo::mimetypeOctetStream = QStringLiteral("application/octet-stream");
0046 const QString FileInfo::mimetypeHTML = QStringLiteral("text/html");
0047 const QString FileInfo::mimetypeBibTeX = QStringLiteral("text/x-bibtex");
0048 const QString FileInfo::mimetypeRIS = QStringLiteral("application/x-research-info-systems");
0049 const QString FileInfo::mimetypePDF = QStringLiteral("application/pdf");
0050 
0051 QMimeType FileInfo::mimeTypeForUrl(const QUrl &url)
0052 {
0053     if (!url.isValid()) {
0054         qCWarning(LOG_KBIBTEX_IO) << "Cannot determine mime type for empty or invalid QUrl";
0055         return QMimeType(); ///< invalid input gives invalid mime type
0056     }
0057 
0058     static const QMimeDatabase db;
0059     static const QMimeType mtHTML(db.mimeTypeForName(mimetypeHTML));
0060     static const QMimeType mtOctetStream(db.mimeTypeForName(mimetypeOctetStream));
0061     static const QMimeType mtBibTeX(db.mimeTypeForName(mimetypeBibTeX));
0062     static const QMimeType mtPDF(db.mimeTypeForName(mimetypePDF));
0063     static const QMimeType mtRIS(db.mimeTypeForName(mimetypeRIS));
0064 
0065     /// Test if mime type for BibTeX is registered before determining file extension
0066     static const QString mimetypeBibTeXExt = mtBibTeX.preferredSuffix();
0067     /// Test if mime type for RIS is registered before determining file extension
0068     static const QString mimetypeRISExt = mtRIS.preferredSuffix();
0069     /// Test if mime type for PDF is registered before determining file extension
0070     static const QString mimetypePDFExt = mtPDF.preferredSuffix();
0071 
0072     const QString extension = db.suffixForFileName(url.fileName()).toLower();
0073     /// First, check preferred suffixes
0074     if (mtBibTeX.isValid() && extension == mimetypeBibTeXExt)
0075         return mtBibTeX;
0076     else if (mtRIS.isValid() && extension == mimetypeRISExt)
0077         return mtRIS;
0078     else if (extension == mimetypePDFExt)
0079         return mtPDF;
0080     /// Second, check any other suffixes
0081     else if (mtBibTeX.isValid() && mtBibTeX.suffixes().contains(extension))
0082         return mtBibTeX;
0083     else if (mtRIS.isValid() && mtRIS.suffixes().contains(extension))
0084         return mtRIS;
0085     else if (mtPDF.suffixes().contains(extension))
0086         return mtPDF;
0087 
0088     /// Let the KDE subsystem guess the mime type
0089     QMimeType result = db.mimeTypeForUrl(url);
0090     /// Fall back to application/octet-stream if something goes wrong
0091     if (!result.isValid())
0092         result = mtOctetStream;
0093 
0094     /// In case that KDE could not determine mime type,
0095     /// do some educated guesses on our own
0096     if (result.name() == mimetypeOctetStream) {
0097         if (url.scheme().startsWith(QStringLiteral("http")))
0098             result = mtHTML;
0099         // TODO more tests?
0100     }
0101 
0102     return result;
0103 }
0104 
0105 void FileInfo::urlsInText(const QString &text, const TestExistence testExistence, const QString &baseDirectory, QSet<QUrl> &result)
0106 {
0107     if (text.isEmpty())
0108         return;
0109 
0110     /// DOI identifiers have to extracted first as KBibTeX::fileListSeparatorRegExp
0111     /// contains characters that can be part of a DOI (e.g. ';') and thus could split
0112     /// a DOI in between.
0113     QString internalText = text;
0114     int pos = 0;
0115     QRegularExpressionMatch doiRegExpMatch;
0116     while ((doiRegExpMatch = KBibTeX::doiRegExp.match(internalText, pos)).hasMatch()) {
0117         pos = doiRegExpMatch.capturedStart(QStringLiteral("doi"));
0118         QString doiMatch = doiRegExpMatch.captured(QStringLiteral("doi"));
0119         const int semicolonHttpPos = doiMatch.indexOf(QStringLiteral(";http"));
0120         if (semicolonHttpPos > 0) doiMatch = doiMatch.left(semicolonHttpPos);
0121         const QUrl url(KBibTeX::doiUrlPrefix + QString(doiMatch).remove(QStringLiteral("\\")));
0122         if (url.isValid() && !result.contains(url))
0123             result << url;
0124         /// remove match from internal text to avoid duplicates
0125 
0126         /// Cut away any URL that may be right before found DOI number:
0127         /// For example, if DOI '10.1000/38-abc' was found in
0128         ///   'Lore ipsum http://doi.example.org/10.1000/38-abc Lore ipsum'
0129         /// also remove 'http://doi.example.org/' from the text, keeping only
0130         ///   'Lore ipsum  Lore ipsum'
0131         static const QRegularExpression genericDoiUrlPrefix(QStringLiteral("http[s]?://[a-z0-9./-]+/$")); ///< looks like an URL
0132         const QRegularExpressionMatch genericDoiUrlPrefixMatch = genericDoiUrlPrefix.match(internalText.left(pos));
0133         if (genericDoiUrlPrefixMatch.hasMatch())
0134             /// genericDoiUrlPrefixMatch.captured(0) may contain (parts of) DOI
0135             internalText = internalText.left(genericDoiUrlPrefixMatch.capturedStart(0)) + internalText.mid(pos + doiMatch.length());
0136         else
0137             internalText = internalText.left(pos) + internalText.mid(pos + doiMatch.length());
0138     }
0139 
0140 #if QT_VERSION >= 0x050e00
0141     const QStringList fileList = internalText.split(KBibTeX::fileListSeparatorRegExp, Qt::SkipEmptyParts);
0142 #else // QT_VERSION < 0x050e00
0143     const QStringList fileList = internalText.split(KBibTeX::fileListSeparatorRegExp, QString::SkipEmptyParts);
0144 #endif // QT_VERSION >= 0x050e00
0145     for (const QString &text : fileList) {
0146         internalText = text;
0147 
0148         /// If testing for the actual existence of a filename found in the text ...
0149         if (testExistence == TestExistence::Yes) {
0150             /// If a base directory (e.g. the location of the parent .bib file) is given
0151             /// and the potential filename fragment is NOT an absolute path, ...
0152             if (internalText.startsWith(QStringLiteral("~") + QDir::separator())) {
0153                 const QString fullFilename = QDir::homePath() + internalText.mid(1);
0154                 const QFileInfo fileInfo(fullFilename);
0155                 const QUrl url = QUrl::fromLocalFile(fileInfo.absoluteFilePath());
0156                 if (fileInfo.exists() && fileInfo.isFile() && url.isValid() && !result.contains(url)) {
0157                     result << url;
0158                     /// Stop searching for URLs or filenames in current internal text
0159                     continue;
0160                 }
0161             } else if (!baseDirectory.isEmpty() &&
0162                        // TODO the following test assumes that absolute paths start
0163                        // with a dir separator, which may only be true on Unix/Linux,
0164                        // but not Windows. May be a test for 'first character is a letter,
0165                        // second is ":", third is "\"' may be necessary.
0166                        !internalText.startsWith(QDir::separator())) {
0167                 /// To get the absolute path, prepend filename fragment with base directory
0168                 const QString fullFilename = baseDirectory + QDir::separator() + internalText;
0169                 const QFileInfo fileInfo(fullFilename);
0170                 const QUrl url = QUrl::fromLocalFile(fileInfo.absoluteFilePath());
0171                 if (fileInfo.exists() && fileInfo.isFile() && url.isValid() && !result.contains(url)) {
0172                     result << url;
0173                     /// Stop searching for URLs or filenames in current internal text
0174                     continue;
0175                 }
0176             } else {
0177                 /// Either the filename fragment is an absolute path OR no base directory
0178                 /// was given (current working directory is assumed), ...
0179                 const QFileInfo fileInfo(internalText);
0180                 const QUrl url = QUrl::fromLocalFile(fileInfo.absoluteFilePath());
0181                 if (fileInfo.exists() && fileInfo.isFile() && url.isValid() && !result.contains(url)) {
0182                     result << url;
0183                     /// stop searching for URLs or filenames in current internal text
0184                     continue;
0185                 }
0186             }
0187         }
0188 
0189         /// extract URL from current field
0190         pos = 0;
0191         QRegularExpressionMatch urlRegExpMatch;
0192         while ((urlRegExpMatch = KBibTeX::urlRegExp.match(internalText, pos)).hasMatch()) {
0193             pos = urlRegExpMatch.capturedStart(0);
0194             const QString match = urlRegExpMatch.captured(0);
0195             QUrl url(match);
0196             if (url.isValid() && (testExistence == TestExistence::No || !url.isLocalFile() || QFileInfo::exists(url.toLocalFile())) && !result.contains(url))
0197                 result << url;
0198             /// remove match from internal text to avoid duplicates
0199             internalText = internalText.left(pos) + internalText.mid(pos + match.length());
0200         }
0201 
0202         /// explicitly check URL entry, may be an URL even if http:// or alike is missing
0203         pos = 0;
0204         QRegularExpressionMatch domainNameRegExpMatch;
0205         while ((domainNameRegExpMatch = KBibTeX::domainNameRegExp.match(internalText, pos)).hasMatch()) {
0206             pos = domainNameRegExpMatch.capturedStart(0);
0207             /// URL ends either at space or at string's end
0208             int pos2 = internalText.indexOf(QStringLiteral(" "), pos + 1);
0209             if (pos2 < 0) pos2 = internalText.length();
0210             QString match = internalText.mid(pos, pos2 - pos);
0211             const QUrl url(QStringLiteral("https://") + match);
0212             if (url.isValid() && !result.contains(url))
0213                 result << url;
0214             /// remove match from internal text to avoid duplicates
0215             internalText = internalText.left(pos) + internalText.mid(pos + match.length());
0216         }
0217 
0218         /// extract general file-like patterns
0219         pos = 0;
0220         QRegularExpressionMatch fileRegExpMatch;
0221         while ((fileRegExpMatch = KBibTeX::fileRegExp.match(internalText, pos)).hasMatch()) {
0222             pos = fileRegExpMatch.capturedStart(0);
0223             const QString match = fileRegExpMatch.captured(0);
0224             const QFileInfo fi(match);
0225             const QUrl url = QUrl::fromLocalFile(!match.startsWith(QStringLiteral("/")) && !match.startsWith(QStringLiteral("http")) && fi.isRelative() && !baseDirectory.isEmpty() ? baseDirectory + QStringLiteral("/") + match : match);
0226             if (url.isValid() && (testExistence == TestExistence::No || QFileInfo::exists(url.toLocalFile())) && !result.contains(url))
0227                 result << url;
0228             /// remove match from internal text to avoid duplicates
0229             internalText = internalText.left(pos) + internalText.mid(pos + match.length());
0230         }
0231     }
0232 }
0233 
0234 QSet<QUrl> FileInfo::entryUrls(const QSharedPointer<const Entry> &entry, const QUrl &bibTeXUrl, TestExistence testExistence)
0235 {
0236     QSet<QUrl> result;
0237     if (entry.isNull() || entry->isEmpty())
0238         return result;
0239 
0240     const QString id = entry->id();
0241     if (id.length() > 4) {
0242         /// Sometimes the entry id contains or is actually a DOI number
0243         const QRegularExpressionMatch doiRegExpMatch = KBibTeX::doiRegExp.match(id);
0244         if (doiRegExpMatch.hasMatch()) {
0245             const QString match = doiRegExpMatch.captured(QStringLiteral("doi")).remove(QStringLiteral("\\"));
0246             QUrl url(KBibTeX::doiUrlPrefix + match);
0247             result.insert(url);
0248         }
0249     }
0250     if (entry->contains(Entry::ftDOI)) {
0251         const QString doi = PlainTextValue::text(entry->value(Entry::ftDOI));
0252         QRegularExpressionMatch doiRegExpMatch;
0253         if (!doi.isEmpty() && (doiRegExpMatch = KBibTeX::doiRegExp.match(doi)).hasMatch()) {
0254             const QString match = doiRegExpMatch.captured(QStringLiteral("doi")).remove(QStringLiteral("\\"));
0255             QUrl url(KBibTeX::doiUrlPrefix + match);
0256             result.insert(url);
0257         }
0258     }
0259     static const QString etPMID = QStringLiteral("pmid");
0260     if (entry->contains(etPMID)) {
0261         const QString pmid = PlainTextValue::text(entry->value(etPMID));
0262         bool ok = false;
0263         ok &= pmid.toInt(&ok) > 0;
0264         if (ok) {
0265             QUrl url(QStringLiteral("https://www.ncbi.nlm.nih.gov/pubmed/") + pmid);
0266             result.insert(url);
0267         }
0268     }
0269     static const QString etEPrint = QStringLiteral("eprint");
0270     if (entry->contains(etEPrint)) {
0271         const QString eprint = PlainTextValue::text(entry->value(etEPrint));
0272         if (!eprint.isEmpty()) {
0273             QUrl url(QStringLiteral("https://arxiv.org/search?query=") + eprint);
0274             result.insert(url);
0275         }
0276     }
0277 
0278     const QString baseDirectory = bibTeXUrl.isValid() ? bibTeXUrl.adjusted(QUrl::RemoveFilename | QUrl::StripTrailingSlash).path() : QString();
0279 
0280     for (Entry::ConstIterator it = entry->constBegin(); it != entry->constEnd(); ++it) {
0281         /// skip abstracts, they contain sometimes strange text fragments
0282         /// that are mistaken for URLs
0283         if (it.key().toLower() == Entry::ftAbstract) continue;
0284 
0285         const Value v = it.value();
0286         for (const auto &valueItem : v) {
0287             QString plainText = PlainTextValue::text(*valueItem);
0288 
0289             static const QRegularExpression regExpEscapedChars = QRegularExpression(QStringLiteral("\\\\+([&_~])"));
0290             plainText.replace(regExpEscapedChars, QStringLiteral("\\1"));
0291 
0292             urlsInText(plainText, testExistence, baseDirectory, result);
0293         }
0294     }
0295 
0296     if (!baseDirectory.isEmpty()) {
0297         /// File types supported by "document preview"
0298         static const QStringList documentFileExtensions {QStringLiteral(".pdf"), QStringLiteral(".pdf.gz"), QStringLiteral(".pdf.bz2"), QStringLiteral(".ps"), QStringLiteral(".ps.gz"), QStringLiteral(".ps.bz2"), QStringLiteral(".eps"), QStringLiteral(".eps.gz"), QStringLiteral(".eps.bz2"), QStringLiteral(".html"), QStringLiteral(".xhtml"), QStringLiteral(".htm"), QStringLiteral(".dvi"), QStringLiteral(".djvu"), QStringLiteral(".wwf"), QStringLiteral(".jpeg"), QStringLiteral(".jpg"), QStringLiteral(".png"), QStringLiteral(".gif"), QStringLiteral(".tif"), QStringLiteral(".tiff")};
0299         result.reserve(result.size() + documentFileExtensions.size() * 2);
0300 
0301         /// check if in the same directory as the BibTeX file
0302         /// a PDF file exists which filename is based on the entry's id
0303         for (const QString &extension : documentFileExtensions) {
0304             const QFileInfo fi(baseDirectory + QDir::separator() + entry->id() + extension);
0305             if (fi.exists()) {
0306                 const QUrl url = QUrl::fromLocalFile(fi.absoluteFilePath());
0307                 if (!result.contains(url))
0308                     result << url;
0309             }
0310         }
0311 
0312         /// Check if in the same directory as the BibTeX file there is a subdirectory
0313         /// similar to the BibTeX file's name and which contains a PDF file exists
0314         /// which filename is based on the entry's id
0315         const QFileInfo filenameInfo(bibTeXUrl.fileName());
0316         const QString ending = filenameInfo.completeSuffix();
0317         QString directory = baseDirectory + QDir::separator() + bibTeXUrl.fileName();
0318         directory.chop(ending.length() + 1);
0319         const QFileInfo fi(directory);
0320         if (fi.isDir())
0321             for (const QString &extension : documentFileExtensions) {
0322                 const QFileInfo fi(directory + QDir::separator() + entry->id() + extension);
0323                 if (fi.exists()) {
0324                     const QUrl url = QUrl::fromLocalFile(fi.absoluteFilePath());
0325                     if (!result.contains(url))
0326                         result << url;
0327                 }
0328             }
0329     }
0330 
0331     return result;
0332 }
0333 
0334 #ifdef HAVE_POPPLERQT5
0335 QString FileInfo::pdfToText(const QString &pdfFilename)
0336 {
0337     /// Build filename for text file where PDF file's plain text is cached
0338     const QString cacheDirectory = QStandardPaths::writableLocation(QStandardPaths::CacheLocation) + QStringLiteral("/pdftotext");
0339     if (!QDir(cacheDirectory).exists() && !QDir::home().mkdir(cacheDirectory))
0340         /// Could not create cache directory
0341         return QString();
0342     static const QRegularExpression invalidChars(QStringLiteral("[^-a-z0-9_]"), QRegularExpression::CaseInsensitiveOption);
0343     const QString textFilename = QString(pdfFilename).remove(invalidChars).append(QStringLiteral(".txt")).prepend(QStringLiteral("/")).prepend(cacheDirectory);
0344 
0345     /// First, check if there is a cache text file
0346     if (QFileInfo::exists(textFilename)) {
0347         /// Load text from cache file
0348         QFile f(textFilename);
0349         if (f.open(QFile::ReadOnly)) {
0350             const QString text = QString::fromUtf8(f.readAll());
0351             f.close();
0352             return text;
0353         }
0354     } else {
0355 #ifdef HAVE_QTCONCURRENT
0356         /// No cache file exists, so run text extraction in another thread
0357         QtConcurrent::run(extractPDFTextToCache, pdfFilename, textFilename);
0358 #else // HAVE_QTCONCURRENT
0359         extractPDFTextToCache(pdfFilename, textFilename);
0360 #endif // HAVE_QTCONCURRENT
0361     }
0362 
0363     return QString();
0364 }
0365 
0366 void FileInfo::extractPDFTextToCache(const QString &pdfFilename, const QString &cacheFilename) {
0367     /// In case of multiple calls, skip text extraction if cache file already exists
0368     if (QFile(cacheFilename).exists()) return;
0369 
0370     QString text;
0371     QStringList msgList;
0372 
0373     /// Load PDF file through Poppler
0374     Poppler::Document *doc = Poppler::Document::load(pdfFilename);
0375     if (doc != nullptr) {
0376         static const int maxPages = 64;
0377         /// Build text by appending each page's text
0378         for (int i = 0; i < qMin(maxPages, doc->numPages()); ++i)
0379             text.append(doc->page(i)->text(QRect())).append(QStringLiteral("\n\n"));
0380         if (doc->numPages() > maxPages)
0381             msgList << QString(QStringLiteral("### Skipped %1 pages as PDF file contained too many pages (limit is %2 pages) ###")).arg(doc->numPages() - maxPages).arg(maxPages);
0382         delete doc;
0383     } else
0384         msgList << QStringLiteral("### Skipped as file could not be opened as PDF file ###");
0385 
0386     /// Save text in cache file
0387     QFile f(cacheFilename);
0388     if (f.open(QFile::WriteOnly)) {
0389         static const int maxCharacters = 1 << 18;
0390         f.write(text.left(maxCharacters).toUtf8()); ///< keep only the first 2^18 many characters
0391 
0392         if (text.length() > maxCharacters)
0393             msgList << QString(QStringLiteral("### Text too long, skipping %1 characters ###")).arg(text.length() - maxCharacters);
0394         /// Write all messages (warnings) to end of text file
0395         for (const QString &msg : const_cast<const QStringList &>(msgList)) {
0396             static const char linebreak = '\n';
0397             f.write(&linebreak, 1);
0398             f.write(msg.toUtf8());
0399         }
0400 
0401         f.close();
0402     }
0403 }
0404 #endif // HAVE_POPPLERQT5