File indexing completed on 2024-05-12 05:10:12

0001 /***************************************************************************
0002     Copyright (C) 2007-2009 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "pdfimporter.h"
0026 #include "tellicoimporter.h"
0027 #include "xslthandler.h"
0028 #include "xmphandler.h"
0029 #include "../collections/bookcollection.h"
0030 #include "../collections/bibtexcollection.h"
0031 #include "../fieldformat.h"
0032 #include "../core/filehandler.h"
0033 #include "../core/netaccess.h"
0034 #include "../images/imagefactory.h"
0035 #include "../utils/guiproxy.h"
0036 #include "../fetch/fetchmanager.h"
0037 #include "../progressmanager.h"
0038 #include "../utils/cursorsaver.h"
0039 #include "../entryupdatejob.h"
0040 #include "../utils/datafileregistry.h"
0041 #include "../tellico_debug.h"
0042 
0043 #include <KMessageBox>
0044 #include <KLocalizedString>
0045 
0046 #include <QString>
0047 #include <QPixmap>
0048 #include <QApplication>
0049 #include <QFile>
0050 #include <QScopedPointer>
0051 
0052 #include <config.h>
0053 #ifdef HAVE_POPPLER
0054 #include <poppler-qt5.h>
0055 #endif
0056 
namespace {
  // Size argument handed to NetAccess::filePreview() when a preview image
  // is generated for an imported PDF file.
  constexpr int PDF_FILE_PREVIEW_SIZE = 196;
}
0060 
0061 using Tellico::Import::PDFImporter;
0062 
0063 PDFImporter::PDFImporter(const QUrl& url_) : Importer(url_), m_cancelled(false) {
0064 }
0065 
0066 PDFImporter::PDFImporter(const QList<QUrl>& urls_) : Importer(urls_), m_cancelled(false) {
0067 }
0068 
0069 bool PDFImporter::canImport(int type_) const {
0070   return type_ == Data::Collection::Book || type_ == Data::Collection::Bibtex;
0071 }
0072 
0073 Tellico::Data::CollPtr PDFImporter::collection() {
0074   QString xsltFile = DataFileRegistry::self()->locate(QStringLiteral("xmp2tellico.xsl"));
0075   if(xsltFile.isEmpty()) {
0076     myWarning() << "can not locate xmp2tellico.xsl";
0077     return Data::CollPtr();
0078   }
0079 
0080   ProgressItem& item = ProgressManager::self()->newProgressItem(this, progressLabel(), true);
0081   item.setTotalSteps(urls().count());
0082   connect(&item, &Tellico::ProgressItem::signalCancelled, this, &Tellico::Import::PDFImporter::slotCancel);
0083   ProgressItem::Done done(this);
0084   const bool showProgress = options() & ImportProgress;
0085 
0086   QUrl u = QUrl::fromLocalFile(xsltFile);
0087 
0088   XSLTHandler xsltHandler(u);
0089   if(!xsltHandler.isValid()) {
0090     myWarning() << "invalid xslt in xmp2tellico.xsl";
0091     return Data::CollPtr();
0092   }
0093   bool isBook = false;
0094   if(currentCollection() && currentCollection()->type() == Data::Collection::Book) {
0095     xsltHandler.addStringParam("ctype", "2"); // book if already existing
0096     isBook = true;
0097   } else {
0098     xsltHandler.addStringParam("ctype", "5"); // bibtex by default
0099   }
0100 
0101   bool hasDOI = false;
0102   bool hasArxiv = false;
0103 
0104   uint j = 0;
0105 
0106   Data::CollPtr coll;
0107   XMPHandler xmpHandler;
0108   QList<QUrl> list = urls();
0109   for(QList<QUrl>::Iterator it = list.begin(); it != list.end() && !m_cancelled; ++it, ++j) {
0110     const QScopedPointer<FileHandler::FileRef> ref(FileHandler::fileRef(*it));
0111     if(!ref->isValid()) {
0112       continue;
0113     }
0114 
0115     Data::CollPtr newColl;
0116     Data::EntryPtr entry;
0117 
0118     QString xmp = xmpHandler.extractXMP(ref->fileName());
0119     //  myDebug() << xmp;
0120     if(xmp.isEmpty()) {
0121       setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file."));
0122     } else {
0123       setStatusMessage(QString());
0124 #if 0
0125       myWarning() << "Remove debug from pdfimporter.cpp";
0126       QFile f(QString::fromLatin1("/tmp/test-xmp.xml"));
0127       if(f.open(QIODevice::WriteOnly)) {
0128         QTextStream t(&f);
0129         t.setCodec("UTF-8");
0130         t << xmp;
0131       }
0132       f.close();
0133 #endif
0134       Import::TellicoImporter importer(xsltHandler.applyStylesheet(xmp));
0135       newColl = importer.collection();
0136       if(!newColl || newColl->entryCount() == 0) {
0137         myWarning() << "no collection found";
0138         setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file."));
0139       } else {
0140         entry = newColl->entries().front();
0141         hasDOI |= !entry->field(QStringLiteral("doi")).isEmpty();
0142         // the XMP handler has a habit of inserting empty values surrounded by parentheses
0143         static const QRegularExpression rx(QLatin1String("^\\(\\s*\\)$"));
0144         foreach(Data::FieldPtr field, newColl->fields()) {
0145           QString value = entry->field(field);
0146           if(value.contains(rx)) {
0147             entry->setField(field, QString());
0148           }
0149         }
0150       }
0151     }
0152 
0153 #ifdef HAVE_POPPLER
0154     if(!newColl) {
0155       if(isBook) {
0156         newColl = new Data::BookCollection(true);
0157       } else {
0158         newColl = new Data::BibtexCollection(true);
0159       }
0160     }
0161     if(!entry) {
0162       entry = new Data::Entry(newColl);
0163       newColl->addEntries(entry);
0164     }
0165 
0166     // now load from poppler
0167     Poppler::Document* doc = Poppler::Document::load(ref->fileName());
0168     if(doc && !doc->isLocked()) {
0169       // now the question is, do we overwrite XMP data with Poppler data?
0170       // for now, let's say yes conditionally
0171       QString s = doc->info(QStringLiteral("Title")).simplified();
0172       if(!s.isEmpty()) {
0173         entry->setField(QStringLiteral("title"), s);
0174       }
0175       // author could be separated by commas, "and" or whatever
0176       // we're not going to overwrite it
0177       if(entry->field(QStringLiteral("author")).isEmpty()) {
0178         static const QRegularExpression rx(QLatin1String("\\s*(\\s+and\\s+|,|;)\\s*"));
0179         QStringList authors = doc->info(QStringLiteral("Author")).simplified().split(rx);
0180         entry->setField(QStringLiteral("author"), authors.join(FieldFormat::delimiterString()));
0181       }
0182       s = doc->info(QStringLiteral("Keywords")).simplified();
0183       if(s.isEmpty()) {
0184         s = doc->info(QStringLiteral("Subject")).simplified();
0185       }
0186       if(!s.isEmpty()) {
0187         // keywords are also separated by semi-colons in poppler
0188         entry->setField(QStringLiteral("keyword"), s);
0189       }
0190 
0191       // now parse the first page text and try to guess
0192       Poppler::Page* page = doc->page(0);
0193       if(page) {
0194         // a null rectangle means get all text on page
0195         QString text = page->text(QRectF());
0196         // borrowed from Referencer
0197         static const QRegularExpression doiRx(QLatin1String("(?:"
0198                                                             "(?:[Dd][Oo][Ii]:? *)"
0199                                                             "|"
0200                                                             "(?:[Dd]igital *[Oo]bject *[Ii]dentifier:? *)"
0201                                                             ")"
0202                                                             "("
0203                                                             "[^\\.\\s]+"
0204                                                             "\\."
0205                                                             "[^\\/\\s]+"
0206                                                             "\\/"
0207                                                             "[^\\s]+"
0208                                                             ")"));
0209         QRegularExpressionMatch m = doiRx.match(text);
0210         if(!m.hasMatch()) {
0211           static const QRegularExpression doiUrlRx(QLatin1String("https?://(?:dx\\.)?doi\\.org/(10.\\d{4,9}/[-._;()/:a-zA-Z0-9]+)"));
0212           m = doiUrlRx.match(text);
0213         }
0214         if(m.hasMatch()) {
0215           const QString doi = m.captured(1);
0216           myLog() << "In PDF file, found DOI:" << doi;
0217           entry->setField(QStringLiteral("doi"), doi);
0218           hasDOI = true;
0219         }
0220         static const QRegularExpression arxivRx(QLatin1String("arXiv:"
0221                                                               "("
0222                                                               "[^\\/\\s]+"
0223                                                               "[\\/\\.]"
0224                                                               "[^\\s]+"
0225                                                               ")"));
0226         m = arxivRx.match(text);
0227         if(m.hasMatch()) {
0228           const QString arxiv = m.captured(1);
0229           myLog() << "in PDF file, found arxiv:" << arxiv;
0230           if(!entry->collection()->hasField(QStringLiteral("arxiv"))) {
0231             Data::FieldPtr field(new Data::Field(QStringLiteral("arxiv"), i18n("arXiv ID")));
0232             field->setCategory(i18n("Publishing"));
0233             entry->collection()->addField(field);
0234           }
0235           entry->setField(QStringLiteral("arxiv"), arxiv);
0236           hasArxiv = true;
0237         }
0238 
0239         delete page;
0240       }
0241     } else {
0242       myDebug() << "unable to read PDF info (poppler)";
0243     }
0244     delete doc;
0245 #elif defined HAVE_KFILEMETADATA
0246     if(!newColl || newColl->entryCount() == 0) {
0247       myDebug() << "Reading with metadata";
0248       EBookImporter imp(urls());
0249       auto ebookColl = imp.collection();
0250       if(ebookColl && ebookColl->type() == Data::Collection::Book && !isBook) {
0251         newColl = Data::BibtexCollection::convertBookCollection(ebookColl);
0252       } else {
0253         newColl = ebookColl;
0254       }
0255       if(newColl->entryCount() > 0) {
0256         entry = new Data::Entry(newColl);
0257         newColl->addEntries(entry);
0258       } else {
0259         entry = newColl->entries().front();
0260       }
0261     }
0262 #else
0263     // only recourse is to create an empty collection
0264     if(!newColl) {
0265       if(isBook) {
0266         newColl = new Data::BookCollection(true);
0267       } else {
0268         newColl = new Data::BibtexCollection(true);
0269       }
0270     }
0271     if(!entry) {
0272       entry = new Data::Entry(newColl);
0273       newColl->addEntries(entry);
0274     }
0275 #endif
0276 
0277     if(!isBook) {
0278       entry->setField(QStringLiteral("url"), (*it).url());
0279       // always an article?
0280       entry->setField(QStringLiteral("entry-type"), QStringLiteral("article"));
0281     }
0282     QPixmap pix = NetAccess::filePreview(QUrl::fromLocalFile(ref->fileName()), PDF_FILE_PREVIEW_SIZE);
0283     if(pix.isNull()) {
0284       myDebug() << "No file preview from pdf";
0285     } else {
0286       // is png best option?
0287       QString id = ImageFactory::addImage(pix, QStringLiteral("PNG"));
0288       if(!id.isEmpty()) {
0289         Data::FieldPtr field = newColl->fieldByName(QStringLiteral("cover"));
0290         if(!field && !newColl->imageFields().isEmpty()) {
0291           field = newColl->imageFields().front();
0292         } else if(!field) {
0293           field = Data::Field::createDefaultField(Data::Field::FrontCoverField);
0294           newColl->addField(field);
0295         }
0296         entry->setField(field, id);
0297       }
0298     }
0299     if(coll) {
0300       coll->addEntries(newColl->entries());
0301     } else {
0302       coll = newColl;
0303     }
0304 
0305     if(showProgress) {
0306       ProgressManager::self()->setProgress(this, j+1);
0307       qApp->processEvents();
0308     }
0309   }
0310 
0311   if(m_cancelled || !coll) {
0312     return Data::CollPtr();
0313   }
0314 
0315   if(hasDOI) {
0316     Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::DOI);
0317     if(vec.isEmpty() && GUI::Proxy::widget()) {
0318       GUI::CursorSaver cs(Qt::ArrowCursor);
0319       KMessageBox::information(GUI::Proxy::widget(),
0320                               i18n("Tellico is able to download information about entries with a DOI from "
0321                                    "CrossRef.org. However, you must create an CrossRef account and add a new "
0322                                    "data source with your account information."),
0323                               QString(),
0324                               QStringLiteral("CrossRefSourceNeeded"));
0325     } else {
0326       foreach(Fetch::Fetcher::Ptr fetcher, vec) {
0327         foreach(Data::EntryPtr entry, coll->entries()) {
0328           KJob* job = new EntryUpdateJob(this, entry, fetcher);
0329           job->exec();
0330         }
0331       }
0332     }
0333   }
0334 
0335   if(m_cancelled) {
0336     return Data::CollPtr();
0337   }
0338 
0339   if(hasArxiv) {
0340     Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::ArxivID);
0341     foreach(Fetch::Fetcher::Ptr fetcher, vec) {
0342       foreach(Data::EntryPtr entry, coll->entries()) {
0343         KJob* job = new EntryUpdateJob(this, entry, fetcher);
0344         job->exec();
0345       }
0346     }
0347   }
0348 
0349   // finally
0350   foreach(Data::EntryPtr entry, coll->entries()) {
0351     if(entry->title().isEmpty()) {
0352       // use file name
0353       QUrl u = QUrl::fromLocalFile(entry->field(QStringLiteral("url")));
0354       entry->setField(QStringLiteral("title"), u.fileName());
0355     }
0356   }
0357 
0358   if(m_cancelled) {
0359     return Data::CollPtr();
0360   }
0361   return coll;
0362 }
0363 
0364 void PDFImporter::slotCancel() {
0365   m_cancelled = true;
0366 }