File indexing completed on 2024-05-12 16:46:32

0001 /***************************************************************************
0002     Copyright (C) 2007-2009 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "pdfimporter.h"
0026 #include "tellicoimporter.h"
0027 #include "xslthandler.h"
0028 #include "xmphandler.h"
0029 #include "../collections/bibtexcollection.h"
0030 #include "../fieldformat.h"
0031 #include "../core/filehandler.h"
0032 #include "../core/netaccess.h"
0033 #include "../images/imagefactory.h"
0034 #include "../utils/guiproxy.h"
0035 #include "../fetch/fetchmanager.h"
0036 #include "../progressmanager.h"
0037 #include "../utils/cursorsaver.h"
0038 #include "../entryupdatejob.h"
0039 #include "../utils/datafileregistry.h"
0040 #include "../tellico_debug.h"
0041 
0042 #include <KMessageBox>
0043 #include <KLocalizedString>
0044 
0045 #include <QString>
0046 #include <QPixmap>
0047 #include <QApplication>
0048 #include <QFile>
0049 #include <QScopedPointer>
0050 
0051 #include <config.h>
0052 #ifdef HAVE_POPPLER
0053 #include <poppler-qt5.h>
0054 #endif
0055 
namespace {
  // Size argument handed to NetAccess::filePreview() when a preview image is
  // generated for an imported PDF (presumably pixels - confirm against NetAccess).
  // The anonymous namespace already gives the constant internal linkage.
  const int PDF_FILE_PREVIEW_SIZE = 196;
}
0059 
0060 using Tellico::Import::PDFImporter;
0061 
0062 PDFImporter::PDFImporter(const QUrl& url_) : Importer(url_), m_cancelled(false) {
0063 }
0064 
0065 PDFImporter::PDFImporter(const QList<QUrl>& urls_) : Importer(urls_), m_cancelled(false) {
0066 }
0067 
0068 bool PDFImporter::canImport(int type_) const {
0069   return type_ == Data::Collection::Bibtex;
0070 }
0071 
0072 Tellico::Data::CollPtr PDFImporter::collection() {
0073   QString xsltFile = DataFileRegistry::self()->locate(QStringLiteral("xmp2tellico.xsl"));
0074   if(xsltFile.isEmpty()) {
0075     myWarning() << "can not locate xmp2tellico.xsl";
0076     return Data::CollPtr();
0077   }
0078 
0079   ProgressItem& item = ProgressManager::self()->newProgressItem(this, progressLabel(), true);
0080   item.setTotalSteps(urls().count());
0081   connect(&item, &Tellico::ProgressItem::signalCancelled, this, &Tellico::Import::PDFImporter::slotCancel);
0082   ProgressItem::Done done(this);
0083   const bool showProgress = options() & ImportProgress;
0084 
0085   QUrl u = QUrl::fromLocalFile(xsltFile);
0086 
0087   XSLTHandler xsltHandler(u);
0088   if(!xsltHandler.isValid()) {
0089     myWarning() << "invalid xslt in xmp2tellico.xsl";
0090     return Data::CollPtr();
0091   }
0092 
0093   bool hasDOI = false;
0094   bool hasArxiv = false;
0095 
0096   uint j = 0;
0097 
0098   Data::CollPtr coll;
0099   XMPHandler xmpHandler;
0100   QList<QUrl> list = urls();
0101   for(QList<QUrl>::Iterator it = list.begin(); it != list.end() && !m_cancelled; ++it, ++j) {
0102     const QScopedPointer<FileHandler::FileRef> ref(FileHandler::fileRef(*it));
0103     if(!ref->isValid()) {
0104       continue;
0105     }
0106 
0107     Data::CollPtr newColl;
0108     Data::EntryPtr entry;
0109 
0110     QString xmp = xmpHandler.extractXMP(ref->fileName());
0111     //  myDebug() << xmp;
0112     if(xmp.isEmpty()) {
0113       setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file."));
0114     } else {
0115       setStatusMessage(QString());
0116 #if 0
0117       myWarning() << "Remove debug from pdfimporter.cpp";
0118       QFile f(QString::fromLatin1("/tmp/test-xmp.xml"));
0119       if(f.open(QIODevice::WriteOnly)) {
0120         QTextStream t(&f);
0121         t.setCodec("UTF-8");
0122         t << xmp;
0123       }
0124       f.close();
0125 #endif
0126       Import::TellicoImporter importer(xsltHandler.applyStylesheet(xmp));
0127       newColl = importer.collection();
0128       if(!newColl || newColl->entryCount() == 0) {
0129         myWarning() << "no collection found";
0130         setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file."));
0131       } else {
0132         entry = newColl->entries().front();
0133         hasDOI |= !entry->field(QStringLiteral("doi")).isEmpty();
0134         // the XMP handler has a habit of inserting empty values surrounded by parentheses
0135         QRegularExpression rx(QLatin1String("^\\(\\s*\\)$"));
0136         foreach(Data::FieldPtr field, newColl->fields()) {
0137           QString value = entry->field(field);
0138           if(value.contains(rx)) {
0139             entry->setField(field, QString());
0140           }
0141         }
0142       }
0143     }
0144 
0145     if(!newColl) {
0146       newColl = new Data::BibtexCollection(true);
0147     }
0148     if(!entry) {
0149       entry = new Data::Entry(newColl);
0150       newColl->addEntries(entry);
0151     }
0152 
0153 #ifdef HAVE_POPPLER
0154 
0155     // now load from poppler
0156     Poppler::Document* doc = Poppler::Document::load(ref->fileName());
0157     if(doc && !doc->isLocked()) {
0158       // now the question is, do we overwrite XMP data with Poppler data?
0159       // for now, let's say yes conditionally
0160       QString s = doc->info(QStringLiteral("Title")).simplified();
0161       if(!s.isEmpty()) {
0162         entry->setField(QStringLiteral("title"), s);
0163       }
0164       // author could be separated by commas, "and" or whatever
0165       // we're not going to overwrite it
0166       if(entry->field(QStringLiteral("author")).isEmpty()) {
0167         QRegularExpression rx(QLatin1String("\\s*(\\s+and\\s+|,|;)\\s*"));
0168         QStringList authors = doc->info(QStringLiteral("Author")).simplified().split(rx);
0169         entry->setField(QStringLiteral("author"), authors.join(FieldFormat::delimiterString()));
0170       }
0171       s = doc->info(QStringLiteral("Keywords")).simplified();
0172       if(!s.isEmpty()) {
0173         // keywords are also separated by semi-colons in poppler
0174         entry->setField(QStringLiteral("keyword"), s);
0175       }
0176 
0177       // now parse the first page text and try to guess
0178       Poppler::Page* page = doc->page(0);
0179       if(page) {
0180         // a null rectangle means get all text on page
0181         QString text = page->text(QRectF());
0182         // borrowed from Referencer
0183         QRegularExpression rx(QLatin1String("(?:"
0184                                             "(?:[Dd][Oo][Ii]:? *)"
0185                                             "|"
0186                                             "(?:[Dd]igital *[Oo]bject *[Ii]dentifier:? *)"
0187                                             ")"
0188                                             "("
0189                                             "[^\\.\\s]+"
0190                                             "\\."
0191                                             "[^\\/\\s]+"
0192                                             "\\/"
0193                                             "[^\\s]+"
0194                                             ")"));
0195         QRegularExpressionMatch m = rx.match(text);
0196         if(m.hasMatch()) {
0197           QString doi = m.captured(1);
0198           myLog() << "in PDF file, found DOI:" << doi;
0199           entry->setField(QStringLiteral("doi"), doi);
0200           hasDOI = true;
0201         }
0202         rx = QRegularExpression(QLatin1String("arXiv:"
0203                                               "("
0204                                               "[^\\/\\s]+"
0205                                               "[\\/\\.]"
0206                                               "[^\\s]+"
0207                                               ")"));
0208         m = rx.match(text);
0209         if(m.hasMatch()) {
0210           QString arxiv = m.captured(1);
0211           myLog() << "in PDF file, found arxiv:" << arxiv;
0212           if(!entry->collection()->hasField(QStringLiteral("arxiv"))) {
0213             Data::FieldPtr field(new Data::Field(QStringLiteral("arxiv"), i18n("arXiv ID")));
0214             field->setCategory(i18n("Publishing"));
0215             entry->collection()->addField(field);
0216           }
0217           entry->setField(QStringLiteral("arxiv"), arxiv);
0218           hasArxiv = true;
0219         }
0220 
0221         delete page;
0222       }
0223     } else {
0224       myDebug() << "unable to read PDF info (poppler)";
0225     }
0226     delete doc;
0227 #endif
0228 
0229     entry->setField(QStringLiteral("url"), (*it).url());
0230     // always an article?
0231     entry->setField(QStringLiteral("entry-type"), QStringLiteral("article"));
0232 
0233     QPixmap pix = NetAccess::filePreview(QUrl::fromLocalFile(ref->fileName()), PDF_FILE_PREVIEW_SIZE);
0234     if(pix.isNull()) {
0235       myDebug() << "No file preview from pdf";
0236     } else {
0237       // is png best option?
0238       QString id = ImageFactory::addImage(pix, QStringLiteral("PNG"));
0239       if(!id.isEmpty()) {
0240         Data::FieldPtr field = newColl->fieldByName(QStringLiteral("cover"));
0241         if(!field && !newColl->imageFields().isEmpty()) {
0242           field = newColl->imageFields().front();
0243         } else if(!field) {
0244           field = new Data::Field(QStringLiteral("cover"), i18n("Front Cover"), Data::Field::Image);
0245           newColl->addField(field);
0246         }
0247         entry->setField(field, id);
0248       }
0249     }
0250     if(coll) {
0251       coll->addEntries(newColl->entries());
0252     } else {
0253       coll = newColl;
0254     }
0255 
0256     if(showProgress) {
0257       ProgressManager::self()->setProgress(this, j+1);
0258       qApp->processEvents();
0259     }
0260   }
0261 
0262   if(m_cancelled) {
0263     return Data::CollPtr();
0264   }
0265 
0266   if(hasDOI) {
0267     Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::DOI);
0268     if(vec.isEmpty() && GUI::Proxy::widget()) {
0269       GUI::CursorSaver cs(Qt::ArrowCursor);
0270       KMessageBox::information(GUI::Proxy::widget(),
0271                               i18n("Tellico is able to download information about entries with a DOI from "
0272                                    "CrossRef.org. However, you must create an CrossRef account and add a new "
0273                                    "data source with your account information."),
0274                               QString(),
0275                               QStringLiteral("CrossRefSourceNeeded"));
0276     } else {
0277       foreach(Fetch::Fetcher::Ptr fetcher, vec) {
0278         foreach(Data::EntryPtr entry, coll->entries()) {
0279           KJob* job = new EntryUpdateJob(this, entry, fetcher);
0280           job->exec();
0281         }
0282       }
0283     }
0284   }
0285 
0286   if(m_cancelled) {
0287     return Data::CollPtr();
0288   }
0289 
0290   if(hasArxiv) {
0291     Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::ArxivID);
0292     foreach(Fetch::Fetcher::Ptr fetcher, vec) {
0293       foreach(Data::EntryPtr entry, coll->entries()) {
0294         KJob* job = new EntryUpdateJob(this, entry, fetcher);
0295         job->exec();
0296       }
0297     }
0298   }
0299 
0300   // finally
0301   foreach(Data::EntryPtr entry, coll->entries()) {
0302     if(entry->title().isEmpty()) {
0303       // use file name
0304       QUrl u = QUrl::fromLocalFile(entry->field(QStringLiteral("url")));
0305       entry->setField(QStringLiteral("title"), u.fileName());
0306     }
0307   }
0308 
0309   if(m_cancelled) {
0310     return Data::CollPtr();
0311   }
0312   return coll;
0313 }
0314 
0315 void PDFImporter::slotCancel() {
0316   m_cancelled = true;
0317 }