File indexing completed on 2024-05-12 16:46:32
0001 /*************************************************************************** 0002 Copyright (C) 2007-2009 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "pdfimporter.h" 0026 #include "tellicoimporter.h" 0027 #include "xslthandler.h" 0028 #include "xmphandler.h" 0029 #include "../collections/bibtexcollection.h" 0030 #include "../fieldformat.h" 0031 #include "../core/filehandler.h" 0032 #include "../core/netaccess.h" 0033 #include "../images/imagefactory.h" 0034 #include "../utils/guiproxy.h" 0035 #include "../fetch/fetchmanager.h" 0036 #include "../progressmanager.h" 0037 #include "../utils/cursorsaver.h" 0038 #include "../entryupdatejob.h" 0039 #include "../utils/datafileregistry.h" 0040 #include "../tellico_debug.h" 0041 0042 #include <KMessageBox> 0043 #include <KLocalizedString> 0044 0045 #include <QString> 0046 #include <QPixmap> 0047 #include <QApplication> 0048 #include <QFile> 0049 #include <QScopedPointer> 0050 0051 #include <config.h> 0052 #ifdef HAVE_POPPLER 0053 #include <poppler-qt5.h> 0054 #endif 0055 0056 namespace { 0057 static const int PDF_FILE_PREVIEW_SIZE = 196; 0058 } 0059 0060 using Tellico::Import::PDFImporter; 0061 0062 PDFImporter::PDFImporter(const QUrl& url_) : Importer(url_), m_cancelled(false) { 0063 } 0064 0065 PDFImporter::PDFImporter(const QList<QUrl>& urls_) : Importer(urls_), m_cancelled(false) { 0066 } 0067 0068 bool PDFImporter::canImport(int type_) const { 0069 return type_ == Data::Collection::Bibtex; 0070 } 0071 0072 Tellico::Data::CollPtr PDFImporter::collection() { 0073 QString xsltFile = DataFileRegistry::self()->locate(QStringLiteral("xmp2tellico.xsl")); 0074 if(xsltFile.isEmpty()) { 0075 myWarning() << "can not locate xmp2tellico.xsl"; 0076 return Data::CollPtr(); 0077 } 0078 0079 ProgressItem& item = ProgressManager::self()->newProgressItem(this, progressLabel(), true); 0080 item.setTotalSteps(urls().count()); 0081 connect(&item, &Tellico::ProgressItem::signalCancelled, this, &Tellico::Import::PDFImporter::slotCancel); 0082 ProgressItem::Done done(this); 0083 const bool showProgress = options() & ImportProgress; 0084 0085 QUrl u = QUrl::fromLocalFile(xsltFile); 0086 0087 XSLTHandler xsltHandler(u); 0088 if(!xsltHandler.isValid()) { 0089 myWarning() << "invalid xslt in xmp2tellico.xsl"; 0090 return Data::CollPtr(); 0091 } 0092 0093 bool hasDOI = false; 0094 bool hasArxiv = false; 0095 0096 uint j = 0; 0097 0098 Data::CollPtr coll; 0099 XMPHandler xmpHandler; 0100 QList<QUrl> list = urls(); 0101 for(QList<QUrl>::Iterator it = list.begin(); it != list.end() && !m_cancelled; ++it, ++j) { 0102 const QScopedPointer<FileHandler::FileRef> ref(FileHandler::fileRef(*it)); 0103 if(!ref->isValid()) { 0104 continue; 0105 } 0106 0107 Data::CollPtr newColl; 0108 Data::EntryPtr entry; 0109 0110 QString xmp = xmpHandler.extractXMP(ref->fileName()); 0111 // myDebug() << xmp; 0112 if(xmp.isEmpty()) { 0113 setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file.")); 0114 } else { 0115 setStatusMessage(QString()); 0116 #if 0 0117 myWarning() << "Remove debug from pdfimporter.cpp"; 0118 QFile f(QString::fromLatin1("/tmp/test-xmp.xml")); 0119 if(f.open(QIODevice::WriteOnly)) { 0120 QTextStream t(&f); 0121 t.setCodec("UTF-8"); 0122 t << xmp; 0123 } 0124 f.close(); 0125 #endif 0126 Import::TellicoImporter importer(xsltHandler.applyStylesheet(xmp)); 0127 newColl = importer.collection(); 0128 if(!newColl || newColl->entryCount() == 0) { 0129 myWarning() << "no collection found"; 0130 setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file.")); 0131 } else { 0132 entry = newColl->entries().front(); 0133 hasDOI |= !entry->field(QStringLiteral("doi")).isEmpty(); 0134 // the XMP handler has a habit of inserting empty values surrounded by parentheses 0135 QRegularExpression rx(QLatin1String("^\\(\\s*\\)$")); 0136 foreach(Data::FieldPtr field, newColl->fields()) { 0137 QString value = entry->field(field); 0138 if(value.contains(rx)) { 0139 entry->setField(field, QString()); 0140 } 0141 } 0142 } 0143 } 0144 0145 if(!newColl) { 0146 newColl = new Data::BibtexCollection(true); 0147 } 0148 if(!entry) { 0149 entry = new Data::Entry(newColl); 0150 newColl->addEntries(entry); 0151 } 0152 0153 #ifdef HAVE_POPPLER 0154 0155 // now load from poppler 0156 Poppler::Document* doc = Poppler::Document::load(ref->fileName()); 0157 if(doc && !doc->isLocked()) { 0158 // now the question is, do we overwrite XMP data with Poppler data? 0159 // for now, let's say yes conditionally 0160 QString s = doc->info(QStringLiteral("Title")).simplified(); 0161 if(!s.isEmpty()) { 0162 entry->setField(QStringLiteral("title"), s); 0163 } 0164 // author could be separated by commas, "and" or whatever 0165 // we're not going to overwrite it 0166 if(entry->field(QStringLiteral("author")).isEmpty()) { 0167 QRegularExpression rx(QLatin1String("\\s*(\\s+and\\s+|,|;)\\s*")); 0168 QStringList authors = doc->info(QStringLiteral("Author")).simplified().split(rx); 0169 entry->setField(QStringLiteral("author"), authors.join(FieldFormat::delimiterString())); 0170 } 0171 s = doc->info(QStringLiteral("Keywords")).simplified(); 0172 if(!s.isEmpty()) { 0173 // keywords are also separated by semi-colons in poppler 0174 entry->setField(QStringLiteral("keyword"), s); 0175 } 0176 0177 // now parse the first page text and try to guess 0178 Poppler::Page* page = doc->page(0); 0179 if(page) { 0180 // a null rectangle means get all text on page 0181 QString text = page->text(QRectF()); 0182 // borrowed from Referencer 0183 QRegularExpression rx(QLatin1String("(?:" 0184 "(?:[Dd][Oo][Ii]:? *)" 0185 "|" 0186 "(?:[Dd]igital *[Oo]bject *[Ii]dentifier:? *)" 0187 ")" 0188 "(" 0189 "[^\\.\\s]+" 0190 "\\." 0191 "[^\\/\\s]+" 0192 "\\/" 0193 "[^\\s]+" 0194 ")")); 0195 QRegularExpressionMatch m = rx.match(text); 0196 if(m.hasMatch()) { 0197 QString doi = m.captured(1); 0198 myLog() << "in PDF file, found DOI:" << doi; 0199 entry->setField(QStringLiteral("doi"), doi); 0200 hasDOI = true; 0201 } 0202 rx = QRegularExpression(QLatin1String("arXiv:" 0203 "(" 0204 "[^\\/\\s]+" 0205 "[\\/\\.]" 0206 "[^\\s]+" 0207 ")")); 0208 m = rx.match(text); 0209 if(m.hasMatch()) { 0210 QString arxiv = m.captured(1); 0211 myLog() << "in PDF file, found arxiv:" << arxiv; 0212 if(!entry->collection()->hasField(QStringLiteral("arxiv"))) { 0213 Data::FieldPtr field(new Data::Field(QStringLiteral("arxiv"), i18n("arXiv ID"))); 0214 field->setCategory(i18n("Publishing")); 0215 entry->collection()->addField(field); 0216 } 0217 entry->setField(QStringLiteral("arxiv"), arxiv); 0218 hasArxiv = true; 0219 } 0220 0221 delete page; 0222 } 0223 } else { 0224 myDebug() << "unable to read PDF info (poppler)"; 0225 } 0226 delete doc; 0227 #endif 0228 0229 entry->setField(QStringLiteral("url"), (*it).url()); 0230 // always an article? 0231 entry->setField(QStringLiteral("entry-type"), QStringLiteral("article")); 0232 0233 QPixmap pix = NetAccess::filePreview(QUrl::fromLocalFile(ref->fileName()), PDF_FILE_PREVIEW_SIZE); 0234 if(pix.isNull()) { 0235 myDebug() << "No file preview from pdf"; 0236 } else { 0237 // is png best option? 0238 QString id = ImageFactory::addImage(pix, QStringLiteral("PNG")); 0239 if(!id.isEmpty()) { 0240 Data::FieldPtr field = newColl->fieldByName(QStringLiteral("cover")); 0241 if(!field && !newColl->imageFields().isEmpty()) { 0242 field = newColl->imageFields().front(); 0243 } else if(!field) { 0244 field = new Data::Field(QStringLiteral("cover"), i18n("Front Cover"), Data::Field::Image); 0245 newColl->addField(field); 0246 } 0247 entry->setField(field, id); 0248 } 0249 } 0250 if(coll) { 0251 coll->addEntries(newColl->entries()); 0252 } else { 0253 coll = newColl; 0254 } 0255 0256 if(showProgress) { 0257 ProgressManager::self()->setProgress(this, j+1); 0258 qApp->processEvents(); 0259 } 0260 } 0261 0262 if(m_cancelled) { 0263 return Data::CollPtr(); 0264 } 0265 0266 if(hasDOI) { 0267 Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::DOI); 0268 if(vec.isEmpty() && GUI::Proxy::widget()) { 0269 GUI::CursorSaver cs(Qt::ArrowCursor); 0270 KMessageBox::information(GUI::Proxy::widget(), 0271 i18n("Tellico is able to download information about entries with a DOI from " 0272 "CrossRef.org. However, you must create an CrossRef account and add a new " 0273 "data source with your account information."), 0274 QString(), 0275 QStringLiteral("CrossRefSourceNeeded")); 0276 } else { 0277 foreach(Fetch::Fetcher::Ptr fetcher, vec) { 0278 foreach(Data::EntryPtr entry, coll->entries()) { 0279 KJob* job = new EntryUpdateJob(this, entry, fetcher); 0280 job->exec(); 0281 } 0282 } 0283 } 0284 } 0285 0286 if(m_cancelled) { 0287 return Data::CollPtr(); 0288 } 0289 0290 if(hasArxiv) { 0291 Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::ArxivID); 0292 foreach(Fetch::Fetcher::Ptr fetcher, vec) { 0293 foreach(Data::EntryPtr entry, coll->entries()) { 0294 KJob* job = new EntryUpdateJob(this, entry, fetcher); 0295 job->exec(); 0296 } 0297 } 0298 } 0299 0300 // finally 0301 foreach(Data::EntryPtr entry, coll->entries()) { 0302 if(entry->title().isEmpty()) { 0303 // use file name 0304 QUrl u = QUrl::fromLocalFile(entry->field(QStringLiteral("url"))); 0305 entry->setField(QStringLiteral("title"), u.fileName()); 0306 } 0307 } 0308 0309 if(m_cancelled) { 0310 return Data::CollPtr(); 0311 } 0312 return coll; 0313 } 0314 0315 void PDFImporter::slotCancel() { 0316 m_cancelled = true; 0317 }