File indexing completed on 2024-05-12 05:10:12
0001 /*************************************************************************** 0002 Copyright (C) 2007-2009 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "pdfimporter.h" 0026 #include "tellicoimporter.h" 0027 #include "xslthandler.h" 0028 #include "xmphandler.h" 0029 #include "../collections/bookcollection.h" 0030 #include "../collections/bibtexcollection.h" 0031 #include "../fieldformat.h" 0032 #include "../core/filehandler.h" 0033 #include "../core/netaccess.h" 0034 #include "../images/imagefactory.h" 0035 #include "../utils/guiproxy.h" 0036 #include "../fetch/fetchmanager.h" 0037 #include "../progressmanager.h" 0038 #include "../utils/cursorsaver.h" 0039 #include "../entryupdatejob.h" 0040 #include "../utils/datafileregistry.h" 0041 #include "../tellico_debug.h" 0042 0043 #include <KMessageBox> 0044 #include <KLocalizedString> 0045 0046 #include <QString> 0047 #include <QPixmap> 0048 #include <QApplication> 0049 #include <QFile> 0050 #include <QScopedPointer> 0051 0052 #include <config.h> 0053 #ifdef HAVE_POPPLER 0054 #include <poppler-qt5.h> 0055 #endif 0056 0057 namespace { 0058 static const int PDF_FILE_PREVIEW_SIZE = 196; 0059 } 0060 0061 using Tellico::Import::PDFImporter; 0062 0063 PDFImporter::PDFImporter(const QUrl& url_) : Importer(url_), m_cancelled(false) { 0064 } 0065 0066 PDFImporter::PDFImporter(const QList<QUrl>& urls_) : Importer(urls_), m_cancelled(false) { 0067 } 0068 0069 bool PDFImporter::canImport(int type_) const { 0070 return type_ == Data::Collection::Book || type_ == Data::Collection::Bibtex; 0071 } 0072 0073 Tellico::Data::CollPtr PDFImporter::collection() { 0074 QString xsltFile = DataFileRegistry::self()->locate(QStringLiteral("xmp2tellico.xsl")); 0075 if(xsltFile.isEmpty()) { 0076 myWarning() << "can not locate xmp2tellico.xsl"; 0077 return Data::CollPtr(); 0078 } 0079 0080 ProgressItem& item = ProgressManager::self()->newProgressItem(this, progressLabel(), true); 0081 item.setTotalSteps(urls().count()); 0082 connect(&item, &Tellico::ProgressItem::signalCancelled, this, &Tellico::Import::PDFImporter::slotCancel); 0083 ProgressItem::Done done(this); 0084 const bool showProgress = options() & ImportProgress; 0085 0086 QUrl u = QUrl::fromLocalFile(xsltFile); 0087 0088 XSLTHandler xsltHandler(u); 0089 if(!xsltHandler.isValid()) { 0090 myWarning() << "invalid xslt in xmp2tellico.xsl"; 0091 return Data::CollPtr(); 0092 } 0093 bool isBook = false; 0094 if(currentCollection() && currentCollection()->type() == Data::Collection::Book) { 0095 xsltHandler.addStringParam("ctype", "2"); // book if already existing 0096 isBook = true; 0097 } else { 0098 xsltHandler.addStringParam("ctype", "5"); // bibtex by default 0099 } 0100 0101 bool hasDOI = false; 0102 bool hasArxiv = false; 0103 0104 uint j = 0; 0105 0106 Data::CollPtr coll; 0107 XMPHandler xmpHandler; 0108 QList<QUrl> list = urls(); 0109 for(QList<QUrl>::Iterator it = list.begin(); it != list.end() && !m_cancelled; ++it, ++j) { 0110 const QScopedPointer<FileHandler::FileRef> ref(FileHandler::fileRef(*it)); 0111 if(!ref->isValid()) { 0112 continue; 0113 } 0114 0115 Data::CollPtr newColl; 0116 Data::EntryPtr entry; 0117 0118 QString xmp = xmpHandler.extractXMP(ref->fileName()); 0119 // myDebug() << xmp; 0120 if(xmp.isEmpty()) { 0121 setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file.")); 0122 } else { 0123 setStatusMessage(QString()); 0124 #if 0 0125 myWarning() << "Remove debug from pdfimporter.cpp"; 0126 QFile f(QString::fromLatin1("/tmp/test-xmp.xml")); 0127 if(f.open(QIODevice::WriteOnly)) { 0128 QTextStream t(&f); 0129 t.setCodec("UTF-8"); 0130 t << xmp; 0131 } 0132 f.close(); 0133 #endif 0134 Import::TellicoImporter importer(xsltHandler.applyStylesheet(xmp)); 0135 newColl = importer.collection(); 0136 if(!newColl || newColl->entryCount() == 0) { 0137 myWarning() << "no collection found"; 0138 setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file.")); 0139 } else { 0140 entry = newColl->entries().front(); 0141 hasDOI |= !entry->field(QStringLiteral("doi")).isEmpty(); 0142 // the XMP handler has a habit of inserting empty values surrounded by parentheses 0143 static const QRegularExpression rx(QLatin1String("^\\(\\s*\\)$")); 0144 foreach(Data::FieldPtr field, newColl->fields()) { 0145 QString value = entry->field(field); 0146 if(value.contains(rx)) { 0147 entry->setField(field, QString()); 0148 } 0149 } 0150 } 0151 } 0152 0153 #ifdef HAVE_POPPLER 0154 if(!newColl) { 0155 if(isBook) { 0156 newColl = new Data::BookCollection(true); 0157 } else { 0158 newColl = new Data::BibtexCollection(true); 0159 } 0160 } 0161 if(!entry) { 0162 entry = new Data::Entry(newColl); 0163 newColl->addEntries(entry); 0164 } 0165 0166 // now load from poppler 0167 Poppler::Document* doc = Poppler::Document::load(ref->fileName()); 0168 if(doc && !doc->isLocked()) { 0169 // now the question is, do we overwrite XMP data with Poppler data? 0170 // for now, let's say yes conditionally 0171 QString s = doc->info(QStringLiteral("Title")).simplified(); 0172 if(!s.isEmpty()) { 0173 entry->setField(QStringLiteral("title"), s); 0174 } 0175 // author could be separated by commas, "and" or whatever 0176 // we're not going to overwrite it 0177 if(entry->field(QStringLiteral("author")).isEmpty()) { 0178 static const QRegularExpression rx(QLatin1String("\\s*(\\s+and\\s+|,|;)\\s*")); 0179 QStringList authors = doc->info(QStringLiteral("Author")).simplified().split(rx); 0180 entry->setField(QStringLiteral("author"), authors.join(FieldFormat::delimiterString())); 0181 } 0182 s = doc->info(QStringLiteral("Keywords")).simplified(); 0183 if(s.isEmpty()) { 0184 s = doc->info(QStringLiteral("Subject")).simplified(); 0185 } 0186 if(!s.isEmpty()) { 0187 // keywords are also separated by semi-colons in poppler 0188 entry->setField(QStringLiteral("keyword"), s); 0189 } 0190 0191 // now parse the first page text and try to guess 0192 Poppler::Page* page = doc->page(0); 0193 if(page) { 0194 // a null rectangle means get all text on page 0195 QString text = page->text(QRectF()); 0196 // borrowed from Referencer 0197 static const QRegularExpression doiRx(QLatin1String("(?:" 0198 "(?:[Dd][Oo][Ii]:? *)" 0199 "|" 0200 "(?:[Dd]igital *[Oo]bject *[Ii]dentifier:? *)" 0201 ")" 0202 "(" 0203 "[^\\.\\s]+" 0204 "\\." 0205 "[^\\/\\s]+" 0206 "\\/" 0207 "[^\\s]+" 0208 ")")); 0209 QRegularExpressionMatch m = doiRx.match(text); 0210 if(!m.hasMatch()) { 0211 static const QRegularExpression doiUrlRx(QLatin1String("https?://(?:dx\\.)?doi\\.org/(10.\\d{4,9}/[-._;()/:a-zA-Z0-9]+)")); 0212 m = doiUrlRx.match(text); 0213 } 0214 if(m.hasMatch()) { 0215 const QString doi = m.captured(1); 0216 myLog() << "In PDF file, found DOI:" << doi; 0217 entry->setField(QStringLiteral("doi"), doi); 0218 hasDOI = true; 0219 } 0220 static const QRegularExpression arxivRx(QLatin1String("arXiv:" 0221 "(" 0222 "[^\\/\\s]+" 0223 "[\\/\\.]" 0224 "[^\\s]+" 0225 ")")); 0226 m = arxivRx.match(text); 0227 if(m.hasMatch()) { 0228 const QString arxiv = m.captured(1); 0229 myLog() << "in PDF file, found arxiv:" << arxiv; 0230 if(!entry->collection()->hasField(QStringLiteral("arxiv"))) { 0231 Data::FieldPtr field(new Data::Field(QStringLiteral("arxiv"), i18n("arXiv ID"))); 0232 field->setCategory(i18n("Publishing")); 0233 entry->collection()->addField(field); 0234 } 0235 entry->setField(QStringLiteral("arxiv"), arxiv); 0236 hasArxiv = true; 0237 } 0238 0239 delete page; 0240 } 0241 } else { 0242 myDebug() << "unable to read PDF info (poppler)"; 0243 } 0244 delete doc; 0245 #elif defined HAVE_KFILEMETADATA 0246 if(!newColl || newColl->entryCount() == 0) { 0247 myDebug() << "Reading with metadata"; 0248 EBookImporter imp(urls()); 0249 auto ebookColl = imp.collection(); 0250 if(ebookColl && ebookColl->type() == Data::Collection::Book && !isBook) { 0251 newColl = Data::BibtexCollection::convertBookCollection(ebookColl); 0252 } else { 0253 newColl = ebookColl; 0254 } 0255 if(newColl->entryCount() > 0) { 0256 entry = new Data::Entry(newColl); 0257 newColl->addEntries(entry); 0258 } else { 0259 entry = newColl->entries().front(); 0260 } 0261 } 0262 #else 0263 // only recourse is to create an empty collection 0264 if(!newColl) { 0265 if(isBook) { 0266 newColl = new Data::BookCollection(true); 0267 } else { 0268 newColl = new Data::BibtexCollection(true); 0269 } 0270 } 0271 if(!entry) { 0272 entry = new Data::Entry(newColl); 0273 newColl->addEntries(entry); 0274 } 0275 #endif 0276 0277 if(!isBook) { 0278 entry->setField(QStringLiteral("url"), (*it).url()); 0279 // always an article? 0280 entry->setField(QStringLiteral("entry-type"), QStringLiteral("article")); 0281 } 0282 QPixmap pix = NetAccess::filePreview(QUrl::fromLocalFile(ref->fileName()), PDF_FILE_PREVIEW_SIZE); 0283 if(pix.isNull()) { 0284 myDebug() << "No file preview from pdf"; 0285 } else { 0286 // is png best option? 0287 QString id = ImageFactory::addImage(pix, QStringLiteral("PNG")); 0288 if(!id.isEmpty()) { 0289 Data::FieldPtr field = newColl->fieldByName(QStringLiteral("cover")); 0290 if(!field && !newColl->imageFields().isEmpty()) { 0291 field = newColl->imageFields().front(); 0292 } else if(!field) { 0293 field = Data::Field::createDefaultField(Data::Field::FrontCoverField); 0294 newColl->addField(field); 0295 } 0296 entry->setField(field, id); 0297 } 0298 } 0299 if(coll) { 0300 coll->addEntries(newColl->entries()); 0301 } else { 0302 coll = newColl; 0303 } 0304 0305 if(showProgress) { 0306 ProgressManager::self()->setProgress(this, j+1); 0307 qApp->processEvents(); 0308 } 0309 } 0310 0311 if(m_cancelled || !coll) { 0312 return Data::CollPtr(); 0313 } 0314 0315 if(hasDOI) { 0316 Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::DOI); 0317 if(vec.isEmpty() && GUI::Proxy::widget()) { 0318 GUI::CursorSaver cs(Qt::ArrowCursor); 0319 KMessageBox::information(GUI::Proxy::widget(), 0320 i18n("Tellico is able to download information about entries with a DOI from " 0321 "CrossRef.org. However, you must create an CrossRef account and add a new " 0322 "data source with your account information."), 0323 QString(), 0324 QStringLiteral("CrossRefSourceNeeded")); 0325 } else { 0326 foreach(Fetch::Fetcher::Ptr fetcher, vec) { 0327 foreach(Data::EntryPtr entry, coll->entries()) { 0328 KJob* job = new EntryUpdateJob(this, entry, fetcher); 0329 job->exec(); 0330 } 0331 } 0332 } 0333 } 0334 0335 if(m_cancelled) { 0336 return Data::CollPtr(); 0337 } 0338 0339 if(hasArxiv) { 0340 Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::ArxivID); 0341 foreach(Fetch::Fetcher::Ptr fetcher, vec) { 0342 foreach(Data::EntryPtr entry, coll->entries()) { 0343 KJob* job = new EntryUpdateJob(this, entry, fetcher); 0344 job->exec(); 0345 } 0346 } 0347 } 0348 0349 // finally 0350 foreach(Data::EntryPtr entry, coll->entries()) { 0351 if(entry->title().isEmpty()) { 0352 // use file name 0353 QUrl u = QUrl::fromLocalFile(entry->field(QStringLiteral("url"))); 0354 entry->setField(QStringLiteral("title"), u.fileName()); 0355 } 0356 } 0357 0358 if(m_cancelled) { 0359 return Data::CollPtr(); 0360 } 0361 return coll; 0362 } 0363 0364 void PDFImporter::slotCancel() { 0365 m_cancelled = true; 0366 }