File indexing completed on 2024-05-12 15:37:06

0001 /*
0002     SPDX-FileCopyrightText: 2012 Vishesh Handa <me@vhanda.in>
0003     SPDX-FileCopyrightText: 2012 Jörg Ehrichs <joerg.ehrichs@gmx.de>
0004 
0005     SPDX-License-Identifier: LGPL-2.1-or-later
0006 */
0007 
0008 
0009 #include "popplerextractor.h"
0010 
0011 #include <QScopedPointer>
0012 #include <QDebug>
0013 #include <QDateTime>
0014 
0015 using namespace KFileMetaData;
0016 
0017 PopplerExtractor::PopplerExtractor(QObject* parent)
0018     : ExtractorPlugin(parent)
0019 {
0020 
0021 }
0022 
0023 const QStringList supportedMimeTypes = {
0024     QStringLiteral("application/pdf"),
0025 };
0026 
0027 QStringList PopplerExtractor::mimetypes() const
0028 {
0029     return supportedMimeTypes;
0030 }
0031 
0032 void PopplerExtractor::extract(ExtractionResult* result)
0033 {
0034     const QString fileUrl = result->inputUrl();
0035     std::unique_ptr<Poppler::Document> pdfDoc(Poppler::Document::load(fileUrl, QByteArray(), QByteArray()));
0036 
0037     if (!pdfDoc || pdfDoc->isLocked()) {
0038         return;
0039     }
0040 
0041     result->addType(Type::Document);
0042 
0043     if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
0044         const QString title = pdfDoc->title();
0045         if (!title.isEmpty()) {
0046             result->add(Property::Title, title);
0047         }
0048 
0049         const QString subject = pdfDoc->subject();
0050         if (!subject.isEmpty()) {
0051             result->add(Property::Subject, subject);
0052         }
0053 
0054         const QString author = pdfDoc->author();
0055         if (!author.isEmpty()) {
0056             result->add(Property::Author, author);
0057         }
0058 
0059         const QString generator = pdfDoc->producer();
0060         if (!generator.isEmpty()) {
0061             result->add(Property::Generator, generator);
0062         }
0063 
0064         const QDateTime creationDate = pdfDoc->creationDate();
0065         if (!creationDate.isNull()) {
0066             result->add(Property::CreationDate, creationDate);
0067         }
0068 
0069         const int numPages = pdfDoc->numPages();
0070         if (numPages > 0) {
0071             result->add(Property::PageCount, numPages);
0072         }
0073     }
0074 
0075     if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) {
0076         return;
0077     }
0078 
0079     for (int i = 0; i < pdfDoc->numPages(); i++) {
0080         std::unique_ptr<Poppler::Page> page(pdfDoc->page(i));
0081         if (!page) { // broken pdf files do not return a valid page
0082             qWarning() << "Could not read page content from" << fileUrl;
0083             break;
0084         }
0085         result->append(page->text(QRectF()));
0086     }
0087 }
0088 
0089 #include "moc_popplerextractor.cpp"