lib/processors/pdfdocumentprocessor.cpp

0001 /*
0002     SPDX-FileCopyrightText: 2018-2021 Volker Krause <vkrause@kde.org>
0003
0004     SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006
0007 #include "pdfdocumentprocessor.h"
0008
0009 #include "barcodedocumentprocessorhelper.h"
0010 #include "genericpriceextractorhelper_p.h"
0011
0012 #include "pdf/pdfbarcodeutil_p.h"
0013 #include "text/nameoptimizer_p.h"
0014
0015 #include <KItinerary/BarcodeDecoder>
0016 #include <KItinerary/ExtractorDocumentNodeFactory>
0017 #include <KItinerary/ExtractorEngine>
0018 #include <KItinerary/ExtractorResult>
0019 #include <KItinerary/PdfDocument>
0020
0021 #include <QImage>
0022 #include <QJSEngine>
0023
0024
0025 using namespace KItinerary;
0026
0027 Q_DECLARE_METATYPE(KItinerary::Internal::OwnedPtr<KItinerary::PdfDocument>)
0028
// Upper limits on the documents this processor accepts; createNodeFromData()
// and createNodeFromContent() return an empty node for anything above them.
enum {
    // largest document in the current test set has 6 pages
    MaxPageCount = 10,
    // largest document in the current test set is ~9MB
    MaxFileSize = 10'000'000,
};
0033
// Constructor and destructor are compiler-generated, but defined out of line
// in this translation unit rather than in the header.
PdfDocumentProcessor::PdfDocumentProcessor() = default;
PdfDocumentProcessor::~PdfDocumentProcessor() = default;
0036
0037 bool PdfDocumentProcessor::canHandleData(const QByteArray &encodedData, QStringView fileName) const
0038 {
0039   return PdfDocument::maybePdf(encodedData) ||
0040          fileName.endsWith(QLatin1StringView(".pdf"), Qt::CaseInsensitive);
0041 }
0042
0043 static void applyContextDateTime(PdfDocument *pdf, ExtractorDocumentNode &node)
0044 {
0045     // ignore broken PDF times for Amadeus documents
0046     if (pdf->producer() == QLatin1StringView("Amadeus") &&
0047         pdf->creationTime() == pdf->modificationTime() &&
0048         pdf->creationTime().date().year() <= 2013) {
0049       return;
0050     }
0051
0052     auto dt = pdf->modificationTime();
0053     if (!dt.isValid()) {
0054         dt = pdf->creationTime();
0055     }
0056     if (dt.isValid() && dt.date().year() > 2000 && dt < QDateTime::currentDateTime()) {
0057         node.setContextDateTime(dt);
0058     }
0059 }
0060
0061 ExtractorDocumentNode PdfDocumentProcessor::createNodeFromData(const QByteArray &encodedData) const
0062 {
0063     auto pdf = PdfDocument::fromData(encodedData);
0064     // stay away from documents that are atypically large for what we are looking for
0065     // that's just unnecessarily eating up resources
0066     if (!pdf || pdf->pageCount() > MaxPageCount || pdf->fileSize() > MaxFileSize) {
0067         delete pdf;
0068         return {};
0069     }
0070
0071     ExtractorDocumentNode node;
0072     node.setContent<Internal::OwnedPtr<PdfDocument>>(pdf);
0073     applyContextDateTime(pdf, node);
0074     return node;
0075 }
0076
0077 ExtractorDocumentNode PdfDocumentProcessor::createNodeFromContent(const QVariant &decodedData) const
0078 {
0079     auto pdf = decodedData.value<PdfDocument*>();
0080     // stay away from documents that are atypically large for what we are looking for
0081     // that's just unnecessarily eating up resources
0082     if (!pdf || pdf->pageCount() > MaxPageCount || pdf->fileSize() > MaxFileSize) {
0083         return {};
0084     }
0085
0086     ExtractorDocumentNode node;
0087     node.setContent(pdf);
0088     applyContextDateTime(pdf, node);
0089     return node;
0090 }
0091
0092 void PdfDocumentProcessor::expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const
0093 {
0094     const auto doc = node.content<PdfDocument*>();
0095
0096     for (int i = 0; i < doc->pageCount(); ++i) {
0097         const auto page = doc->page(i);
0098         m_imageIds.clear();
0099
0100         for (int j = 0; j < page.imageCount(); ++j) {
0101             auto img = page.image(j);
0102             img.setLoadingHints(PdfImage::AbortOnColorHint | PdfImage::ConvertToGrayscaleHint); // we only care about b/w-ish images for barcode detection
0103             if (img.hasObjectId() &&  m_imageIds.find(img.objectId()) != m_imageIds.end()) {
0104                 continue;
0105             }
0106
0107             const auto barcodeHints = PdfBarcodeUtil::maybeBarcode(img, BarcodeDecoder::Any2D | BarcodeDecoder::Any1D);
0108             if (barcodeHints == BarcodeDecoder::None) {
0109                 continue;
0110             }
0111
0112             const auto imgData = img.image();
0113             if (imgData.isNull()) { // can happen due to AbortOnColorHint
0114                 continue;
0115             }
0116
0117             auto childNode = engine->documentNodeFactory()->createNode(imgData, u"internal/qimage");
0118             childNode.setLocation(i);
0119             node.appendChild(childNode); // TODO the old code de-duplicated repeated barcodes here - do we actually need that?
0120             if (img.hasObjectId()) {
0121                 m_imageIds.insert(img.objectId());
0122             }
0123
0124             // technically not our job to do this here rather than letting the image node processor handle this
0125             // but we have the output aspect ratio of the barcode only here, which gives better decoding hints
0126             if (BarcodeDocumentProcessorHelper::expandNode(imgData, barcodeHints, childNode, engine)) {
0127                 continue;
0128             }
0129
0130             // if this failed, check if the image as a aspect-ratio distorting scale and try again with that
0131             if (img.hasAspectRatioTransform()) {
0132                 BarcodeDocumentProcessorHelper::expandNode(img.applyAspectRatioTransform(imgData), barcodeHints, childNode, engine);
0133             }
0134         }
0135
0136         // handle full page raster images
0137         if ((engine->hints() & ExtractorEngine::ExtractFullPageRasterImages) && page.imageCount() == 1 && page.text().isEmpty()) {
0138             qDebug() << "full page raster image";
0139             auto img = page.image(0);
0140             if (img.hasObjectId() &&  m_imageIds.find(img.objectId()) != m_imageIds.end()) { // already handled
0141                 continue;
0142             }
0143
0144             img.setLoadingHints(PdfImage::NoHint); // don't abort on color
0145             const auto imgData = img.image();
0146             if (imgData.isNull()) {
0147                 continue;
0148             }
0149
0150             auto childNode = engine->documentNodeFactory()->createNode(imgData, u"internal/qimage");
0151             childNode.setLocation(i);
0152             node.appendChild(childNode);
0153             if (img.hasObjectId()) {
0154                 m_imageIds.insert(img.objectId());
0155             }
0156         }
0157     }
0158
0159     // fallback node for implicit conversion to plain text
0160     auto fallback = engine->documentNodeFactory()->createNode(doc->text(), u"text/plain");
0161     node.appendChild(fallback);
0162 }
0163
0164 void PdfDocumentProcessor::postExtract(ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const
0165 {
0166     // find the text node we can run the optimizer on
0167     if (node.childNodes().empty() || node.result().isEmpty()) {
0168         return;
0169     }
0170     const QString text = node.childNodes().back().content<QString>();
0171
0172     // run name optimizer on all results
0173     QList<QVariant> result;
0174     const auto res = node.result().result();
0175     result.reserve(res.size());
0176     for (const auto &r : res) {
0177         result.push_back(NameOptimizer::optimizeNameRecursive(text, r));
0178     }
0179     node.setResult(std::move(result));
0180
0181     // look for price data, if we have chance of that being unambiguous
0182     const auto doc = node.content<PdfDocument*>();
0183     if (node.result().size() == 1 || doc->pageCount() == 1) {
0184         GenericPriceExtractorHelper::postExtract(text, node);
0185     }
0186 }
0187
0188 QJSValue PdfDocumentProcessor::contentToScriptValue(const ExtractorDocumentNode &node, QJSEngine *engine) const
0189 {
0190     return engine->toScriptValue(node.content<PdfDocument*>());
0191 }
0192
// Cleanup hook for a PDF document node; delegates entirely to destroyIfOwned<PdfDocument>().
// NOTE(review): presumably this frees the document only when the node holds it as
// Internal::OwnedPtr<PdfDocument> (as set by createNodeFromData()) - destroyIfOwned is
// defined elsewhere, confirm there.
void PdfDocumentProcessor::destroyNode(ExtractorDocumentNode &node) const
{
    destroyIfOwned<PdfDocument>(node);
}