File indexing completed on 2024-12-29 04:51:01
0001 /* 0002 SPDX-FileCopyrightText: 2018-2021 Volker Krause <vkrause@kde.org> 0003 0004 SPDX-License-Identifier: LGPL-2.0-or-later 0005 */ 0006 0007 #include "pdfdocumentprocessor.h" 0008 0009 #include "barcodedocumentprocessorhelper.h" 0010 #include "genericpriceextractorhelper_p.h" 0011 0012 #include "pdf/pdfbarcodeutil_p.h" 0013 #include "text/nameoptimizer_p.h" 0014 0015 #include <KItinerary/BarcodeDecoder> 0016 #include <KItinerary/ExtractorDocumentNodeFactory> 0017 #include <KItinerary/ExtractorEngine> 0018 #include <KItinerary/ExtractorResult> 0019 #include <KItinerary/PdfDocument> 0020 0021 #include <QImage> 0022 #include <QJSEngine> 0023 0024 0025 using namespace KItinerary; 0026 0027 Q_DECLARE_METATYPE(KItinerary::Internal::OwnedPtr<KItinerary::PdfDocument>) 0028 0029 enum { 0030 MaxPageCount = 10, // maximum in the current test set is 6 0031 MaxFileSize = 10000000, // maximum in the current test set is ~9MB 0032 }; 0033 0034 PdfDocumentProcessor::PdfDocumentProcessor() = default; 0035 PdfDocumentProcessor::~PdfDocumentProcessor() = default; 0036 0037 bool PdfDocumentProcessor::canHandleData(const QByteArray &encodedData, QStringView fileName) const 0038 { 0039 return PdfDocument::maybePdf(encodedData) || 0040 fileName.endsWith(QLatin1StringView(".pdf"), Qt::CaseInsensitive); 0041 } 0042 0043 static void applyContextDateTime(PdfDocument *pdf, ExtractorDocumentNode &node) 0044 { 0045 // ignore broken PDF times for Amadeus documents 0046 if (pdf->producer() == QLatin1StringView("Amadeus") && 0047 pdf->creationTime() == pdf->modificationTime() && 0048 pdf->creationTime().date().year() <= 2013) { 0049 return; 0050 } 0051 0052 auto dt = pdf->modificationTime(); 0053 if (!dt.isValid()) { 0054 dt = pdf->creationTime(); 0055 } 0056 if (dt.isValid() && dt.date().year() > 2000 && dt < QDateTime::currentDateTime()) { 0057 node.setContextDateTime(dt); 0058 } 0059 } 0060 0061 ExtractorDocumentNode PdfDocumentProcessor::createNodeFromData(const QByteArray &encodedData) const 0062 { 0063 auto pdf = PdfDocument::fromData(encodedData); 0064 // stay away from documents that are atypically large for what we are looking for 0065 // that's just unnecessarily eating up resources 0066 if (!pdf || pdf->pageCount() > MaxPageCount || pdf->fileSize() > MaxFileSize) { 0067 delete pdf; 0068 return {}; 0069 } 0070 0071 ExtractorDocumentNode node; 0072 node.setContent<Internal::OwnedPtr<PdfDocument>>(pdf); 0073 applyContextDateTime(pdf, node); 0074 return node; 0075 } 0076 0077 ExtractorDocumentNode PdfDocumentProcessor::createNodeFromContent(const QVariant &decodedData) const 0078 { 0079 auto pdf = decodedData.value<PdfDocument*>(); 0080 // stay away from documents that are atypically large for what we are looking for 0081 // that's just unnecessarily eating up resources 0082 if (!pdf || pdf->pageCount() > MaxPageCount || pdf->fileSize() > MaxFileSize) { 0083 return {}; 0084 } 0085 0086 ExtractorDocumentNode node; 0087 node.setContent(pdf); 0088 applyContextDateTime(pdf, node); 0089 return node; 0090 } 0091 0092 void PdfDocumentProcessor::expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const 0093 { 0094 const auto doc = node.content<PdfDocument*>(); 0095 0096 for (int i = 0; i < doc->pageCount(); ++i) { 0097 const auto page = doc->page(i); 0098 m_imageIds.clear(); 0099 0100 for (int j = 0; j < page.imageCount(); ++j) { 0101 auto img = page.image(j); 0102 img.setLoadingHints(PdfImage::AbortOnColorHint | PdfImage::ConvertToGrayscaleHint); // we only care about b/w-ish images for barcode detection 0103 if (img.hasObjectId() && m_imageIds.find(img.objectId()) != m_imageIds.end()) { 0104 continue; 0105 } 0106 0107 const auto barcodeHints = PdfBarcodeUtil::maybeBarcode(img, BarcodeDecoder::Any2D | BarcodeDecoder::Any1D); 0108 if (barcodeHints == BarcodeDecoder::None) { 0109 continue; 0110 } 0111 0112 const auto imgData = img.image(); 0113 if (imgData.isNull()) { // can happen due to AbortOnColorHint 0114 continue; 0115 } 0116 0117 auto childNode = engine->documentNodeFactory()->createNode(imgData, u"internal/qimage"); 0118 childNode.setLocation(i); 0119 node.appendChild(childNode); // TODO the old code de-duplicated repeated barcodes here - do we actually need that? 0120 if (img.hasObjectId()) { 0121 m_imageIds.insert(img.objectId()); 0122 } 0123 0124 // technically not our job to do this here rather than letting the image node processor handle this 0125 // but we have the output aspect ratio of the barcode only here, which gives better decoding hints 0126 if (BarcodeDocumentProcessorHelper::expandNode(imgData, barcodeHints, childNode, engine)) { 0127 continue; 0128 } 0129 0130 // if this failed, check if the image as a aspect-ratio distorting scale and try again with that 0131 if (img.hasAspectRatioTransform()) { 0132 BarcodeDocumentProcessorHelper::expandNode(img.applyAspectRatioTransform(imgData), barcodeHints, childNode, engine); 0133 } 0134 } 0135 0136 // handle full page raster images 0137 if ((engine->hints() & ExtractorEngine::ExtractFullPageRasterImages) && page.imageCount() == 1 && page.text().isEmpty()) { 0138 qDebug() << "full page raster image"; 0139 auto img = page.image(0); 0140 if (img.hasObjectId() && m_imageIds.find(img.objectId()) != m_imageIds.end()) { // already handled 0141 continue; 0142 } 0143 0144 img.setLoadingHints(PdfImage::NoHint); // don't abort on color 0145 const auto imgData = img.image(); 0146 if (imgData.isNull()) { 0147 continue; 0148 } 0149 0150 auto childNode = engine->documentNodeFactory()->createNode(imgData, u"internal/qimage"); 0151 childNode.setLocation(i); 0152 node.appendChild(childNode); 0153 if (img.hasObjectId()) { 0154 m_imageIds.insert(img.objectId()); 0155 } 0156 } 0157 } 0158 0159 // fallback node for implicit conversion to plain text 0160 auto fallback = engine->documentNodeFactory()->createNode(doc->text(), u"text/plain"); 0161 node.appendChild(fallback); 0162 } 0163 0164 void PdfDocumentProcessor::postExtract(ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const 0165 { 0166 // find the text node we can run the optimizer on 0167 if (node.childNodes().empty() || node.result().isEmpty()) { 0168 return; 0169 } 0170 const QString text = node.childNodes().back().content<QString>(); 0171 0172 // run name optimizer on all results 0173 QList<QVariant> result; 0174 const auto res = node.result().result(); 0175 result.reserve(res.size()); 0176 for (const auto &r : res) { 0177 result.push_back(NameOptimizer::optimizeNameRecursive(text, r)); 0178 } 0179 node.setResult(std::move(result)); 0180 0181 // look for price data, if we have chance of that being unambiguous 0182 const auto doc = node.content<PdfDocument*>(); 0183 if (node.result().size() == 1 || doc->pageCount() == 1) { 0184 GenericPriceExtractorHelper::postExtract(text, node); 0185 } 0186 } 0187 0188 QJSValue PdfDocumentProcessor::contentToScriptValue(const ExtractorDocumentNode &node, QJSEngine *engine) const 0189 { 0190 return engine->toScriptValue(node.content<PdfDocument*>()); 0191 } 0192 0193 void PdfDocumentProcessor::destroyNode(ExtractorDocumentNode &node) const 0194 { 0195 destroyIfOwned<PdfDocument>(node); 0196 }