File indexing completed on 2024-12-29 04:50:59

0001 /*
0002     SPDX-FileCopyrightText: 2018 Volker Krause <vkrause@kde.org>
0003 
0004     SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006 
0007 #include "config-kitinerary.h"
0008 #include "pdfdocument.h"
0009 #include "pdfdocument_p.h"
0010 #include "pdfextractoroutputdevice_p.h"
0011 #include "pdfimage_p.h"
0012 #include "popplerglobalparams_p.h"
0013 #include "popplerutils_p.h"
0014 #include "logging.h"
0015 
0016 #include <QDebug>
0017 #include <QImage>
0018 #include <QScopedValueRollback>
0019 #include <QTimeZone>
0020 
0021 #include <DateInfo.h>
0022 #include <PDFDoc.h>
0023 #include <PDFDocEncoding.h>
0024 #include <Stream.h>
0025 
0026 #include <cmath>
0027 
0028 using namespace KItinerary;
0029 
0030 void PdfPagePrivate::load()
0031 {
0032     if (m_loaded) {
0033         return;
0034     }
0035 
0036     PopplerGlobalParams gp;
0037     PdfExtractorOutputDevice device;
0038     m_doc->m_popplerDoc->displayPageSlice(&device, m_pageNum + 1, 72, 72, 0, false, true, false, -1, -1, -1, -1);
0039     m_doc->m_popplerDoc->processLinks(&device, m_pageNum + 1);
0040     device.finalize();
0041     const auto pageRect = m_doc->m_popplerDoc->getPage(m_pageNum + 1)->getCropBox();
0042     std::unique_ptr<GooString> s(device.getText(pageRect->x1, pageRect->y1, pageRect->x2, pageRect->y2));
0043 
0044 #if KPOPPLER_VERSION >= QT_VERSION_CHECK(0, 72, 0)
0045     m_text = QString::fromUtf8(s->c_str());
0046 #else
0047     m_text = QString::fromUtf8(s->getCString());
0048 #endif
0049     m_images = std::move(device.m_images);
0050     for (auto it = m_images.begin(); it != m_images.end(); ++it) {
0051         (*it).d->m_page = this;
0052     }
0053 
0054     m_links = std::move(device.m_links);
0055     for (auto &link : m_links) {
0056         link.convertToPageRect(pageRect);
0057     }
0058 
0059     m_loaded = true;
0060 }
0061 
0062 PdfPage::PdfPage()
0063     : d(new PdfPagePrivate)
0064 {
0065 }
0066 
0067 PdfPage::PdfPage(const PdfPage&) = default;
0068 PdfPage::~PdfPage() = default;
0069 PdfPage& PdfPage::operator=(const PdfPage&) = default;
0070 
0071 QString PdfPage::text() const
0072 {
0073     d->load();
0074     return d->m_text;
0075 }
0076 
// Linear interpolation between begin and end:
// a ratio of 0 yields begin, a ratio of 1 yields end.
static double ratio(double begin, double end, double ratio)
{
    const double span = end - begin;
    return begin + span * ratio;
}
0081 
0082 QString PdfPage::textInRect(double left, double top, double right, double bottom) const
0083 {
0084     PopplerGlobalParams gp;
0085 
0086     const auto page = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1);
0087     const auto pageRect = page->getCropBox();
0088 
0089     double l;
0090     double t;
0091     double r;
0092     double b;
0093     switch (page->getRotate()) {
0094         case 0:
0095             l = ratio(pageRect->x1, pageRect->x2, left);
0096             t = ratio(pageRect->y1, pageRect->y2, top);
0097             r = ratio(pageRect->x1, pageRect->x2, right);
0098             b = ratio(pageRect->y1, pageRect->y2, bottom);
0099             break;
0100         case 90:
0101             l = ratio(pageRect->y1, pageRect->y2, left);
0102             t = ratio(pageRect->x1, pageRect->x2, top);
0103             r = ratio(pageRect->y1, pageRect->y2, right);
0104             b = ratio(pageRect->x1, pageRect->x2, bottom);
0105             break;
0106         default:
0107             qCWarning(Log) << "Unsupported page rotation!" << page->getRotate();
0108             return {};
0109     }
0110 
0111     TextOutputDev device(nullptr, false, 0, false, false);
0112     d->m_doc->m_popplerDoc->displayPageSlice(&device, d->m_pageNum + 1, 72, 72, 0, false, true, false, -1, -1, -1, -1);
0113     std::unique_ptr<GooString> s(device.getText(l, t, r, b));
0114 #if KPOPPLER_VERSION >= QT_VERSION_CHECK(0, 72, 0)
0115     return QString::fromUtf8(s->c_str());
0116 #else
0117     return QString::fromUtf8(s->getCString());
0118 #endif
0119 }
0120 
0121 int PdfPage::imageCount() const
0122 {
0123     d->load();
0124     return d->m_images.size();
0125 }
0126 
0127 PdfImage PdfPage::image(int index) const
0128 {
0129     d->load();
0130     return d->m_images[index];
0131 }
0132 
0133 QVariantList PdfPage::imagesVariant() const
0134 {
0135     d->load();
0136     QVariantList l;
0137     l.reserve(imageCount());
0138     std::for_each(d->m_images.begin(), d->m_images.end(), [&l](const PdfImage& img) { l.push_back(QVariant::fromValue(img)); });
0139     return l;
0140 }
0141 
0142 QVariantList PdfPage::imagesInRect(double left, double top, double right, double bottom) const
0143 {
0144     d->load();
0145     QVariantList l;
0146     PopplerGlobalParams gp;
0147     const auto pageRect = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1)->getCropBox();
0148 
0149     for (const auto &img : d->m_images) {
0150         if ((img.d->m_transform.dx() >= ratio(pageRect->x1, pageRect->x2, left) && img.d->m_transform.dx() <= ratio(pageRect->x1, pageRect->x2, right)) &&
0151             (img.d->m_transform.dy() >= ratio(pageRect->y1, pageRect->y2, top)  && img.d->m_transform.dy() <= ratio(pageRect->y1, pageRect->y2, bottom)))
0152         {
0153             l.push_back(QVariant::fromValue(img));
0154         }
0155     }
0156     return l;
0157 }
0158 
0159 int PdfPage::linkCount() const
0160 {
0161     d->load();
0162     return d->m_links.size();
0163 }
0164 
0165 PdfLink PdfPage::link(int index) const
0166 {
0167     d->load();
0168     return d->m_links[index];
0169 }
0170 
0171 QVariantList PdfPage::linksVariant() const
0172 {
0173     d->load();
0174     QVariantList l;
0175     l.reserve(d->m_links.size());
0176     std::transform(d->m_links.begin(), d->m_links.end(), std::back_inserter(l), [](const PdfLink &link) { return QVariant::fromValue(link); });
0177     return l;
0178 }
0179 
0180 QVariantList PdfPage::linksInRect(double left, double top, double right, double bottom) const
0181 {
0182     QRectF bbox(QPointF(left, top), QPointF(right, bottom));
0183     d->load();
0184 
0185     QVariantList l;
0186     for (const auto &link : d->m_links) {
0187         if (!link.area().intersects(bbox)) {
0188             continue;
0189         }
0190         l.push_back(QVariant::fromValue(link));
0191     }
0192 
0193     std::sort(l.begin(), l.end(), [](const auto &lhs, const auto &rhs) {
0194         const auto lhsLink = lhs.template value<PdfLink>();
0195         const auto rhsLink = rhs.template value<PdfLink>();
0196         if (lhsLink.area().top() == rhsLink.area().top()) {
0197             return lhsLink.area().left() < rhsLink.area().left();
0198         }
0199         return lhsLink.area().top() < rhsLink.area().top();
0200     });
0201 
0202     return l;
0203 }
0204 
// Converts a length in PDF points (72 per inch) to millimeters (25.4 per inch).
static constexpr inline double pdfToMM(double points)
{
    constexpr double mmPerInch = 25.4;
    constexpr double pointsPerInch = 72.0;
    return points * mmPerInch / pointsPerInch;
}
0209 
0210 int PdfPage::width() const
0211 {
0212     const auto page = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1);
0213     const auto rot = page->getRotate();
0214     if (rot == 90 || rot == 270) {
0215         return pdfToMM(page->getCropHeight());
0216     }
0217     return pdfToMM(page->getCropWidth());
0218 }
0219 
0220 int PdfPage::height() const
0221 {
0222     const auto page = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1);
0223     const auto rot = page->getRotate();
0224     if (rot == 90 || rot == 270) {
0225         return pdfToMM(page->getCropWidth());
0226     }
0227     return pdfToMM(page->getCropHeight());
0228 }
0229 
0230 
0231 PdfDocument::PdfDocument(QObject *parent)
0232     : QObject(parent)
0233     , d(new PdfDocumentPrivate)
0234 {
0235 }
0236 
0237 PdfDocument::~PdfDocument() = default;
0238 
0239 QString PdfDocument::text() const
0240 {
0241     QString text;
0242     std::for_each(d->m_pages.begin(), d->m_pages.end(), [&text](const PdfPage &p) { text += p.text(); });
0243     return text;
0244 }
0245 
0246 int PdfDocument::pageCount() const
0247 {
0248     return d->m_popplerDoc->getNumPages();
0249 }
0250 
0251 PdfPage PdfDocument::page(int index) const
0252 {
0253     return d->m_pages[index];
0254 }
0255 
0256 int PdfDocument::fileSize() const
0257 {
0258     return d->m_pdfData.size();
0259 }
0260 
0261 #if KPOPPLER_VERSION >= QT_VERSION_CHECK(21, 8, 0)
0262 static QDateTime parsePdfDateTime(const GooString *str)
0263 #else
0264 static QDateTime parsePdfDateTime(const char *str)
0265 #endif
0266 {
0267     int year;
0268     int month;
0269     int day;
0270     int hour;
0271     int min;
0272     int sec;
0273     int tzHours;
0274     int tzMins;
0275     char tz;
0276 
0277     if (!parseDateString(str, &year, &month, &day, &hour, &min, &sec, &tz, &tzHours, &tzMins)) {
0278         return {};
0279     }
0280 
0281     QDate date(year, month, day);
0282     QTime time(hour, min, sec);
0283     if (!date.isValid() || !time.isValid()) {
0284         return {};
0285     }
0286 
0287     int offset = tzHours * 3600 + tzMins * 60;
0288     if (tz == '+') {
0289         return QDateTime(date, time, QTimeZone::fromSecondsAheadOfUtc(offset));
0290     } else if (tz == '-') {
0291         return QDateTime(date, time, QTimeZone::fromSecondsAheadOfUtc(-offset));
0292     }
0293     return QDateTime(date, time, QTimeZone::UTC);
0294 }
0295 
0296 QDateTime PdfDocument::creationTime() const
0297 {
0298     std::unique_ptr<GooString> dt(d->m_popplerDoc->getDocInfoCreatDate());
0299     if (!dt) {
0300         return {};
0301     }
0302 #if KPOPPLER_VERSION >= QT_VERSION_CHECK(21, 8, 0)
0303     return parsePdfDateTime(dt.get());
0304 #elif KPOPPLER_VERSION >= QT_VERSION_CHECK(0, 72, 0)
0305     return parsePdfDateTime(dt->c_str());
0306 #else
0307     return parsePdfDateTime(dt->getCString());
0308 #endif
0309 }
0310 
0311 QDateTime PdfDocument::modificationTime() const
0312 {
0313     std::unique_ptr<GooString> dt(d->m_popplerDoc->getDocInfoModDate());
0314     if (!dt) {
0315         return {};
0316     }
0317 #if KPOPPLER_VERSION >= QT_VERSION_CHECK(21, 8, 0)
0318     return parsePdfDateTime(dt.get());
0319 #elif KPOPPLER_VERSION >= QT_VERSION_CHECK(0, 72, 0)
0320     return parsePdfDateTime(dt->c_str());
0321 #else
0322     return parsePdfDateTime(dt->getCString());
0323 #endif
0324 }
0325 
0326 
0327 QString gooStringToUnicode(const std::unique_ptr<GooString> &s)
0328 {
0329     if (!s) {
0330         return {};
0331     }
0332 
0333     if (s->hasUnicodeMarker() || s->hasUnicodeMarkerLE()) {
0334         return QString::fromUtf16(reinterpret_cast<const char16_t*>(s->toStr().c_str()), s->toStr().size() / 2);
0335     } else {
0336         int len = 0;
0337         std::unique_ptr<const char[]> utf16Data(pdfDocEncodingToUTF16(s->toStr(), &len));
0338         return QString::fromUtf16(reinterpret_cast<const char16_t*>(utf16Data.get()), len / 2);
0339     }
0340 
0341     return QString::fromUtf8(s->c_str());
0342 }
0343 
0344 QString PdfDocument::title() const
0345 {
0346     return gooStringToUnicode(d->m_popplerDoc->getDocInfoTitle());
0347 }
0348 
0349 QString PdfDocument::producer() const
0350 {
0351     return gooStringToUnicode(d->m_popplerDoc->getDocInfoProducer());
0352 }
0353 
0354 QString PdfDocument::creator() const
0355 {
0356     return gooStringToUnicode(d->m_popplerDoc->getDocInfoCreator());
0357 }
0358 
0359 QString PdfDocument::author() const
0360 {
0361     return gooStringToUnicode(d->m_popplerDoc->getDocInfoAuthor());
0362 }
0363 
0364 QVariantList PdfDocument::pagesVariant() const
0365 {
0366     QVariantList l;
0367     l.reserve(pageCount());
0368     std::for_each(d->m_pages.begin(), d->m_pages.end(), [&l](const PdfPage& p) { l.push_back(QVariant::fromValue(p)); });
0369     return l;
0370 }
0371 
0372 PdfDocument* PdfDocument::fromData(const QByteArray &data, QObject *parent)
0373 {
0374     PopplerGlobalParams gp;
0375 
0376     std::unique_ptr<PdfDocument> doc(new PdfDocument(parent));
0377     doc->d->m_pdfData = data;
0378     // PDFDoc takes ownership of stream
0379 #if KPOPPLER_VERSION >= QT_VERSION_CHECK(0, 58, 0)
0380     auto stream = new MemStream(const_cast<char*>(doc->d->m_pdfData.constData()), 0, doc->d->m_pdfData.size(), Object());
0381 #else
0382     Object obj;
0383     obj.initNull();
0384     auto stream = new MemStream(const_cast<char*>(doc->d->m_pdfData.constData()), 0, doc->d->m_pdfData.size(), &obj);
0385 #endif
0386     std::unique_ptr<PDFDoc> popplerDoc(new PDFDoc(stream));
0387     if (!popplerDoc->isOk()) {
0388         qCWarning(Log) << "Got invalid PDF document!" << popplerDoc->getErrorCode();
0389         return nullptr;
0390     }
0391 
0392     doc->d->m_pages.reserve(popplerDoc->getNumPages());
0393     for (int i = 0; i < popplerDoc->getNumPages(); ++i) {
0394         PdfPage page;
0395         page.d->m_pageNum = i;
0396         page.d->m_doc = doc->d.get();
0397         doc->d->m_pages.push_back(page);
0398     }
0399 
0400     doc->d->m_popplerDoc = std::move(popplerDoc);
0401     return doc.release();
0402 }
0403 
0404 bool PdfDocument::maybePdf(const QByteArray &data)
0405 {
0406     return data.startsWith("%PDF");
0407 }
0408 
0409 #include "moc_pdfdocument.cpp"