File indexing completed on 2024-05-05 05:00:05

0001 /*
0002     SPDX-FileCopyrightText: 2001 Malte Starostik <malte@kde.org>
0003     SPDX-FileCopyrightText: 2020 Jonathan Marten <jjm@keelhaul.me.uk>
0004 
0005     SPDX-License-Identifier: GPL-2.0-or-later
0006 */
0007 
0008 #include "webarchivecreator.h"
0009 
0010 #include <QDebug>
0011 #include <QPixmap>
0012 #include <QImage>
0013 #include <QApplication>
0014 #include <QUrl>
0015 #include <QTimer>
0016 #include <QMimeType>
0017 #include <QMimeDatabase>
0018 #include <QTemporaryDir>
0019 
0020 #ifdef THUMBNAIL_USE_WEBKIT
0021 #include <qwebview.h>
0022 #include <qwebpage.h>
0023 #include <qwebsettings.h>
0024 #include <QNetworkCookie>
0025 #else // THUMBNAIL_USE_WEBKIT
0026 #include <QWebEngineView>
0027 #include <QWebEnginePage>
0028 #include <QWebEngineProfile>
0029 #include <QWebEngineSettings>
0030 #include <QWebEngineCookieStore>
0031 #endif // THUMBNAIL_USE_WEBKIT
0032 
0033 #include <ktar.h>
0034 #include <karchivedirectory.h>
0035 
0036 #include "webarchiverdebug.h"
0037 
0038 
0039 #undef SHOW_RENDER_WINDOW
0040 
0041 //TODO KF6: remove all instances of THUMBNAIL_USE_WEBKIT
0042 
0043 // This is an time limit for the entire thumbnail generation process
0044 // (page loading and rendering).  If it expires then it is assumed
0045 // that there is a problem and no thumbnail is generated.
0046 static const int c_completionTimeout = 5000;
0047 
0048 // After the page is loaded, the rendering happens in the background
0049 // with no way to find out when it has finished.  So this timer sets a
0050 // reasonable time for that to happen, when it expires the thumbnail
0051 // image is generated.
0052 static const int c_renderTimeout = 500;
0053 
0054 // The size of the pixmap onto which the rendered page is drawn, and
0055 // the rendering scale for the web page.  These settings have nothing
0056 // to do with the size of the pixmap requested when create() is called,
0057 // they are chosen for a reasonable rendering of the page (which should
0058 // work at an effective width of 800 pixels).  For the scale factor,
0059 // 0.25 is the minimum allowed by Qt.
0060 static const QSize c_pixmapSize = QSize(400, 600);
0061 static const double c_renderScale = 0.5;
0062 
0063 
0064 extern "C"
0065 {
0066     Q_DECL_EXPORT KIO::ThumbnailCreator *new_creator()
0067     {
0068         return (new WebArchiveCreator{nullptr, {}});
0069     }
0070 }
0071 
0072 WebArchiveCreator::WebArchiveCreator(QObject *parent, const QVariantList &va)
0073     : KIO::ThumbnailCreator(parent, va)
0074 {
0075     m_tempDir = nullptr;
0076 }
0077 
0078 
0079 WebArchiveCreator::~WebArchiveCreator()
0080 {
0081     delete m_tempDir;
0082 }
0083 
0084 
0085 #ifndef THUMBNAIL_USE_WEBKIT
0086 static bool disallowWebEngineCookies(const QWebEngineCookieStore::FilterRequest &req)
0087 {
0088     return (false);
0089 }
0090 #endif // THUMBNAIL_USE_WEBKIT
0091 
0092 KIO::ThumbnailResult WebArchiveCreator::create(const KIO::ThumbnailRequest& request)
0093 {
0094     // QImage img;
0095     // bool success = create(request.url().path(), request.targetSize().width(), request.targetSize().height(), img);
0096     // return success ? KIO::ThumbnailResult::pass(img) : KIO::ThumbnailResult::fail();
0097     QString path = request.url().path();
0098     int width = request.targetSize().width();
0099     int height = request.targetSize().height();
0100 
0101     QMimeDatabase db;
0102     // Only use the file path to look up its MIME type.  Web archives are
0103     // gzip-compressed tar files, so if the content detection has to be
0104     // used it may report that.  So a web archive file must have the correct
0105     // file extension.
0106     QMimeType mimeType = db.mimeTypeForFile(path, QMimeDatabase::MatchExtension);
0107 
0108     qCDebug(WEBARCHIVERPLUGIN_LOG) << "path" << path;
0109     qCDebug(WEBARCHIVERPLUGIN_LOG) << "wh" << width << height << "mime" << mimeType.name();
0110 
0111     // We are using QWebEngine here directly, not the WebEnginePart KPart.
0112     // This means that it will only be able to use the network access methods
0113     // that it supports internally, effectively 'file' and 'http(s)'.  In particular
0114     // it does not support any other KIO protocols, including 'tar' which would
0115     // be needed to look into web archives.  The WebEnginePart interfaces QWebEngine
0116     // to KIO.
0117     //
0118     // One option would be to do the same, i.e. to implement a network access handler
0119     // or a URL scheme handler that forwards requests to KIO.  However, the random
0120     // and possible repeated access to the page elements required would mean lots
0121     // of seeking around in the compressed web archive file.  Therefore, the web
0122     // archive is first extracted into a temporary directory and then QWebEngine
0123     // is told to render that.
0124 
0125     QString indexFile = path;               // the main page to render
0126 
0127     if (mimeType.inherits("application/x-webarchive"))  // archive needs to be extracted?
0128     {
0129         KTar tar(path);                 // auto-detects compression type
0130         tar.open(QIODevice::ReadOnly);
0131         const KArchiveDirectory *archiveDir = tar.directory();
0132 
0133         m_tempDir = new QTemporaryDir;
0134         const QString tempPath = m_tempDir->path();
0135         if (path.isEmpty())
0136         {
0137             qCWarning(WEBARCHIVERPLUGIN_LOG) << "Cannot create temporary directory";
0138             return (KIO::ThumbnailResult::fail());
0139         }
0140 
0141         qCDebug(WEBARCHIVERPLUGIN_LOG) << "extracting to tempPath" << tempPath;
0142         archiveDir->copyTo(tempPath, true);     // recursive extract from archive
0143         tar.close();                    // finished with archive file
0144 
0145         const QDir tempDir(tempPath);
0146         const QStringList entries = tempDir.entryList(QDir::Files|QDir::QDir::NoDotAndDotDot);
0147         qCDebug(WEBARCHIVERPLUGIN_LOG) << "found" << entries.count() << "entries";
0148 
0149         QString indexHtml;
0150         for (const QString &name : entries)
0151         {
0152             // Look though the extracted archive files to try to identify the
0153             // HTML page is to be rendered.  If "index.html" or "index.htm" is
0154             // found, that file is used;  otherwise, the first HTML file that
0155             // was found is used.
0156             const QMimeType mime = db.mimeTypeForFile(tempDir.absoluteFilePath(name), QMimeDatabase::MatchExtension);
0157             if (mime.inherits("text/html"))
0158             {
0159                 if (name.startsWith("index.", Qt::CaseInsensitive))
0160                 {                   // the index HTML file
0161                     indexHtml = name;
0162                     break;              // no need to look further
0163                 }
0164                 else if (indexHtml.isEmpty())       // any other HTML file
0165                 {
0166                     indexHtml = name;
0167                 }
0168             }
0169         }
0170 
0171         if (indexHtml.isEmpty())
0172         {
0173             qCWarning(WEBARCHIVERPLUGIN_LOG) << "No HTML file found in archive";
0174             return (KIO::ThumbnailResult::fail());
0175         }
0176 
0177         qCDebug(WEBARCHIVERPLUGIN_LOG) << "identified index file" << indexHtml;
0178         indexFile = tempPath+'/'+indexHtml;
0179     }
0180 
0181     const QUrl indexUrl = QUrl::fromLocalFile(indexFile);
0182     qCDebug(WEBARCHIVERPLUGIN_LOG) << "indexUrl" << indexUrl;
0183 
0184 #ifdef THUMBNAIL_USE_WEBKIT
0185     QWebView view;
0186     connect(&view, &QWebView::loadFinished, this, &WebArchiveCreator::slotLoadFinished);
0187 
0188     QWebSettings *settings = view.settings();
0189     settings->setThirdPartyCookiePolicy(QWebSettings::AlwaysBlockThirdPartyCookies);
0190     settings->setAttribute(QWebSettings::LocalContentCanAccessRemoteUrls, false);
0191     settings->setAttribute(QWebSettings::LocalContentCanAccessFileUrls, true);
0192     settings->setAttribute(QWebSettings::ZoomTextOnly, false);
0193     settings->setAttribute(QWebSettings::PrivateBrowsingEnabled, true);
0194     settings->setAttribute(QWebSettings::NotificationsEnabled, false);
0195     settings->setAttribute(QWebSettings::JavascriptEnabled, false);
0196     settings->setAttribute(QWebSettings::JavaEnabled, false);
0197     settings->setAttribute(QWebSettings::LocalStorageEnabled, false);
0198     settings->setAttribute(QWebSettings::LocalContentCanAccessRemoteUrls, false);
0199     settings->setAttribute(QWebSettings::PluginsEnabled, false);
0200     settings->setAttribute(QWebSettings::AllowRunningInsecureContent, false);
0201     settings->setAttribute(QWebSettings::PrintElementBackgrounds, true);
0202     settings->setAttribute(QWebSettings::PrivateBrowsingEnabled, true);
0203 
0204     QWebPage *page = view.page();
0205     auto *cookieJar = new WebArchiveCreatorCookieJar;
0206     page->networkAccessManager()->setCookieJar(cookieJar);
0207 #else // THUMBNAIL_USE_WEBKIT
0208     QWebEngineView view;
0209     connect(&view, &QWebEngineView::loadFinished, this, &WebArchiveCreator::slotLoadFinished);
0210 
0211     QWebEngineSettings *settings = view.settings();
0212     settings->setUnknownUrlSchemePolicy(QWebEngineSettings::DisallowUnknownUrlSchemes);
0213     settings->setAttribute(QWebEngineSettings::JavascriptEnabled, false);
0214     settings->setAttribute(QWebEngineSettings::LocalStorageEnabled, false);
0215     settings->setAttribute(QWebEngineSettings::LocalContentCanAccessRemoteUrls, false);
0216     settings->setAttribute(QWebEngineSettings::PluginsEnabled, false);
0217     settings->setAttribute(QWebEngineSettings::AutoLoadIconsForPage, false);
0218     settings->setAttribute(QWebEngineSettings::AllowRunningInsecureContent, false);
0219     settings->setAttribute(QWebEngineSettings::ShowScrollBars, false);
0220     settings->setAttribute(QWebEngineSettings::PdfViewerEnabled, false);
0221     settings->setAttribute(QWebEngineSettings::PrintElementBackgrounds, true);
0222 
0223     QWebEnginePage *page = view.page();
0224     QWebEngineProfile *profile = page->profile();
0225     profile->setPersistentCookiesPolicy(QWebEngineProfile::NoPersistentCookies);
0226     profile->setSpellCheckEnabled(false);
0227     profile->cookieStore()->setCookieFilter(&disallowWebEngineCookies);
0228 #endif // THUMBNAIL_USE_WEBKIT
0229 
0230     view.resize(c_pixmapSize);
0231     view.setZoomFactor(c_renderScale);              // 0.25 is the minimum allowed
0232 
0233     m_error = false;
0234     m_rendered = false;
0235 
0236     view.load(indexUrl);
0237 #ifndef SHOW_RENDER_WINDOW
0238     view.setAttribute(Qt::WA_ShowWithoutActivating);
0239     view.setAttribute(Qt::WA_OutsideWSRange);
0240     view.setWindowFlags(view.windowFlags()|Qt::BypassWindowManagerHint|Qt::FramelessWindowHint);
0241     view.move(5000, 5000);
0242 #endif
0243     view.show();
0244 
0245     QTimer::singleShot(c_completionTimeout, this, &WebArchiveCreator::slotProcessingTimeout);
0246     while (!m_error && !m_rendered) qApp->processEvents(QEventLoop::WaitForMoreEvents);
0247     qCDebug(WEBARCHIVERPLUGIN_LOG) << "finished loop error?" << m_error;
0248     if (m_error) return (KIO::ThumbnailResult::fail());         // load error or timeout
0249 
0250     // Render the HTML page on a bigger pixmap and leave the scaling to the
0251     // caller.  Looks better than directly scaling with the QPainter (malte).
0252     QSize pixSize = c_pixmapSize;
0253     if (pixSize.width()<width || pixSize.height()<height)
0254     {                           // default size is too small
0255         if ((height*3)>(width*4)) pixSize = QSize(width, (width*4)/3);
0256         else pixSize = QSize((height*3)/4, height);
0257     }
0258 
0259     QPixmap pix(pixSize);
0260     // First fill the pixmap with a light grey background, in case the
0261     // rendered page does not completely cover it.  If there was an error
0262     // then we will already have given up above.
0263     pix.fill(QColor(245, 245, 245));
0264 
0265     view.render(&pix);                  // render the view into the pixmap
0266     view.hide();                    // finished with the view and page
0267 #ifdef THUMBNAIL_USE_WEBKIT
0268     page->setVisibilityState(QWebPage::VisibilityStateHidden);
0269 #else // THUMBNAIL_USE_WEBKIT
0270 
0271 #if QT_VERSION >= QT_VERSION_CHECK(5, 14, 0)
0272     page->setLifecycleState(QWebEnginePage::LifecycleState::Discarded);
0273 #endif // QT_VERSION
0274 #endif // THUMBNAIL_USE_WEBKIT
0275 
0276     return KIO::ThumbnailResult::pass(pix.toImage());
0277 }
0278 
0279 void WebArchiveCreator::slotLoadFinished(bool ok)
0280 {
0281     qCDebug(WEBARCHIVERPLUGIN_LOG) << "ok?" << ok;
0282     if (!ok)
0283     {
0284         // If WebKit is being used, it is possible that 'ok' can be false
0285         // here even if the page load succeeded but it could only be
0286         // partially rendered (for example, a broken image source link).
0287         // Ignore the error indication and render the page anyway.
0288 #ifndef THUMBNAIL_USE_WEBKIT
0289         m_error = true;
0290         return;
0291 #endif // THUMBNAIL_USE_WEBKIT
0292     }
0293 
0294 #ifdef THUMBNAIL_USE_WEBKIT
0295     // WebKit will have finished rendering when the loadFinished() signal has been
0296     // delivered.  Render the bitmap immediately.
0297     slotRenderTimer();
0298 #else // THUMBNAIL_USE_WEBKIT
0299     // WebEngine renders asynchronously after the loadFinished() signal has been
0300     // delivered.  It is not possible to tell when page rendering has finished, so
0301     // a timer is used and the page is assumed to be ready when it expires.
0302     QTimer::singleShot(c_renderTimeout, this, &WebArchiveCreator::slotRenderTimer);
0303 #endif // THUMBNAIL_USE_WEBKIT
0304 }
0305 
0306 
0307 void WebArchiveCreator::slotProcessingTimeout()
0308 {
0309     m_error = true;
0310 }
0311 
0312 
0313 void WebArchiveCreator::slotRenderTimer()
0314 {
0315     m_rendered = true;
0316 }
0317 
0318 
0319 #ifdef THUMBNAIL_USE_WEBKIT
0320 
0321 // WebArchiveCreatorCookieJar
0322 //
0323 // A cookie jar that ignores any cookies sent to it and never
0324 // delivers any.
0325 
0326 WebArchiveCreatorCookieJar::WebArchiveCreatorCookieJar(QObject *parent)
0327     : QNetworkCookieJar(parent)
0328 {
0329 }
0330 
0331 QList<QNetworkCookie> WebArchiveCreatorCookieJar::cookiesForUrl(const QUrl &url) const
0332 {
0333     return (QList<QNetworkCookie>());
0334 }
0335 
0336 bool WebArchiveCreatorCookieJar::insertCookie(const QNetworkCookie &cookie)
0337 {
0338     return (false);
0339 }
0340 
0341 
0342 bool WebArchiveCreatorCookieJar::setCookiesFromUrl(const QList<QNetworkCookie> &cookieList, const QUrl &url)
0343 {
0344     return (false);
0345 }
0346 
0347 #endif // THUMBNAIL_USE_WEBKIT