File indexing completed on 2024-05-05 05:00:05
0001 /* 0002 SPDX-FileCopyrightText: 2001 Malte Starostik <malte@kde.org> 0003 SPDX-FileCopyrightText: 2020 Jonathan Marten <jjm@keelhaul.me.uk> 0004 0005 SPDX-License-Identifier: GPL-2.0-or-later 0006 */ 0007 0008 #include "webarchivecreator.h" 0009 0010 #include <QDebug> 0011 #include <QPixmap> 0012 #include <QImage> 0013 #include <QApplication> 0014 #include <QUrl> 0015 #include <QTimer> 0016 #include <QMimeType> 0017 #include <QMimeDatabase> 0018 #include <QTemporaryDir> 0019 0020 #ifdef THUMBNAIL_USE_WEBKIT 0021 #include <qwebview.h> 0022 #include <qwebpage.h> 0023 #include <qwebsettings.h> 0024 #include <QNetworkCookie> 0025 #else // THUMBNAIL_USE_WEBKIT 0026 #include <QWebEngineView> 0027 #include <QWebEnginePage> 0028 #include <QWebEngineProfile> 0029 #include <QWebEngineSettings> 0030 #include <QWebEngineCookieStore> 0031 #endif // THUMBNAIL_USE_WEBKIT 0032 0033 #include <ktar.h> 0034 #include <karchivedirectory.h> 0035 0036 #include "webarchiverdebug.h" 0037 0038 0039 #undef SHOW_RENDER_WINDOW 0040 0041 //TODO KF6: remove all instances of THUMBNAIL_USE_WEBKIT 0042 0043 // This is an time limit for the entire thumbnail generation process 0044 // (page loading and rendering). If it expires then it is assumed 0045 // that there is a problem and no thumbnail is generated. 0046 static const int c_completionTimeout = 5000; 0047 0048 // After the page is loaded, the rendering happens in the background 0049 // with no way to find out when it has finished. So this timer sets a 0050 // reasonable time for that to happen, when it expires the thumbnail 0051 // image is generated. 0052 static const int c_renderTimeout = 500; 0053 0054 // The size of the pixmap onto which the rendered page is drawn, and 0055 // the rendering scale for the web page. These settings have nothing 0056 // to do with the size of the pixmap requested when create() is called, 0057 // they are chosen for a reasonable rendering of the page (which should 0058 // work at an effective width of 800 pixels). For the scale factor, 0059 // 0.25 is the minimum allowed by Qt. 0060 static const QSize c_pixmapSize = QSize(400, 600); 0061 static const double c_renderScale = 0.5; 0062 0063 0064 extern "C" 0065 { 0066 Q_DECL_EXPORT KIO::ThumbnailCreator *new_creator() 0067 { 0068 return (new WebArchiveCreator{nullptr, {}}); 0069 } 0070 } 0071 0072 WebArchiveCreator::WebArchiveCreator(QObject *parent, const QVariantList &va) 0073 : KIO::ThumbnailCreator(parent, va) 0074 { 0075 m_tempDir = nullptr; 0076 } 0077 0078 0079 WebArchiveCreator::~WebArchiveCreator() 0080 { 0081 delete m_tempDir; 0082 } 0083 0084 0085 #ifndef THUMBNAIL_USE_WEBKIT 0086 static bool disallowWebEngineCookies(const QWebEngineCookieStore::FilterRequest &req) 0087 { 0088 return (false); 0089 } 0090 #endif // THUMBNAIL_USE_WEBKIT 0091 0092 KIO::ThumbnailResult WebArchiveCreator::create(const KIO::ThumbnailRequest& request) 0093 { 0094 // QImage img; 0095 // bool success = create(request.url().path(), request.targetSize().width(), request.targetSize().height(), img); 0096 // return success ? KIO::ThumbnailResult::pass(img) : KIO::ThumbnailResult::fail(); 0097 QString path = request.url().path(); 0098 int width = request.targetSize().width(); 0099 int height = request.targetSize().height(); 0100 0101 QMimeDatabase db; 0102 // Only use the file path to look up its MIME type. Web archives are 0103 // gzip-compressed tar files, so if the content detection has to be 0104 // used it may report that. So a web archive file must have the correct 0105 // file extension. 0106 QMimeType mimeType = db.mimeTypeForFile(path, QMimeDatabase::MatchExtension); 0107 0108 qCDebug(WEBARCHIVERPLUGIN_LOG) << "path" << path; 0109 qCDebug(WEBARCHIVERPLUGIN_LOG) << "wh" << width << height << "mime" << mimeType.name(); 0110 0111 // We are using QWebEngine here directly, not the WebEnginePart KPart. 0112 // This means that it will only be able to use the network access methods 0113 // that it supports internally, effectively 'file' and 'http(s)'. In particular 0114 // it does not support any other KIO protocols, including 'tar' which would 0115 // be needed to look into web archives. The WebEnginePart interfaces QWebEngine 0116 // to KIO. 0117 // 0118 // One option would be to do the same, i.e. to implement a network access handler 0119 // or a URL scheme handler that forwards requests to KIO. However, the random 0120 // and possible repeated access to the page elements required would mean lots 0121 // of seeking around in the compressed web archive file. Therefore, the web 0122 // archive is first extracted into a temporary directory and then QWebEngine 0123 // is told to render that. 0124 0125 QString indexFile = path; // the main page to render 0126 0127 if (mimeType.inherits("application/x-webarchive")) // archive needs to be extracted? 0128 { 0129 KTar tar(path); // auto-detects compression type 0130 tar.open(QIODevice::ReadOnly); 0131 const KArchiveDirectory *archiveDir = tar.directory(); 0132 0133 m_tempDir = new QTemporaryDir; 0134 const QString tempPath = m_tempDir->path(); 0135 if (path.isEmpty()) 0136 { 0137 qCWarning(WEBARCHIVERPLUGIN_LOG) << "Cannot create temporary directory"; 0138 return (KIO::ThumbnailResult::fail()); 0139 } 0140 0141 qCDebug(WEBARCHIVERPLUGIN_LOG) << "extracting to tempPath" << tempPath; 0142 archiveDir->copyTo(tempPath, true); // recursive extract from archive 0143 tar.close(); // finished with archive file 0144 0145 const QDir tempDir(tempPath); 0146 const QStringList entries = tempDir.entryList(QDir::Files|QDir::QDir::NoDotAndDotDot); 0147 qCDebug(WEBARCHIVERPLUGIN_LOG) << "found" << entries.count() << "entries"; 0148 0149 QString indexHtml; 0150 for (const QString &name : entries) 0151 { 0152 // Look though the extracted archive files to try to identify the 0153 // HTML page is to be rendered. If "index.html" or "index.htm" is 0154 // found, that file is used; otherwise, the first HTML file that 0155 // was found is used. 0156 const QMimeType mime = db.mimeTypeForFile(tempDir.absoluteFilePath(name), QMimeDatabase::MatchExtension); 0157 if (mime.inherits("text/html")) 0158 { 0159 if (name.startsWith("index.", Qt::CaseInsensitive)) 0160 { // the index HTML file 0161 indexHtml = name; 0162 break; // no need to look further 0163 } 0164 else if (indexHtml.isEmpty()) // any other HTML file 0165 { 0166 indexHtml = name; 0167 } 0168 } 0169 } 0170 0171 if (indexHtml.isEmpty()) 0172 { 0173 qCWarning(WEBARCHIVERPLUGIN_LOG) << "No HTML file found in archive"; 0174 return (KIO::ThumbnailResult::fail()); 0175 } 0176 0177 qCDebug(WEBARCHIVERPLUGIN_LOG) << "identified index file" << indexHtml; 0178 indexFile = tempPath+'/'+indexHtml; 0179 } 0180 0181 const QUrl indexUrl = QUrl::fromLocalFile(indexFile); 0182 qCDebug(WEBARCHIVERPLUGIN_LOG) << "indexUrl" << indexUrl; 0183 0184 #ifdef THUMBNAIL_USE_WEBKIT 0185 QWebView view; 0186 connect(&view, &QWebView::loadFinished, this, &WebArchiveCreator::slotLoadFinished); 0187 0188 QWebSettings *settings = view.settings(); 0189 settings->setThirdPartyCookiePolicy(QWebSettings::AlwaysBlockThirdPartyCookies); 0190 settings->setAttribute(QWebSettings::LocalContentCanAccessRemoteUrls, false); 0191 settings->setAttribute(QWebSettings::LocalContentCanAccessFileUrls, true); 0192 settings->setAttribute(QWebSettings::ZoomTextOnly, false); 0193 settings->setAttribute(QWebSettings::PrivateBrowsingEnabled, true); 0194 settings->setAttribute(QWebSettings::NotificationsEnabled, false); 0195 settings->setAttribute(QWebSettings::JavascriptEnabled, false); 0196 settings->setAttribute(QWebSettings::JavaEnabled, false); 0197 settings->setAttribute(QWebSettings::LocalStorageEnabled, false); 0198 settings->setAttribute(QWebSettings::LocalContentCanAccessRemoteUrls, false); 0199 settings->setAttribute(QWebSettings::PluginsEnabled, false); 0200 settings->setAttribute(QWebSettings::AllowRunningInsecureContent, false); 0201 settings->setAttribute(QWebSettings::PrintElementBackgrounds, true); 0202 settings->setAttribute(QWebSettings::PrivateBrowsingEnabled, true); 0203 0204 QWebPage *page = view.page(); 0205 auto *cookieJar = new WebArchiveCreatorCookieJar; 0206 page->networkAccessManager()->setCookieJar(cookieJar); 0207 #else // THUMBNAIL_USE_WEBKIT 0208 QWebEngineView view; 0209 connect(&view, &QWebEngineView::loadFinished, this, &WebArchiveCreator::slotLoadFinished); 0210 0211 QWebEngineSettings *settings = view.settings(); 0212 settings->setUnknownUrlSchemePolicy(QWebEngineSettings::DisallowUnknownUrlSchemes); 0213 settings->setAttribute(QWebEngineSettings::JavascriptEnabled, false); 0214 settings->setAttribute(QWebEngineSettings::LocalStorageEnabled, false); 0215 settings->setAttribute(QWebEngineSettings::LocalContentCanAccessRemoteUrls, false); 0216 settings->setAttribute(QWebEngineSettings::PluginsEnabled, false); 0217 settings->setAttribute(QWebEngineSettings::AutoLoadIconsForPage, false); 0218 settings->setAttribute(QWebEngineSettings::AllowRunningInsecureContent, false); 0219 settings->setAttribute(QWebEngineSettings::ShowScrollBars, false); 0220 settings->setAttribute(QWebEngineSettings::PdfViewerEnabled, false); 0221 settings->setAttribute(QWebEngineSettings::PrintElementBackgrounds, true); 0222 0223 QWebEnginePage *page = view.page(); 0224 QWebEngineProfile *profile = page->profile(); 0225 profile->setPersistentCookiesPolicy(QWebEngineProfile::NoPersistentCookies); 0226 profile->setSpellCheckEnabled(false); 0227 profile->cookieStore()->setCookieFilter(&disallowWebEngineCookies); 0228 #endif // THUMBNAIL_USE_WEBKIT 0229 0230 view.resize(c_pixmapSize); 0231 view.setZoomFactor(c_renderScale); // 0.25 is the minimum allowed 0232 0233 m_error = false; 0234 m_rendered = false; 0235 0236 view.load(indexUrl); 0237 #ifndef SHOW_RENDER_WINDOW 0238 view.setAttribute(Qt::WA_ShowWithoutActivating); 0239 view.setAttribute(Qt::WA_OutsideWSRange); 0240 view.setWindowFlags(view.windowFlags()|Qt::BypassWindowManagerHint|Qt::FramelessWindowHint); 0241 view.move(5000, 5000); 0242 #endif 0243 view.show(); 0244 0245 QTimer::singleShot(c_completionTimeout, this, &WebArchiveCreator::slotProcessingTimeout); 0246 while (!m_error && !m_rendered) qApp->processEvents(QEventLoop::WaitForMoreEvents); 0247 qCDebug(WEBARCHIVERPLUGIN_LOG) << "finished loop error?" << m_error; 0248 if (m_error) return (KIO::ThumbnailResult::fail()); // load error or timeout 0249 0250 // Render the HTML page on a bigger pixmap and leave the scaling to the 0251 // caller. Looks better than directly scaling with the QPainter (malte). 0252 QSize pixSize = c_pixmapSize; 0253 if (pixSize.width()<width || pixSize.height()<height) 0254 { // default size is too small 0255 if ((height*3)>(width*4)) pixSize = QSize(width, (width*4)/3); 0256 else pixSize = QSize((height*3)/4, height); 0257 } 0258 0259 QPixmap pix(pixSize); 0260 // First fill the pixmap with a light grey background, in case the 0261 // rendered page does not completely cover it. If there was an error 0262 // then we will already have given up above. 0263 pix.fill(QColor(245, 245, 245)); 0264 0265 view.render(&pix); // render the view into the pixmap 0266 view.hide(); // finished with the view and page 0267 #ifdef THUMBNAIL_USE_WEBKIT 0268 page->setVisibilityState(QWebPage::VisibilityStateHidden); 0269 #else // THUMBNAIL_USE_WEBKIT 0270 0271 #if QT_VERSION >= QT_VERSION_CHECK(5, 14, 0) 0272 page->setLifecycleState(QWebEnginePage::LifecycleState::Discarded); 0273 #endif // QT_VERSION 0274 #endif // THUMBNAIL_USE_WEBKIT 0275 0276 return KIO::ThumbnailResult::pass(pix.toImage()); 0277 } 0278 0279 void WebArchiveCreator::slotLoadFinished(bool ok) 0280 { 0281 qCDebug(WEBARCHIVERPLUGIN_LOG) << "ok?" << ok; 0282 if (!ok) 0283 { 0284 // If WebKit is being used, it is possible that 'ok' can be false 0285 // here even if the page load succeeded but it could only be 0286 // partially rendered (for example, a broken image source link). 0287 // Ignore the error indication and render the page anyway. 0288 #ifndef THUMBNAIL_USE_WEBKIT 0289 m_error = true; 0290 return; 0291 #endif // THUMBNAIL_USE_WEBKIT 0292 } 0293 0294 #ifdef THUMBNAIL_USE_WEBKIT 0295 // WebKit will have finished rendering when the loadFinished() signal has been 0296 // delivered. Render the bitmap immediately. 0297 slotRenderTimer(); 0298 #else // THUMBNAIL_USE_WEBKIT 0299 // WebEngine renders asynchronously after the loadFinished() signal has been 0300 // delivered. It is not possible to tell when page rendering has finished, so 0301 // a timer is used and the page is assumed to be ready when it expires. 0302 QTimer::singleShot(c_renderTimeout, this, &WebArchiveCreator::slotRenderTimer); 0303 #endif // THUMBNAIL_USE_WEBKIT 0304 } 0305 0306 0307 void WebArchiveCreator::slotProcessingTimeout() 0308 { 0309 m_error = true; 0310 } 0311 0312 0313 void WebArchiveCreator::slotRenderTimer() 0314 { 0315 m_rendered = true; 0316 } 0317 0318 0319 #ifdef THUMBNAIL_USE_WEBKIT 0320 0321 // WebArchiveCreatorCookieJar 0322 // 0323 // A cookie jar that ignores any cookies sent to it and never 0324 // delivers any. 0325 0326 WebArchiveCreatorCookieJar::WebArchiveCreatorCookieJar(QObject *parent) 0327 : QNetworkCookieJar(parent) 0328 { 0329 } 0330 0331 QList<QNetworkCookie> WebArchiveCreatorCookieJar::cookiesForUrl(const QUrl &url) const 0332 { 0333 return (QList<QNetworkCookie>()); 0334 } 0335 0336 bool WebArchiveCreatorCookieJar::insertCookie(const QNetworkCookie &cookie) 0337 { 0338 return (false); 0339 } 0340 0341 0342 bool WebArchiveCreatorCookieJar::setCookiesFromUrl(const QList<QNetworkCookie> &cookieList, const QUrl &url) 0343 { 0344 return (false); 0345 } 0346 0347 #endif // THUMBNAIL_USE_WEBKIT