File indexing completed on 2024-12-29 04:51:00

0001 /*
0002    SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
0003 
0004    SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006 
0007 #include "htmldocumentprocessor.h"
0008 
0009 #include "genericpriceextractorhelper_p.h"
0010 #include "logging.h"
0011 #include "stringutil.h"
0012 #include "json/jsonld.h"
0013 
0014 #include <KItinerary/ExtractorDocumentNodeFactory>
0015 #include <KItinerary/ExtractorEngine>
0016 #include <KItinerary/ExtractorResult>
0017 #include <KItinerary/HtmlDocument>
0018 #include <KItinerary/JsonLdDocument>
0019 
0020 #include <QJsonArray>
0021 #include <QJsonDocument>
0022 #include <QJsonObject>
0023 #include <QJSEngine>
0024 #include <QJSValue>
0025 #include <QString>
0026 #include <QUrl>
0027 
0028 #include <cmath>
0029 
0030 using namespace KItinerary;
0031 
// Declare the owning HtmlDocument pointer wrapper as a Qt meta type;
// nodeFromHtml() below stores document node content using exactly this type.
Q_DECLARE_METATYPE(KItinerary::Internal::OwnedPtr<KItinerary::HtmlDocument>)
0033 
0034 bool HtmlDocumentProcessor::canHandleData(const QByteArray &encodedData, QStringView fileName) const
0035 {
0036   return StringUtil::startsWithIgnoreSpace(encodedData, "<") ||
0037          fileName.endsWith(QLatin1StringView(".html"), Qt::CaseInsensitive) ||
0038          fileName.endsWith(QLatin1StringView(".htm"), Qt::CaseInsensitive);
0039 }
0040 
0041 static ExtractorDocumentNode nodeFromHtml(HtmlDocument *html)
0042 {
0043     if (!html || html->root().firstChild().isNull()) {
0044         return {};
0045     }
0046 
0047     ExtractorDocumentNode node;
0048     node.setContent<Internal::OwnedPtr<HtmlDocument>>(html);
0049     return node;
0050 }
0051 
0052 ExtractorDocumentNode HtmlDocumentProcessor::createNodeFromData(const QByteArray &encodedData) const
0053 {
0054     return nodeFromHtml(HtmlDocument::fromData(encodedData));
0055 }
0056 
0057 ExtractorDocumentNode HtmlDocumentProcessor::createNodeFromContent(const QVariant &decodedData) const
0058 {
0059     if (decodedData.userType() == QMetaType::QString) {
0060         return nodeFromHtml(HtmlDocument::fromString(decodedData.toString()));
0061     }
0062     return ExtractorDocumentProcessor::createNodeFromContent(decodedData);
0063 }
0064 
0065 void HtmlDocumentProcessor::expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const
0066 {
0067     const auto html = node.content<HtmlDocument*>();
0068 
0069     // inline images
0070     expandElementRecursive(node, html->root(), engine);
0071 
0072     // plain text fallback node
0073     auto fallback = engine->documentNodeFactory()->createNode(html->root().recursiveContent(), u"text/plain");
0074     node.appendChild(fallback);
0075 }
0076 
0077 static bool isJsonLdTag(const HtmlElement &elem)
0078 {
0079   return elem.name() == QLatin1StringView("script") &&
0080          elem.attribute(QStringLiteral("type")) ==
0081              QLatin1String("application/ld+json");
0082 }
0083 
0084 static QByteArray fixupJson(const QByteArray &data)
0085 {
0086     if (data.isEmpty()) {
0087         return {};
0088     }
0089     auto output(data);
0090 
0091     // Eurowings doesn't put a comma between objects in top-level arrays...
0092     output.replace("}{", "},{");
0093 
0094     // Volotea doesn't put square brackets in top level arrays...
0095     if (output.front() != '[' && output.back() != ']') {
0096         output.prepend("[");
0097         output.append("]");
0098     }
0099 
0100     // Eventbrite adds commas where there shouldn't be one...
0101     for (qsizetype idx = output.indexOf("\",\n"); idx > 0 && idx + 3 < output.size(); idx = output.indexOf("\",\n", idx)) {
0102         const auto comma = idx + 1;
0103         idx += 3;
0104         while (idx < output.size() && std::isspace(static_cast<unsigned char>(output[idx]))) {
0105             ++idx;
0106         }
0107         if (idx < output.size() && output[idx] == '}') {
0108             output[comma] = ' ';
0109         }
0110     }
0111 
0112     // Airbnb applies XML entity encoding...
0113     output.replace("&quot;", "\"");
0114 
0115     return output;
0116 }
0117 
0118 static void parseJson(const QByteArray &data, QJsonArray &result)
0119 {
0120     QJsonParseError error;
0121     auto jsonDoc = QJsonDocument::fromJson(data, &error);
0122     if (jsonDoc.isNull()) {
0123         if (error.error != QJsonParseError::NoError) {
0124             // try to fix up common JSON encoding errors
0125             jsonDoc = QJsonDocument::fromJson(fixupJson(data));
0126         }
0127         if (jsonDoc.isNull()) {
0128             qCDebug(Log).noquote() << data;
0129             qCDebug(Log) << error.errorString() << "at offset" << error.offset;
0130             return;
0131         }
0132     }
0133     if (jsonDoc.isArray()) {
0134         const auto jsonArray = jsonDoc.array();
0135         std::copy(jsonArray.begin(), jsonArray.end(), std::back_inserter(result));
0136     } else if (jsonDoc.isObject()) {
0137         result.push_back(jsonDoc.object());
0138     }
0139 }
0140 
0141 static QString valueForItemProperty(const HtmlElement &elem)
0142 {
0143     // TODO see https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/itemprop#Values
0144     const auto elemName = elem.name();
0145     QString v;
0146     if (elemName == QLatin1StringView("meta")) {
0147       v = elem.attribute(QStringLiteral("content"));
0148     } else if (elemName == QLatin1StringView("time")) {
0149       v = elem.attribute(QStringLiteral("datetime"));
0150     } else if (elemName == QLatin1StringView("link") ||
0151                elemName == QLatin1Char('a') ||
0152                elemName == QLatin1String("img")) {
0153       if (elem.hasAttribute(QStringLiteral("href"))) {
0154         v = elem.attribute(QStringLiteral("href"));
0155       } else if (elem.hasAttribute(QStringLiteral("content"))) {
0156         v = elem.attribute(QStringLiteral("content"));
0157       } else if (elem.hasAttribute(QStringLiteral("src"))) {
0158         v = elem.attribute(QStringLiteral("src"));
0159       } else {
0160         v = elem.recursiveContent();
0161       }
0162     } else {
0163       v = elem.recursiveContent();
0164     }
0165 
0166     return v;
0167 }
0168 
0169 static void insertProperties(QJsonObject &obj, const QString &prop, const QJsonValue &val)
0170 {
0171     // multiple properties can be specified at once, as a space-separated list
0172     const auto props = prop.split(QLatin1Char(' '), Qt::SkipEmptyParts);
0173     for (const auto &p : props) {
0174         auto valRef = obj[p];
0175         if (valRef.isUndefined() || valRef.isNull()) {
0176             obj.insert(p, val);
0177         // convert multiple repeated properties into an array
0178         } else if (valRef.isArray()) {
0179             auto array = valRef.toArray();
0180             array.push_back(val);
0181             valRef = array;
0182         } else {
0183             QJsonArray array({valRef, val});
0184             valRef = array;
0185         }
0186     }
0187 }
0188 
0189 static void parseMicroData(const HtmlElement &elem, QJsonObject &obj, QJsonArray &result)
0190 {
0191     auto child = elem.firstChild();
0192     while (!child.isNull()) {
0193         const auto prop = child.attribute(QStringLiteral("itemprop"));
0194         const auto type = child.attribute(QStringLiteral("itemtype"));
0195         if (JsonLd::isSchemaOrgNamespace(type)) {
0196             QJsonObject subObj;
0197             parseMicroData(child, subObj, result);
0198             const QUrl typeUrl(type);
0199             subObj.insert(QStringLiteral("@type"), typeUrl.fileName());
0200             if (prop.isEmpty()) {
0201                 result.push_back(subObj); // stand-alone object that just happens to be nested
0202             } else {
0203                 insertProperties(obj, prop, subObj);
0204             }
0205         } else if (!prop.isEmpty()) {
0206             insertProperties(obj, prop, valueForItemProperty(child));
0207         // Maybe there is more JSON-LD inside this microdata tree
0208         } else if (isJsonLdTag(child)) {
0209             parseJson(child.content().toUtf8(), result);
0210         } else {
0211             // skip intermediate nodes without Microdata annotations
0212             parseMicroData(child, obj, result);
0213         }
0214         child = child.nextSibling();
0215     }
0216 }
0217 
0218 static void extractRecursive(const HtmlElement &elem, QJsonArray &result)
0219 {
0220     // JSON-LD
0221     if (isJsonLdTag(elem)) {
0222         parseJson(elem.content().toUtf8(), result);
0223         return;
0224     }
0225 
0226     // Microdata
0227     const auto itemType = elem.attribute(QStringLiteral("itemtype"));
0228     if (JsonLd::isSchemaOrgNamespace(itemType)) {
0229         QJsonObject obj;
0230         parseMicroData(elem, obj, result);
0231         if (obj.isEmpty()) {
0232             return;
0233         }
0234 
0235         const QUrl typeUrl(itemType);
0236         obj.insert(QStringLiteral("@type"), typeUrl.fileName());
0237 
0238         const auto itemProp = elem.attribute(QStringLiteral("itemprop"));
0239         if (!itemProp.isEmpty() && !result.isEmpty()) {
0240             // this is likely a child of our preceding sibling, but broken XML put it here
0241             auto parent = result.last().toObject();
0242             parent.insert(itemProp, obj);
0243             result[result.size() - 1] = parent;
0244         } else {
0245             obj.insert(QStringLiteral("@context"), QStringLiteral("http://schema.org"));
0246             result.push_back(obj);
0247         }
0248         return;
0249     }
0250 
0251     // recurse otherwise
0252     auto child = elem.firstChild();
0253     while (!child.isNull()) {
0254         extractRecursive(child, result);
0255         child = child.nextSibling();
0256     }
0257 }
0258 
0259 void HtmlDocumentProcessor::preExtract(ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const
0260 {
0261     auto doc = node.content<HtmlDocument*>();
0262     Q_ASSERT(doc);
0263 
0264     if (!doc->root().isNull()) {
0265         QJsonArray result;
0266         extractRecursive(doc->root(), result);
0267         node.addResult(result);
0268     }
0269 }
0270 
0271 void HtmlDocumentProcessor::postExtract(ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const
0272 {
0273     if (node.childNodes().empty() || node.result().isEmpty()) {
0274         return;
0275     }
0276 
0277     const QString text = node.childNodes().back().content<QString>();
0278     GenericPriceExtractorHelper::postExtract(text, node);
0279 }
0280 
0281 QJSValue HtmlDocumentProcessor::contentToScriptValue(const ExtractorDocumentNode &node, QJSEngine *engine) const
0282 {
0283     return engine->toScriptValue(node.content<HtmlDocument*>());
0284 }
0285 
// Node cleanup hook: delegates to destroyIfOwned() for the HtmlDocument content type.
// NOTE(review): presumably this deletes the document only when the node holds it as
// an owned pointer (see nodeFromHtml()) - confirm against destroyIfOwned().
void HtmlDocumentProcessor::destroyNode(ExtractorDocumentNode &node) const
{
    destroyIfOwned<HtmlDocument>(node);
}
0290 
0291 void HtmlDocumentProcessor::expandElementRecursive(ExtractorDocumentNode &node, const HtmlElement &elem, const ExtractorEngine *engine) const
0292 {
0293   if (elem.name() == QLatin1StringView("img")) {
0294     const auto src = elem.attribute(QLatin1StringView("src"));
0295     if (src.startsWith(QLatin1StringView("data:"))) {
0296       expandDataUrl(node, src, engine);
0297     }
0298   }
0299 
0300     auto child = elem.firstChild();
0301     while (!child.isNull()) {
0302         expandElementRecursive(node, child, engine);
0303         child = child.nextSibling();
0304     }
0305 }
0306 
0307 void HtmlDocumentProcessor::expandDataUrl(ExtractorDocumentNode &node, QStringView data, const ExtractorEngine *engine) const
0308 {
0309     const auto idx = data.indexOf(QLatin1Char(','));
0310     if (idx < 0) {
0311         return;
0312     }
0313     const auto header = data.mid(5, idx - 5);
0314     const auto headerItems = header.split(QLatin1Char(';'));
0315     if (headerItems.isEmpty()) {
0316         return;
0317     }
0318 
0319     if (headerItems.front() != QLatin1StringView("image/png")) {
0320       return;
0321     }
0322 
0323     auto imgData = data.mid(idx).toUtf8();
0324     if (headerItems.back() == QLatin1StringView("base64")) {
0325       imgData = QByteArray::fromBase64(imgData.trimmed());
0326     }
0327 
0328     auto child = engine->documentNodeFactory()->createNode(imgData, {}, headerItems.front());
0329     node.appendChild(child);
0330 }