File indexing completed on 2024-12-29 04:51:00
0001 /* 0002 SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org> 0003 0004 SPDX-License-Identifier: LGPL-2.0-or-later 0005 */ 0006 0007 #include "htmldocumentprocessor.h" 0008 0009 #include "genericpriceextractorhelper_p.h" 0010 #include "logging.h" 0011 #include "stringutil.h" 0012 #include "json/jsonld.h" 0013 0014 #include <KItinerary/ExtractorDocumentNodeFactory> 0015 #include <KItinerary/ExtractorEngine> 0016 #include <KItinerary/ExtractorResult> 0017 #include <KItinerary/HtmlDocument> 0018 #include <KItinerary/JsonLdDocument> 0019 0020 #include <QJsonArray> 0021 #include <QJsonDocument> 0022 #include <QJsonObject> 0023 #include <QJSEngine> 0024 #include <QJSValue> 0025 #include <QString> 0026 #include <QUrl> 0027 0028 #include <cmath> 0029 0030 using namespace KItinerary; 0031 0032 Q_DECLARE_METATYPE(KItinerary::Internal::OwnedPtr<KItinerary::HtmlDocument>) 0033 0034 bool HtmlDocumentProcessor::canHandleData(const QByteArray &encodedData, QStringView fileName) const 0035 { 0036 return StringUtil::startsWithIgnoreSpace(encodedData, "<") || 0037 fileName.endsWith(QLatin1StringView(".html"), Qt::CaseInsensitive) || 0038 fileName.endsWith(QLatin1StringView(".htm"), Qt::CaseInsensitive); 0039 } 0040 0041 static ExtractorDocumentNode nodeFromHtml(HtmlDocument *html) 0042 { 0043 if (!html || html->root().firstChild().isNull()) { 0044 return {}; 0045 } 0046 0047 ExtractorDocumentNode node; 0048 node.setContent<Internal::OwnedPtr<HtmlDocument>>(html); 0049 return node; 0050 } 0051 0052 ExtractorDocumentNode HtmlDocumentProcessor::createNodeFromData(const QByteArray &encodedData) const 0053 { 0054 return nodeFromHtml(HtmlDocument::fromData(encodedData)); 0055 } 0056 0057 ExtractorDocumentNode HtmlDocumentProcessor::createNodeFromContent(const QVariant &decodedData) const 0058 { 0059 if (decodedData.userType() == QMetaType::QString) { 0060 return nodeFromHtml(HtmlDocument::fromString(decodedData.toString())); 0061 } 0062 return ExtractorDocumentProcessor::createNodeFromContent(decodedData); 0063 } 0064 0065 void HtmlDocumentProcessor::expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const 0066 { 0067 const auto html = node.content<HtmlDocument*>(); 0068 0069 // inline images 0070 expandElementRecursive(node, html->root(), engine); 0071 0072 // plain text fallback node 0073 auto fallback = engine->documentNodeFactory()->createNode(html->root().recursiveContent(), u"text/plain"); 0074 node.appendChild(fallback); 0075 } 0076 0077 static bool isJsonLdTag(const HtmlElement &elem) 0078 { 0079 return elem.name() == QLatin1StringView("script") && 0080 elem.attribute(QStringLiteral("type")) == 0081 QLatin1String("application/ld+json"); 0082 } 0083 0084 static QByteArray fixupJson(const QByteArray &data) 0085 { 0086 if (data.isEmpty()) { 0087 return {}; 0088 } 0089 auto output(data); 0090 0091 // Eurowings doesn't put a comma between objects in top-level arrays... 0092 output.replace("}{", "},{"); 0093 0094 // Volotea doesn't put square brackets in top level arrays... 0095 if (output.front() != '[' && output.back() != ']') { 0096 output.prepend("["); 0097 output.append("]"); 0098 } 0099 0100 // Eventbrite adds commas where there shouldn't be one... 0101 for (qsizetype idx = output.indexOf("\",\n"); idx > 0 && idx + 3 < output.size(); idx = output.indexOf("\",\n", idx)) { 0102 const auto comma = idx + 1; 0103 idx += 3; 0104 while (idx < output.size() && std::isspace(static_cast<unsigned char>(output[idx]))) { 0105 ++idx; 0106 } 0107 if (idx < output.size() && output[idx] == '}') { 0108 output[comma] = ' '; 0109 } 0110 } 0111 0112 // Airbnb applies XML entity encoding... 0113 output.replace(""", "\""); 0114 0115 return output; 0116 } 0117 0118 static void parseJson(const QByteArray &data, QJsonArray &result) 0119 { 0120 QJsonParseError error; 0121 auto jsonDoc = QJsonDocument::fromJson(data, &error); 0122 if (jsonDoc.isNull()) { 0123 if (error.error != QJsonParseError::NoError) { 0124 // try to fix up common JSON encoding errors 0125 jsonDoc = QJsonDocument::fromJson(fixupJson(data)); 0126 } 0127 if (jsonDoc.isNull()) { 0128 qCDebug(Log).noquote() << data; 0129 qCDebug(Log) << error.errorString() << "at offset" << error.offset; 0130 return; 0131 } 0132 } 0133 if (jsonDoc.isArray()) { 0134 const auto jsonArray = jsonDoc.array(); 0135 std::copy(jsonArray.begin(), jsonArray.end(), std::back_inserter(result)); 0136 } else if (jsonDoc.isObject()) { 0137 result.push_back(jsonDoc.object()); 0138 } 0139 } 0140 0141 static QString valueForItemProperty(const HtmlElement &elem) 0142 { 0143 // TODO see https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/itemprop#Values 0144 const auto elemName = elem.name(); 0145 QString v; 0146 if (elemName == QLatin1StringView("meta")) { 0147 v = elem.attribute(QStringLiteral("content")); 0148 } else if (elemName == QLatin1StringView("time")) { 0149 v = elem.attribute(QStringLiteral("datetime")); 0150 } else if (elemName == QLatin1StringView("link") || 0151 elemName == QLatin1Char('a') || 0152 elemName == QLatin1String("img")) { 0153 if (elem.hasAttribute(QStringLiteral("href"))) { 0154 v = elem.attribute(QStringLiteral("href")); 0155 } else if (elem.hasAttribute(QStringLiteral("content"))) { 0156 v = elem.attribute(QStringLiteral("content")); 0157 } else if (elem.hasAttribute(QStringLiteral("src"))) { 0158 v = elem.attribute(QStringLiteral("src")); 0159 } else { 0160 v = elem.recursiveContent(); 0161 } 0162 } else { 0163 v = elem.recursiveContent(); 0164 } 0165 0166 return v; 0167 } 0168 0169 static void insertProperties(QJsonObject &obj, const QString &prop, const QJsonValue &val) 0170 { 0171 // multiple properties can be specified at once, as a space-separated list 0172 const auto props = prop.split(QLatin1Char(' '), Qt::SkipEmptyParts); 0173 for (const auto &p : props) { 0174 auto valRef = obj[p]; 0175 if (valRef.isUndefined() || valRef.isNull()) { 0176 obj.insert(p, val); 0177 // convert multiple repeated properties into an array 0178 } else if (valRef.isArray()) { 0179 auto array = valRef.toArray(); 0180 array.push_back(val); 0181 valRef = array; 0182 } else { 0183 QJsonArray array({valRef, val}); 0184 valRef = array; 0185 } 0186 } 0187 } 0188 0189 static void parseMicroData(const HtmlElement &elem, QJsonObject &obj, QJsonArray &result) 0190 { 0191 auto child = elem.firstChild(); 0192 while (!child.isNull()) { 0193 const auto prop = child.attribute(QStringLiteral("itemprop")); 0194 const auto type = child.attribute(QStringLiteral("itemtype")); 0195 if (JsonLd::isSchemaOrgNamespace(type)) { 0196 QJsonObject subObj; 0197 parseMicroData(child, subObj, result); 0198 const QUrl typeUrl(type); 0199 subObj.insert(QStringLiteral("@type"), typeUrl.fileName()); 0200 if (prop.isEmpty()) { 0201 result.push_back(subObj); // stand-alone object that just happens to be nested 0202 } else { 0203 insertProperties(obj, prop, subObj); 0204 } 0205 } else if (!prop.isEmpty()) { 0206 insertProperties(obj, prop, valueForItemProperty(child)); 0207 // Maybe there is more JSON-LD inside this microdata tree 0208 } else if (isJsonLdTag(child)) { 0209 parseJson(child.content().toUtf8(), result); 0210 } else { 0211 // skip intermediate nodes without Microdata annotations 0212 parseMicroData(child, obj, result); 0213 } 0214 child = child.nextSibling(); 0215 } 0216 } 0217 0218 static void extractRecursive(const HtmlElement &elem, QJsonArray &result) 0219 { 0220 // JSON-LD 0221 if (isJsonLdTag(elem)) { 0222 parseJson(elem.content().toUtf8(), result); 0223 return; 0224 } 0225 0226 // Microdata 0227 const auto itemType = elem.attribute(QStringLiteral("itemtype")); 0228 if (JsonLd::isSchemaOrgNamespace(itemType)) { 0229 QJsonObject obj; 0230 parseMicroData(elem, obj, result); 0231 if (obj.isEmpty()) { 0232 return; 0233 } 0234 0235 const QUrl typeUrl(itemType); 0236 obj.insert(QStringLiteral("@type"), typeUrl.fileName()); 0237 0238 const auto itemProp = elem.attribute(QStringLiteral("itemprop")); 0239 if (!itemProp.isEmpty() && !result.isEmpty()) { 0240 // this is likely a child of our preceding sibling, but broken XML put it here 0241 auto parent = result.last().toObject(); 0242 parent.insert(itemProp, obj); 0243 result[result.size() - 1] = parent; 0244 } else { 0245 obj.insert(QStringLiteral("@context"), QStringLiteral("http://schema.org")); 0246 result.push_back(obj); 0247 } 0248 return; 0249 } 0250 0251 // recurse otherwise 0252 auto child = elem.firstChild(); 0253 while (!child.isNull()) { 0254 extractRecursive(child, result); 0255 child = child.nextSibling(); 0256 } 0257 } 0258 0259 void HtmlDocumentProcessor::preExtract(ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const 0260 { 0261 auto doc = node.content<HtmlDocument*>(); 0262 Q_ASSERT(doc); 0263 0264 if (!doc->root().isNull()) { 0265 QJsonArray result; 0266 extractRecursive(doc->root(), result); 0267 node.addResult(result); 0268 } 0269 } 0270 0271 void HtmlDocumentProcessor::postExtract(ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const 0272 { 0273 if (node.childNodes().empty() || node.result().isEmpty()) { 0274 return; 0275 } 0276 0277 const QString text = node.childNodes().back().content<QString>(); 0278 GenericPriceExtractorHelper::postExtract(text, node); 0279 } 0280 0281 QJSValue HtmlDocumentProcessor::contentToScriptValue(const ExtractorDocumentNode &node, QJSEngine *engine) const 0282 { 0283 return engine->toScriptValue(node.content<HtmlDocument*>()); 0284 } 0285 0286 void HtmlDocumentProcessor::destroyNode(ExtractorDocumentNode &node) const 0287 { 0288 destroyIfOwned<HtmlDocument>(node); 0289 } 0290 0291 void HtmlDocumentProcessor::expandElementRecursive(ExtractorDocumentNode &node, const HtmlElement &elem, const ExtractorEngine *engine) const 0292 { 0293 if (elem.name() == QLatin1StringView("img")) { 0294 const auto src = elem.attribute(QLatin1StringView("src")); 0295 if (src.startsWith(QLatin1StringView("data:"))) { 0296 expandDataUrl(node, src, engine); 0297 } 0298 } 0299 0300 auto child = elem.firstChild(); 0301 while (!child.isNull()) { 0302 expandElementRecursive(node, child, engine); 0303 child = child.nextSibling(); 0304 } 0305 } 0306 0307 void HtmlDocumentProcessor::expandDataUrl(ExtractorDocumentNode &node, QStringView data, const ExtractorEngine *engine) const 0308 { 0309 const auto idx = data.indexOf(QLatin1Char(',')); 0310 if (idx < 0) { 0311 return; 0312 } 0313 const auto header = data.mid(5, idx - 5); 0314 const auto headerItems = header.split(QLatin1Char(';')); 0315 if (headerItems.isEmpty()) { 0316 return; 0317 } 0318 0319 if (headerItems.front() != QLatin1StringView("image/png")) { 0320 return; 0321 } 0322 0323 auto imgData = data.mid(idx).toUtf8(); 0324 if (headerItems.back() == QLatin1StringView("base64")) { 0325 imgData = QByteArray::fromBase64(imgData.trimmed()); 0326 } 0327 0328 auto child = engine->documentNodeFactory()->createNode(imgData, {}, headerItems.front()); 0329 node.appendChild(child); 0330 }