Warning, file /pim/kitinerary/src/lib/htmldocument.cpp was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).
0001 /* 0002 SPDX-FileCopyrightText: 2018 Volker Krause <vkrause@kde.org> 0003 0004 SPDX-License-Identifier: LGPL-2.0-or-later 0005 */ 0006 0007 #include "config-kitinerary.h" 0008 #include "htmldocument.h" 0009 0010 #include <QDebug> 0011 #include <QVariant> 0012 0013 #if HAVE_LIBXML2 0014 #include <libxml/HTMLparser.h> 0015 #include <libxml/xpath.h> 0016 #endif 0017 0018 using namespace KItinerary; 0019 0020 namespace KItinerary { 0021 class HtmlDocumentPrivate { 0022 public: 0023 #if HAVE_LIBXML2 0024 ~HtmlDocumentPrivate() { 0025 xmlFreeDoc(m_doc); 0026 } 0027 0028 xmlDocPtr m_doc; 0029 QByteArray m_rawData; 0030 #endif 0031 }; 0032 } 0033 0034 HtmlElement::HtmlElement() 0035 : d(nullptr) 0036 { 0037 } 0038 0039 HtmlElement::~HtmlElement() = default; 0040 0041 #if HAVE_LIBXML2 0042 HtmlElement::HtmlElement(xmlNode *dd) 0043 : d(dd) 0044 { 0045 } 0046 #endif 0047 0048 HtmlDocument::HtmlDocument(QObject *parent) 0049 : QObject(parent) 0050 , d(new HtmlDocumentPrivate) 0051 { 0052 } 0053 0054 HtmlDocument::~HtmlDocument() = default; 0055 0056 bool HtmlElement::isNull() const 0057 { 0058 return d == nullptr; 0059 } 0060 0061 QString HtmlElement::name() const 0062 { 0063 #if HAVE_LIBXML2 0064 if (d) { 0065 return QString::fromUtf8(reinterpret_cast<const char*>(d->name)); 0066 } 0067 #endif 0068 return {}; 0069 } 0070 0071 QString HtmlElement::attribute(const QString &attr) const 0072 { 0073 #if HAVE_LIBXML2 0074 if (d) { 0075 const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlGetProp(d, reinterpret_cast<const xmlChar*>(attr.toUtf8().constData())), xmlFree); 0076 return QString::fromUtf8(reinterpret_cast<const char*>(val.get())); 0077 } 0078 #else 0079 Q_UNUSED(attr) 0080 #endif 0081 return {}; 0082 } 0083 0084 HtmlElement HtmlElement::parent() const 0085 { 0086 #if HAVE_LIBXML2 0087 if (d && d->parent && d->parent->type == XML_ELEMENT_NODE) { 0088 return HtmlElement(d->parent); 0089 } 0090 #endif 0091 return {}; 0092 } 0093 0094 HtmlElement HtmlElement::firstChild() const 0095 { 0096 #if HAVE_LIBXML2 0097 if (d) { 0098 return HtmlElement(xmlFirstElementChild(d)); 0099 } 0100 #endif 0101 return {}; 0102 } 0103 0104 HtmlElement HtmlElement::nextSibling() const 0105 { 0106 #if HAVE_LIBXML2 0107 if (d) { 0108 return HtmlElement(xmlNextElementSibling(d)); 0109 } 0110 #endif 0111 return {}; 0112 } 0113 0114 #if HAVE_LIBXML2 0115 static void normalizingAppend(QString &out, const QString &in) 0116 { 0117 if (in.isEmpty()) { 0118 return; 0119 } 0120 0121 const bool needsLeadingSpace = !out.isEmpty() && !out.back().isSpace(); 0122 out.reserve(out.size() + in.size() + (needsLeadingSpace ? 1 : 0)); 0123 if (needsLeadingSpace) { 0124 out.push_back(QChar::Space); 0125 } 0126 0127 // convert non-breaking spaces and windows line break to normal ones, technically not correct 0128 // but way too often this confuses our regular expressions 0129 bool leadingTrim = true; 0130 bool foundCR = false; 0131 for (const auto c : in) { 0132 // trim leading spaces while we are at it 0133 if (leadingTrim && c.isSpace()) { 0134 continue; 0135 } 0136 leadingTrim = false; 0137 0138 // normalize CRs 0139 if (c == QChar::CarriageReturn) { 0140 foundCR = true; 0141 continue; 0142 } 0143 if (foundCR && c != QChar::LineFeed) { 0144 out.push_back(QChar::LineFeed); 0145 } 0146 foundCR = false; 0147 0148 // normalize space variations 0149 if (c == QChar::Nbsp) { 0150 out.push_back(QChar::Space); 0151 } else { 0152 out.push_back(c); 0153 } 0154 } 0155 } 0156 0157 static void normalizingLineBreakAppend(QString &s) 0158 { 0159 s = s.trimmed(); 0160 s.push_back(QChar::LineFeed); 0161 } 0162 #endif 0163 0164 QString HtmlElement::content() const 0165 { 0166 #if HAVE_LIBXML2 0167 if (!d) { 0168 return {}; 0169 } 0170 0171 QString s; 0172 auto node = d->children; 0173 while (node) { 0174 switch (node->type) { 0175 case XML_TEXT_NODE: 0176 case XML_CDATA_SECTION_NODE: 0177 normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(node->content))); 0178 break; 0179 case XML_ENTITY_REF_NODE: 0180 { 0181 const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlNodeGetContent(node), xmlFree); 0182 normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(val.get()))); 0183 break; 0184 } 0185 case XML_ELEMENT_NODE: 0186 if (qstricmp(reinterpret_cast<const char*>(node->name), "br") == 0) { 0187 s += QLatin1Char('\n'); 0188 } 0189 break; 0190 default: 0191 break; 0192 0193 } 0194 node = node->next; 0195 } 0196 0197 return s.trimmed(); // trailing trim can be done without copying 0198 #endif 0199 return {}; 0200 } 0201 0202 #if HAVE_LIBXML2 0203 static void recursiveContent(_xmlNode *node, QString &s) 0204 { 0205 switch (node->type) { 0206 case XML_TEXT_NODE: 0207 case XML_CDATA_SECTION_NODE: 0208 normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(node->content))); 0209 return; 0210 case XML_ENTITY_REF_NODE: 0211 { 0212 const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlNodeGetContent(node), xmlFree); 0213 normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(val.get()))); 0214 break; 0215 } 0216 case XML_ELEMENT_NODE: 0217 { 0218 if (qstricmp(reinterpret_cast<const char*>(node->name), "style") == 0) { 0219 return; 0220 } else if (qstricmp(reinterpret_cast<const char*>(node->name), "table") == 0) { 0221 normalizingLineBreakAppend(s); 0222 } 0223 break; 0224 } 0225 case XML_ATTRIBUTE_NODE: 0226 case XML_COMMENT_NODE: 0227 return; 0228 default: 0229 break; 0230 } 0231 0232 auto child = node->children; 0233 while (child) { 0234 recursiveContent(child, s); 0235 child = child->next; 0236 } 0237 0238 if (node->type == XML_ELEMENT_NODE) { 0239 for (const auto elemName : { "br", "p", "tr" }) { 0240 if (qstricmp(reinterpret_cast<const char*>(node->name), elemName) == 0) { 0241 normalizingLineBreakAppend(s); 0242 break; 0243 } 0244 } 0245 } 0246 } 0247 #endif 0248 0249 QString HtmlElement::recursiveContent() const 0250 { 0251 #if HAVE_LIBXML2 0252 if (!d) { 0253 return {}; 0254 } 0255 0256 QString s; 0257 ::recursiveContent(d, s); 0258 return s.trimmed(); // trailing trim can be done without copying 0259 #else 0260 return {}; 0261 #endif 0262 } 0263 0264 QVariant HtmlElement::eval(const QString &xpath) const 0265 { 0266 #if HAVE_LIBXML2 0267 if (!d) { 0268 return {}; 0269 } 0270 0271 const auto ctx = std::unique_ptr<xmlXPathContext, decltype(&xmlXPathFreeContext)>(xmlXPathNewContext(d->doc), &xmlXPathFreeContext); 0272 if (!ctx) { 0273 return {}; 0274 } 0275 xmlXPathSetContextNode(d, ctx.get()); 0276 const auto xpathObj = std::unique_ptr<xmlXPathObject, decltype(&xmlXPathFreeObject)>(xmlXPathEvalExpression(reinterpret_cast<const xmlChar*>(xpath.toUtf8().constData()), ctx.get()), &xmlXPathFreeObject); 0277 if (!xpathObj) { 0278 return {}; 0279 } 0280 0281 switch (xpathObj->type) { 0282 case XPATH_NODESET: 0283 { 0284 QVariantList l; 0285 if (!xpathObj->nodesetval) { 0286 return l; 0287 } 0288 l.reserve(xpathObj->nodesetval->nodeNr); 0289 for (int i = 0; i < xpathObj->nodesetval->nodeNr; ++i) { 0290 l.push_back(QVariant::fromValue<HtmlElement>(xpathObj->nodesetval->nodeTab[i])); 0291 } 0292 return l; 0293 } 0294 case XPATH_BOOLEAN: 0295 return QVariant::fromValue<bool>(xpathObj->boolval); 0296 case XPATH_NUMBER: 0297 return xpathObj->floatval; 0298 case XPATH_STRING: 0299 return QString::fromUtf8(reinterpret_cast<const char*>(xpathObj->stringval)); 0300 default: 0301 return {}; 0302 } 0303 #else 0304 Q_UNUSED(xpath) 0305 #endif 0306 return {}; 0307 } 0308 0309 bool HtmlElement::hasAttribute(const QString& attr) const 0310 { 0311 #if HAVE_LIBXML2 0312 if (!d) { 0313 return false; 0314 } 0315 0316 auto attribute = d->properties; 0317 while(attribute) 0318 { 0319 if (qstricmp(attr.toUtf8().constData(), reinterpret_cast<const char*>(attribute->name)) == 0) { 0320 return true; 0321 } 0322 attribute = attribute->next; 0323 } 0324 #else 0325 Q_UNUSED(attr) 0326 #endif 0327 return false; 0328 } 0329 0330 QStringList HtmlElement::attributes() const 0331 { 0332 QStringList l; 0333 #if HAVE_LIBXML2 0334 if (!d) { 0335 return l; 0336 } 0337 0338 auto attribute = d->properties; 0339 while(attribute) 0340 { 0341 l.push_back(QString::fromUtf8(reinterpret_cast<const char*>(attribute->name))); 0342 attribute = attribute->next; 0343 } 0344 #endif 0345 return l; 0346 } 0347 0348 bool HtmlElement::operator==(const HtmlElement &other) const 0349 { 0350 return d == other.d; 0351 } 0352 0353 0354 HtmlElement HtmlDocument::root() const 0355 { 0356 #if HAVE_LIBXML2 0357 if (!d->m_doc) { 0358 return {}; 0359 } 0360 return HtmlElement(xmlDocGetRootElement(d->m_doc)); 0361 #else 0362 return {}; 0363 #endif 0364 } 0365 0366 QString HtmlDocument::rawData() const 0367 { 0368 #if HAVE_LIBXML2 0369 return QString::fromUtf8(d->m_rawData); 0370 #else 0371 return {}; 0372 #endif 0373 } 0374 0375 QVariant HtmlDocument::eval(const QString &xpath) const 0376 { 0377 return root().eval(xpath); 0378 } 0379 0380 HtmlDocument* HtmlDocument::fromData(const QByteArray &data, QObject *parent) 0381 { 0382 #if HAVE_LIBXML2 0383 auto tree = htmlReadMemory(data.constData(), data.size(), nullptr, "utf-8", HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NOBLANKS | HTML_PARSE_NONET | HTML_PARSE_COMPACT); 0384 if (!tree) { 0385 return nullptr; 0386 } 0387 0388 auto doc = new HtmlDocument(parent); 0389 doc->d->m_doc = tree; 0390 doc->d->m_rawData = data; 0391 return doc; 0392 #else 0393 Q_UNUSED(data) 0394 Q_UNUSED(parent) 0395 return nullptr; 0396 #endif 0397 } 0398 0399 HtmlDocument* HtmlDocument::fromString(const QString &data, QObject *parent) 0400 { 0401 #if HAVE_LIBXML2 0402 auto utf8Data = data.toUtf8(); 0403 auto tree = htmlReadMemory(utf8Data.constData(), utf8Data.size(), nullptr, "utf-8", HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NOBLANKS | HTML_PARSE_NONET | HTML_PARSE_COMPACT); 0404 if (!tree) { 0405 return nullptr; 0406 } 0407 0408 auto doc = new HtmlDocument(parent); 0409 doc->d->m_doc = tree; 0410 doc->d->m_rawData = std::move(utf8Data); 0411 return doc; 0412 #else 0413 Q_UNUSED(data) 0414 Q_UNUSED(parent) 0415 return nullptr; 0416 #endif 0417 } 0418 0419 #include "moc_htmldocument.cpp"