Warning, file /pim/kitinerary/src/lib/htmldocument.cpp was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).

0001 /*
0002     SPDX-FileCopyrightText: 2018 Volker Krause <vkrause@kde.org>
0003 
0004     SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006 
0007 #include "config-kitinerary.h"
0008 #include "htmldocument.h"
0009 
0010 #include <QDebug>
0011 #include <QVariant>
0012 
0013 #if HAVE_LIBXML2
0014 #include <libxml/HTMLparser.h>
0015 #include <libxml/xpath.h>
0016 #endif
0017 
0018 using namespace KItinerary;
0019 
0020 namespace KItinerary {
// Private data of HtmlDocument: the parsed libxml2 document tree and the
// raw bytes it was parsed from. Empty when built without libxml2.
class HtmlDocumentPrivate {
public:
#if HAVE_LIBXML2
    ~HtmlDocumentPrivate() {
        // xmlFreeDoc() accepts a null document, so no guard is needed here
        xmlFreeDoc(m_doc);
    }

    // Must start out as nullptr: HtmlDocument's constructor creates this object
    // before any tree is assigned, and both the destructor above and
    // HtmlDocument::root() rely on a null check/no-op for the "no document" case.
    xmlDocPtr m_doc = nullptr;
    // UTF-8 encoded input the tree was parsed from
    QByteArray m_rawData;
#endif
};
0032 }
0033 
0034 HtmlElement::HtmlElement()
0035     : d(nullptr)
0036 {
0037 }
0038 
0039 HtmlElement::~HtmlElement() = default;
0040 
0041 #if HAVE_LIBXML2
0042 HtmlElement::HtmlElement(xmlNode *dd)
0043     : d(dd)
0044 {
0045 }
0046 #endif
0047 
0048 HtmlDocument::HtmlDocument(QObject *parent)
0049     : QObject(parent)
0050     , d(new HtmlDocumentPrivate)
0051 {
0052 }
0053 
0054 HtmlDocument::~HtmlDocument() = default;
0055 
0056 bool HtmlElement::isNull() const
0057 {
0058     return d == nullptr;
0059 }
0060 
0061 QString HtmlElement::name() const
0062 {
0063 #if HAVE_LIBXML2
0064     if (d) {
0065         return QString::fromUtf8(reinterpret_cast<const char*>(d->name));
0066     }
0067 #endif
0068     return {};
0069 }
0070 
0071 QString HtmlElement::attribute(const QString &attr) const
0072 {
0073 #if HAVE_LIBXML2
0074     if (d) {
0075         const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlGetProp(d, reinterpret_cast<const xmlChar*>(attr.toUtf8().constData())), xmlFree);
0076         return QString::fromUtf8(reinterpret_cast<const char*>(val.get()));
0077     }
0078 #else
0079     Q_UNUSED(attr)
0080 #endif
0081     return {};
0082 }
0083 
0084 HtmlElement HtmlElement::parent() const
0085 {
0086 #if HAVE_LIBXML2
0087     if (d && d->parent && d->parent->type == XML_ELEMENT_NODE) {
0088         return HtmlElement(d->parent);
0089     }
0090 #endif
0091     return {};
0092 }
0093 
0094 HtmlElement HtmlElement::firstChild() const
0095 {
0096 #if HAVE_LIBXML2
0097     if (d) {
0098         return HtmlElement(xmlFirstElementChild(d));
0099     }
0100 #endif
0101     return {};
0102 }
0103 
0104 HtmlElement HtmlElement::nextSibling() const
0105 {
0106 #if HAVE_LIBXML2
0107     if (d) {
0108         return HtmlElement(xmlNextElementSibling(d));
0109     }
0110 #endif
0111     return {};
0112 }
0113 
0114 #if HAVE_LIBXML2
0115 static void normalizingAppend(QString &out, const QString &in)
0116 {
0117     if (in.isEmpty()) {
0118         return;
0119     }
0120 
0121     const bool needsLeadingSpace = !out.isEmpty() && !out.back().isSpace();
0122     out.reserve(out.size() + in.size() + (needsLeadingSpace ? 1 : 0));
0123     if (needsLeadingSpace) {
0124         out.push_back(QChar::Space);
0125     }
0126 
0127     // convert non-breaking spaces and windows line break to normal ones, technically not correct
0128     // but way too often this confuses our regular expressions
0129     bool leadingTrim = true;
0130     bool foundCR = false;
0131     for (const auto c : in) {
0132         // trim leading spaces while we are at it
0133         if (leadingTrim && c.isSpace()) {
0134             continue;
0135         }
0136         leadingTrim = false;
0137 
0138         // normalize CRs
0139         if (c == QChar::CarriageReturn) {
0140             foundCR = true;
0141             continue;
0142         }
0143         if (foundCR && c != QChar::LineFeed) {
0144             out.push_back(QChar::LineFeed);
0145         }
0146         foundCR = false;
0147 
0148         // normalize space variations
0149         if (c == QChar::Nbsp) {
0150             out.push_back(QChar::Space);
0151         } else {
0152             out.push_back(c);
0153         }
0154     }
0155 }
0156 
0157 static void normalizingLineBreakAppend(QString &s)
0158 {
0159     s = s.trimmed();
0160     s.push_back(QChar::LineFeed);
0161 }
0162 #endif
0163 
0164 QString HtmlElement::content() const
0165 {
0166 #if HAVE_LIBXML2
0167     if (!d) {
0168         return {};
0169     }
0170 
0171     QString s;
0172     auto node = d->children;
0173     while (node) {
0174         switch (node->type) {
0175             case XML_TEXT_NODE:
0176             case XML_CDATA_SECTION_NODE:
0177                 normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(node->content)));
0178                 break;
0179             case XML_ENTITY_REF_NODE:
0180             {
0181                 const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlNodeGetContent(node), xmlFree);
0182                 normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(val.get())));
0183                 break;
0184             }
0185             case XML_ELEMENT_NODE:
0186                 if (qstricmp(reinterpret_cast<const char*>(node->name), "br") == 0) {
0187                     s += QLatin1Char('\n');
0188                 }
0189                 break;
0190             default:
0191                 break;
0192 
0193         }
0194         node = node->next;
0195     }
0196 
0197     return s.trimmed(); // trailing trim can be done without copying
0198 #endif
0199     return {};
0200 }
0201 
0202 #if HAVE_LIBXML2
0203 static void recursiveContent(_xmlNode *node, QString &s)
0204 {
0205     switch (node->type) {
0206         case XML_TEXT_NODE:
0207         case XML_CDATA_SECTION_NODE:
0208             normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(node->content)));
0209             return;
0210         case XML_ENTITY_REF_NODE:
0211         {
0212             const auto val = std::unique_ptr<xmlChar, decltype(xmlFree)>(xmlNodeGetContent(node), xmlFree);
0213             normalizingAppend(s, QString::fromUtf8(reinterpret_cast<const char*>(val.get())));
0214             break;
0215         }
0216         case XML_ELEMENT_NODE:
0217         {
0218             if (qstricmp(reinterpret_cast<const char*>(node->name), "style") == 0) {
0219                 return;
0220             } else if (qstricmp(reinterpret_cast<const char*>(node->name), "table") == 0) {
0221                 normalizingLineBreakAppend(s);
0222             }
0223             break;
0224         }
0225         case XML_ATTRIBUTE_NODE:
0226         case XML_COMMENT_NODE:
0227             return;
0228         default:
0229             break;
0230     }
0231 
0232     auto child = node->children;
0233     while (child) {
0234         recursiveContent(child, s);
0235         child = child->next;
0236     }
0237 
0238     if (node->type == XML_ELEMENT_NODE) {
0239         for (const auto elemName : { "br", "p", "tr" }) {
0240             if (qstricmp(reinterpret_cast<const char*>(node->name), elemName) == 0) {
0241                 normalizingLineBreakAppend(s);
0242                 break;
0243             }
0244         }
0245     }
0246 }
0247 #endif
0248 
0249 QString HtmlElement::recursiveContent() const
0250 {
0251 #if HAVE_LIBXML2
0252     if (!d) {
0253         return {};
0254     }
0255 
0256     QString s;
0257     ::recursiveContent(d, s);
0258     return s.trimmed(); // trailing trim can be done without copying
0259 #else
0260     return {};
0261 #endif
0262 }
0263 
0264 QVariant HtmlElement::eval(const QString &xpath) const
0265 {
0266 #if HAVE_LIBXML2
0267     if (!d) {
0268         return {};
0269     }
0270 
0271     const auto ctx = std::unique_ptr<xmlXPathContext, decltype(&xmlXPathFreeContext)>(xmlXPathNewContext(d->doc), &xmlXPathFreeContext);
0272     if (!ctx) {
0273         return {};
0274     }
0275     xmlXPathSetContextNode(d, ctx.get());
0276     const auto xpathObj = std::unique_ptr<xmlXPathObject, decltype(&xmlXPathFreeObject)>(xmlXPathEvalExpression(reinterpret_cast<const xmlChar*>(xpath.toUtf8().constData()), ctx.get()), &xmlXPathFreeObject);
0277     if (!xpathObj) {
0278         return {};
0279     }
0280 
0281     switch (xpathObj->type) {
0282         case XPATH_NODESET:
0283         {
0284             QVariantList l;
0285             if (!xpathObj->nodesetval) {
0286                 return l;
0287             }
0288             l.reserve(xpathObj->nodesetval->nodeNr);
0289             for (int i = 0; i < xpathObj->nodesetval->nodeNr; ++i) {
0290                 l.push_back(QVariant::fromValue<HtmlElement>(xpathObj->nodesetval->nodeTab[i]));
0291             }
0292             return l;
0293         }
0294         case XPATH_BOOLEAN:
0295             return QVariant::fromValue<bool>(xpathObj->boolval);
0296         case XPATH_NUMBER:
0297             return xpathObj->floatval;
0298         case XPATH_STRING:
0299             return QString::fromUtf8(reinterpret_cast<const char*>(xpathObj->stringval));
0300         default:
0301             return {};
0302     }
0303 #else
0304     Q_UNUSED(xpath)
0305 #endif
0306     return {};
0307 }
0308 
0309 bool HtmlElement::hasAttribute(const QString& attr) const
0310 {
0311 #if HAVE_LIBXML2
0312     if (!d) {
0313         return false;
0314     }
0315 
0316     auto attribute = d->properties;
0317     while(attribute)
0318     {
0319         if (qstricmp(attr.toUtf8().constData(), reinterpret_cast<const char*>(attribute->name)) == 0) {
0320             return true;
0321         }
0322         attribute = attribute->next;
0323     }
0324 #else
0325     Q_UNUSED(attr)
0326 #endif
0327     return false;
0328 }
0329 
0330 QStringList HtmlElement::attributes() const
0331 {
0332     QStringList l;
0333 #if HAVE_LIBXML2
0334     if (!d) {
0335         return l;
0336     }
0337 
0338     auto attribute = d->properties;
0339     while(attribute)
0340     {
0341         l.push_back(QString::fromUtf8(reinterpret_cast<const char*>(attribute->name)));
0342         attribute = attribute->next;
0343     }
0344 #endif
0345     return l;
0346 }
0347 
0348 bool HtmlElement::operator==(const HtmlElement &other) const
0349 {
0350     return d == other.d;
0351 }
0352 
0353 
0354 HtmlElement HtmlDocument::root() const
0355 {
0356 #if HAVE_LIBXML2
0357     if (!d->m_doc) {
0358         return {};
0359     }
0360     return HtmlElement(xmlDocGetRootElement(d->m_doc));
0361 #else
0362     return {};
0363 #endif
0364 }
0365 
0366 QString HtmlDocument::rawData() const
0367 {
0368 #if HAVE_LIBXML2
0369     return QString::fromUtf8(d->m_rawData);
0370 #else
0371     return {};
0372 #endif
0373 }
0374 
0375 QVariant HtmlDocument::eval(const QString &xpath) const
0376 {
0377     return root().eval(xpath);
0378 }
0379 
0380 HtmlDocument* HtmlDocument::fromData(const QByteArray &data, QObject *parent)
0381 {
0382 #if HAVE_LIBXML2
0383     auto tree = htmlReadMemory(data.constData(), data.size(), nullptr, "utf-8", HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NOBLANKS | HTML_PARSE_NONET | HTML_PARSE_COMPACT);
0384     if (!tree) {
0385         return nullptr;
0386     }
0387 
0388     auto doc = new HtmlDocument(parent);
0389     doc->d->m_doc = tree;
0390     doc->d->m_rawData = data;
0391     return doc;
0392 #else
0393     Q_UNUSED(data)
0394     Q_UNUSED(parent)
0395     return nullptr;
0396 #endif
0397 }
0398 
0399 HtmlDocument* HtmlDocument::fromString(const QString &data, QObject *parent)
0400 {
0401 #if HAVE_LIBXML2
0402     auto utf8Data = data.toUtf8();
0403     auto tree = htmlReadMemory(utf8Data.constData(), utf8Data.size(), nullptr, "utf-8", HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NOBLANKS | HTML_PARSE_NONET | HTML_PARSE_COMPACT);
0404     if (!tree) {
0405         return nullptr;
0406     }
0407 
0408     auto doc = new HtmlDocument(parent);
0409     doc->d->m_doc = tree;
0410     doc->d->m_rawData = std::move(utf8Data);
0411     return doc;
0412 #else
0413     Q_UNUSED(data)
0414     Q_UNUSED(parent)
0415     return nullptr;
0416 #endif
0417 }
0418 
0419 #include "moc_htmldocument.cpp"