File indexing completed on 2024-09-08 12:24:36

0001 /*
0002     This file is part of the syndication library
0003     SPDX-FileCopyrightText: 2005 Frank Osterfeld <osterfeld@kde.org>
0004 
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0007 
0008 #include <rss2/category.h>
0009 #include <rss2/cloud.h>
0010 #include <rss2/document.h>
0011 #include <rss2/image.h>
0012 #include <rss2/item.h>
0013 #include <rss2/textinput.h>
0014 
0015 #include <constants.h>
0016 #include <documentvisitor.h>
0017 #include <tools.h>
0018 
0019 #include <QDomDocument>
0020 #include <QList>
0021 #include <QSet>
0022 #include <QString>
0023 
0024 #include <vector>
0025 
0026 namespace Syndication
0027 {
0028 namespace RSS2
0029 {
0030 class SYNDICATION_NO_EXPORT Document::DocumentPrivate
0031 {
0032 public:
0033     DocumentPrivate()
0034         : itemDescriptionIsCDATA(false)
0035         , itemDescriptionContainsMarkup(false)
0036         , itemDescGuessed(false)
0037         , itemTitleIsCDATA(false)
0038         , itemTitleContainsMarkup(false)
0039         , itemTitlesGuessed(false)
0040     {
0041     }
0042     mutable bool itemDescriptionIsCDATA;
0043     mutable bool itemDescriptionContainsMarkup;
0044     mutable bool itemDescGuessed;
0045     mutable bool itemTitleIsCDATA;
0046     mutable bool itemTitleContainsMarkup;
0047     mutable bool itemTitlesGuessed;
0048 };
0049 
0050 Document::Document(const QDomElement &element)
0051     : SpecificDocument()
0052     , ElementWrapper(element)
0053     , d(new DocumentPrivate)
0054 {
0055 }
0056 
0057 Document Document::fromXML(const QDomDocument &doc)
0058 {
0059     QDomNode channelNode = doc.namedItem(QStringLiteral("rss")).namedItem(QStringLiteral("channel"));
0060 
0061     return Document(channelNode.toElement());
0062 }
0063 
0064 Document::Document()
0065     : SpecificDocument()
0066     , ElementWrapper()
0067     , d(new DocumentPrivate)
0068 {
0069 }
0070 
0071 Document::Document(const Document &other)
0072     : SpecificDocument(other)
0073     , ElementWrapper(other)
0074 {
0075     d = other.d;
0076 }
0077 
0078 Document::~Document()
0079 {
0080 }
0081 
0082 Document &Document::operator=(const Document &other)
0083 {
0084     ElementWrapper::operator=(other);
0085     d = other.d;
0086     return *this;
0087 }
0088 bool Document::isValid() const
0089 {
0090     return !isNull();
0091 }
0092 
0093 QString Document::title() const
0094 {
0095     return extractElementTextNS(QString(), QStringLiteral("title"));
0096 }
0097 
0098 QString Document::link() const
0099 {
0100     return extractElementTextNS(QString(), QStringLiteral("link"));
0101 }
0102 
0103 QString Document::description() const
0104 {
0105     const QString desc = extractElementTextNS(QString(), QStringLiteral("description"));
0106     return normalize(desc);
0107 }
0108 
0109 QString Document::language() const
0110 {
0111     const QString lang = extractElementTextNS(QString(), QStringLiteral("language"));
0112 
0113     if (!lang.isNull()) {
0114         return lang;
0115     } else {
0116         return extractElementTextNS(dublinCoreNamespace(), QStringLiteral("language"));
0117     }
0118 }
0119 
0120 QString Document::copyright() const
0121 {
0122     const QString rights = extractElementTextNS(QString(), QStringLiteral("copyright"));
0123     if (!rights.isNull()) {
0124         return rights;
0125     } else {
0126         // if <copyright> is not provided, use <dc:rights>
0127         return extractElementTextNS(dublinCoreNamespace(), QStringLiteral("rights"));
0128     }
0129 }
0130 
0131 QString Document::managingEditor() const
0132 {
0133     return extractElementTextNS(QString(), QStringLiteral("managingEditor"));
0134 }
0135 
0136 QString Document::webMaster() const
0137 {
0138     return extractElementTextNS(QString(), QStringLiteral("webMaster"));
0139 }
0140 
0141 time_t Document::pubDate() const
0142 {
0143     QString str = extractElementTextNS(QString(), QStringLiteral("pubDate"));
0144 
0145     if (!str.isNull()) {
0146         return parseDate(str, RFCDate);
0147     } else {
0148         // if there is no pubDate, check for dc:date
0149         str = extractElementTextNS(dublinCoreNamespace(), QStringLiteral("date"));
0150         return parseDate(str, ISODate);
0151     }
0152 }
0153 
0154 time_t Document::lastBuildDate() const
0155 {
0156     const QString str = extractElementTextNS(QString(), QStringLiteral("lastBuildDate"));
0157 
0158     return parseDate(str, RFCDate);
0159 }
0160 
0161 QList<Category> Document::categories() const
0162 {
0163     const QList<QDomElement> catNodes = elementsByTagNameNS(QString(), QStringLiteral("category"));
0164 
0165     QList<Category> categories;
0166     categories.reserve(catNodes.count());
0167 
0168     std::transform(catNodes.cbegin(), catNodes.cend(), std::back_inserter(categories), [](const QDomElement &element) {
0169         return Category(element);
0170     });
0171 
0172     return categories;
0173 }
0174 
0175 QString Document::generator() const
0176 {
0177     return extractElementTextNS(QString(), QStringLiteral("generator"));
0178 }
0179 
0180 QString Document::docs() const
0181 {
0182     return extractElementTextNS(QString(), QStringLiteral("docs"));
0183 }
0184 
0185 Cloud Document::cloud() const
0186 {
0187     return Cloud(firstElementByTagNameNS(QString(), QStringLiteral("cloud")));
0188 }
0189 
0190 int Document::ttl() const
0191 {
0192     bool ok;
0193     int c;
0194 
0195     QString text = extractElementTextNS(QString(), QStringLiteral("ttl"));
0196     c = text.toInt(&ok);
0197     return ok ? c : 0;
0198 }
0199 
0200 Image Document::image() const
0201 {
0202     return Image(firstElementByTagNameNS(QString(), QStringLiteral("image")));
0203 }
0204 
0205 TextInput Document::textInput() const
0206 {
0207     TextInput ti = TextInput(firstElementByTagNameNS(QString(), QStringLiteral("textInput")));
0208 
0209     if (!ti.isNull()) {
0210         return ti;
0211     }
0212 
0213     // Netscape's version of RSS 0.91 has textinput, not textInput
0214     return TextInput(firstElementByTagNameNS(QString(), QStringLiteral("textinput")));
0215 }
0216 
0217 QSet<int> Document::skipHours() const
0218 {
0219     QSet<int> skipHours;
0220     QDomElement skipHoursNode = firstElementByTagNameNS(QString(), QStringLiteral("skipHours"));
0221     if (!skipHoursNode.isNull()) {
0222         ElementWrapper skipHoursWrapper(skipHoursNode);
0223         bool ok = false;
0224         const QList<QDomElement> hours = skipHoursWrapper.elementsByTagNameNS(QString(), QStringLiteral("hour"));
0225         for (const auto &element : hours) {
0226             const int h = element.text().toInt(&ok);
0227             if (ok) {
0228                 skipHours.insert(h);
0229             }
0230         }
0231     }
0232 
0233     return skipHours;
0234 }
0235 
0236 QSet<Document::DayOfWeek> Document::skipDays() const
0237 {
0238     QSet<DayOfWeek> skipDays;
0239     QDomElement skipDaysNode = firstElementByTagNameNS(QString(), QStringLiteral("skipDays"));
0240     if (!skipDaysNode.isNull()) {
0241         ElementWrapper skipDaysWrapper(skipDaysNode);
0242         struct DayInfo {
0243             QLatin1String name;
0244             DayOfWeek enumValue;
0245         };
0246         static const std::vector<DayInfo> weekDays = {
0247             {QLatin1String("Monday"), Monday},
0248             {QLatin1String("Tuesday"), Tuesday},
0249             {QLatin1String("Wednesday"), Wednesday},
0250             {QLatin1String("Thursday"), Thursday},
0251             {QLatin1String("Friday"), Friday},
0252             {QLatin1String("Saturday"), Saturday},
0253             {QLatin1String("Sunday"), Sunday},
0254         };
0255 
0256         const QList<QDomElement> days = skipDaysWrapper.elementsByTagNameNS(QString(), QStringLiteral("day"));
0257         for (const auto &element : days) {
0258             const QString day = element.text();
0259             auto it = std::find_if(weekDays.cbegin(), weekDays.cend(), [&day](const DayInfo &info) {
0260                 return info.name == day;
0261             });
0262             if (it != weekDays.cend()) {
0263                 skipDays.insert(it->enumValue);
0264             }
0265         }
0266     }
0267 
0268     return skipDays;
0269 }
0270 
0271 QList<Item> Document::items() const
0272 {
0273     const QList<QDomElement> itemNodes = elementsByTagNameNS(QString(), QStringLiteral("item"));
0274 
0275     QList<Item> items;
0276     items.reserve(itemNodes.count());
0277 
0278     DocumentPtr doccpy(new Document(*this));
0279 
0280     std::transform(itemNodes.cbegin(), itemNodes.cend(), std::back_inserter(items), [&doccpy](const QDomElement &element) {
0281         return Item(element, doccpy);
0282     });
0283 
0284     return items;
0285 }
0286 QList<QDomElement> Document::unhandledElements() const
0287 {
0288     // TODO: do not hardcode this list here
0289     static std::vector<ElementType> handled; // QVector would require a default ctor, and ElementType is too big for QList
0290     if (handled.empty()) {
0291         handled.reserve(22);
0292         handled.push_back(ElementType(QStringLiteral("title")));
0293         handled.push_back(ElementType(QStringLiteral("link")));
0294         handled.push_back(ElementType(QStringLiteral("description")));
0295         handled.push_back(ElementType(QStringLiteral("language")));
0296         handled.push_back(ElementType(QStringLiteral("copyright")));
0297         handled.push_back(ElementType(QStringLiteral("managingEditor")));
0298         handled.push_back(ElementType(QStringLiteral("webMaster")));
0299         handled.push_back(ElementType(QStringLiteral("pubDate")));
0300         handled.push_back(ElementType(QStringLiteral("lastBuildDate")));
0301         handled.push_back(ElementType(QStringLiteral("skipDays")));
0302         handled.push_back(ElementType(QStringLiteral("skipHours")));
0303         handled.push_back(ElementType(QStringLiteral("item")));
0304         handled.push_back(ElementType(QStringLiteral("textinput")));
0305         handled.push_back(ElementType(QStringLiteral("textInput")));
0306         handled.push_back(ElementType(QStringLiteral("image")));
0307         handled.push_back(ElementType(QStringLiteral("ttl")));
0308         handled.push_back(ElementType(QStringLiteral("generator")));
0309         handled.push_back(ElementType(QStringLiteral("docs")));
0310         handled.push_back(ElementType(QStringLiteral("cloud")));
0311         handled.push_back(ElementType(QStringLiteral("language"), dublinCoreNamespace()));
0312         handled.push_back(ElementType(QStringLiteral("rights"), dublinCoreNamespace()));
0313         handled.push_back(ElementType(QStringLiteral("date"), dublinCoreNamespace()));
0314     }
0315 
0316     QList<QDomElement> notHandled;
0317 
0318     QDomNodeList children = element().childNodes();
0319     const int numChildren = children.size();
0320     for (int i = 0; i < numChildren; ++i) {
0321         QDomElement el = children.at(i).toElement();
0322         if (!el.isNull() //
0323             && std::find(handled.cbegin(), handled.cend(), ElementType(el.localName(), el.namespaceURI())) == handled.cend()) {
0324             notHandled.append(el);
0325         }
0326     }
0327 
0328     return notHandled;
0329 }
0330 
0331 QString Document::debugInfo() const
0332 {
0333     QString info;
0334     info += QLatin1String("### Document: ###################\n");
0335     if (!title().isNull()) {
0336         info += QLatin1String("title: #") + title() + QLatin1String("#\n");
0337     }
0338     if (!description().isNull()) {
0339         info += QLatin1String("description: #") + description() + QLatin1String("#\n");
0340     }
0341     if (!link().isNull()) {
0342         info += QLatin1String("link: #") + link() + QLatin1String("#\n");
0343     }
0344     if (!language().isNull()) {
0345         info += QLatin1String("language: #") + language() + QLatin1String("#\n");
0346     }
0347     if (!copyright().isNull()) {
0348         info += QLatin1String("copyright: #") + copyright() + QLatin1String("#\n");
0349     }
0350     if (!managingEditor().isNull()) {
0351         info += QLatin1String("managingEditor: #") + managingEditor() + QLatin1String("#\n");
0352     }
0353     if (!webMaster().isNull()) {
0354         info += QLatin1String("webMaster: #") + webMaster() + QLatin1String("#\n");
0355     }
0356 
0357     QString dpubdate = dateTimeToString(pubDate());
0358     if (!dpubdate.isNull()) {
0359         info += QLatin1String("pubDate: #") + dpubdate + QLatin1String("#\n");
0360     }
0361 
0362     QString dlastbuilddate = dateTimeToString(lastBuildDate());
0363     if (!dlastbuilddate.isNull()) {
0364         info += QLatin1String("lastBuildDate: #") + dlastbuilddate + QLatin1String("#\n");
0365     }
0366 
0367     if (!textInput().isNull()) {
0368         info += textInput().debugInfo();
0369     }
0370     if (!cloud().isNull()) {
0371         info += cloud().debugInfo();
0372     }
0373     if (!image().isNull()) {
0374         info += image().debugInfo();
0375     }
0376 
0377     const QList<Category> cats = categories();
0378 
0379     for (const auto &c : cats) {
0380         info += c.debugInfo();
0381     }
0382 
0383     const QList<Item> litems = items();
0384     for (const auto &item : litems) {
0385         info += item.debugInfo();
0386     }
0387     info += QLatin1String("### Document end ################\n");
0388     return info;
0389 }
0390 
0391 void Document::getItemTitleFormatInfo(bool *isCDATA, bool *containsMarkup) const
0392 {
0393     if (!d->itemTitlesGuessed) {
0394         QString titles;
0395         QList<Item> litems = items();
0396 
0397         if (litems.isEmpty()) {
0398             d->itemTitlesGuessed = true;
0399             return;
0400         }
0401 
0402         QDomElement titleEl = (*litems.begin()).firstElementByTagNameNS(QString(), QStringLiteral("title"));
0403         d->itemTitleIsCDATA = titleEl.firstChild().isCDATASection();
0404 
0405         const int nmax = std::min<int>(litems.size(), 10); // we check a maximum of 10 items
0406         int i = 0;
0407 
0408         for (const auto &item : litems) {
0409             if (i++ >= nmax) {
0410                 break;
0411             }
0412             titles += item.originalTitle();
0413         }
0414 
0415         d->itemTitleContainsMarkup = stringContainsMarkup(titles);
0416         d->itemTitlesGuessed = true;
0417     }
0418 
0419     if (isCDATA != nullptr) {
0420         *isCDATA = d->itemTitleIsCDATA;
0421     }
0422     if (containsMarkup != nullptr) {
0423         *containsMarkup = d->itemTitleContainsMarkup;
0424     }
0425 }
0426 
0427 void Document::getItemDescriptionFormatInfo(bool *isCDATA, bool *containsMarkup) const
0428 {
0429     if (!d->itemDescGuessed) {
0430         QString desc;
0431         QList<Item> litems = items();
0432 
0433         if (litems.isEmpty()) {
0434             d->itemDescGuessed = true;
0435             return;
0436         }
0437 
0438         QDomElement descEl = (*litems.begin()).firstElementByTagNameNS(QString(), QStringLiteral("description"));
0439         d->itemDescriptionIsCDATA = descEl.firstChild().isCDATASection();
0440 
0441         const int nmax = std::min<int>(litems.size(), 10); // we check a maximum of 10 items
0442         int i = 0;
0443 
0444         for (const auto &item : litems) {
0445             if (i++ >= nmax) {
0446                 break;
0447             }
0448             desc += item.originalDescription();
0449         }
0450 
0451         d->itemDescriptionContainsMarkup = stringContainsMarkup(desc);
0452         d->itemDescGuessed = true;
0453     }
0454 
0455     if (isCDATA != nullptr) {
0456         *isCDATA = d->itemDescriptionIsCDATA;
0457     }
0458     if (containsMarkup != nullptr) {
0459         *containsMarkup = d->itemDescriptionContainsMarkup;
0460     }
0461 }
0462 
0463 bool Document::accept(DocumentVisitor *visitor)
0464 {
0465     return visitor->visitRSS2Document(this);
0466 }
0467 
0468 } // namespace RSS2
0469 } // namespace Syndication