File indexing completed on 2024-04-14 03:58:30

0001 /*
0002     This file is part of the syndication library
0003     SPDX-FileCopyrightText: 2019 Laurent Montel <montel@kde.org>
0004 
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0007 
0008 #include "loaderutil_p.h"
0009 #include <QDebug>
0010 #include <QRegularExpression>
0011 
0012 //#define DEBUG_PARSING_FEED
0013 #ifdef DEBUG_PARSING_FEED
0014 #include <QFile>
0015 #include <QTextStream>
0016 #endif
0017 QUrl Syndication::LoaderUtil::parseFeed(const QByteArray &data, const QUrl &url)
0018 {
0019 #ifdef DEBUG_PARSING_FEED
0020     qDebug() << " QUrl Syndication::LoaderUtil::parseFeed(const QByteArray &data, const QUrl &url)";
0021     QFile headerFile(QStringLiteral("/tmp/bb.txt"));
0022     headerFile.open(QIODevice::WriteOnly | QIODevice::Text);
0023     QTextStream outHeaderStream(&headerFile);
0024     outHeaderStream << data;
0025     headerFile.close();
0026 #endif
0027     QUrl discoveredFeedURL;
0028     QString str = QString::fromLatin1(data.constData()).simplified();
0029     QString s2;
0030     // QTextStream ts( &str, QIODevice::WriteOnly );
0031     // ts << data.data();
0032 
0033     // "<[\\s]link[^>]*rel[\\s]=[\\s]\\\"[\\s]alternate[\\s]\\\"[^>]*>"
0034     // "type[\\s]=[\\s]\\\"application/rss+xml\\\""
0035     // "href[\\s]=[\\s]\\\"application/rss+xml\\\""
0036 
0037     QRegularExpression rx(QStringLiteral("(?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)[^sAa]*"
0038                                          "[\\s]*type[^=]*=\"application/rss\\+xml\"[^s][^s](?:[^>]*)"
0039                                          "[\\s]*[\\s]*[^s]*(?:HREF)[^=]*=[^A-Z0-9-_~,./$]*([^'\">\\s]*)"),
0040                           QRegularExpression::CaseInsensitiveOption);
0041     QRegularExpressionMatch match;
0042     if ((match = rx.match(str)).hasMatch()) {
0043         s2 = match.captured(1);
0044     } else {
0045         const QRegularExpression rx2(QStringLiteral("(?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)"
0046                                                     "[\\s]*[^s][^s](?:[^>]*)(?:HREF)[^=]*=[^A-Z0-9-_~,./$]*([^'\">\\s]*)"),
0047                                      QRegularExpression::CaseInsensitiveOption);
0048         if ((match = rx2.match(str)).hasMatch()) {
0049             s2 = match.captured(1);
0050         } else {
0051             // does not support Atom/RSS autodiscovery.. try finding feeds by brute force....
0052             QStringList feeds;
0053             QString host = url.host();
0054             rx.setPattern(QStringLiteral("(?:<A )[^H]*(?:HREF)[^=]*=[^A-Z0-9-_~,./]*([^'\">\\s]*)"));
0055             QRegularExpressionMatchIterator iter = rx.globalMatch(str);
0056             while (iter.hasNext()) {
0057                 match = iter.next();
0058                 s2 = match.captured(1);
0059                 if (s2.endsWith(QLatin1String(".rdf")) //
0060                     || s2.endsWith(QLatin1String(".rss")) //
0061                     || s2.endsWith(QLatin1String(".xml"))) {
0062                     feeds.append(s2);
0063                 }
0064             }
0065 
0066             // Prefer feeds on same host
0067             auto it = std::find_if(feeds.cbegin(), feeds.cend(), [&host](const QString &s) {
0068                 return QUrl(s).host() == host;
0069             });
0070             if (it != feeds.cend()) {
0071                 s2 = *it;
0072             }
0073         }
0074     }
0075 
0076     if (s2.isNull()) {
0077         return discoveredFeedURL;
0078     }
0079 
0080     if (QUrl(s2).isRelative()) {
0081         if (s2.startsWith(QLatin1String("//"))) {
0082             s2.prepend(url.scheme() + QLatin1Char(':'));
0083             discoveredFeedURL = QUrl(s2);
0084         } else if (s2.startsWith(QLatin1Char('/'))) {
0085             discoveredFeedURL = url;
0086             discoveredFeedURL.setPath(s2);
0087         } else {
0088             discoveredFeedURL = url;
0089             discoveredFeedURL.setPath(discoveredFeedURL.path() + QLatin1Char('/') + s2);
0090         }
0091     } else {
0092         discoveredFeedURL = QUrl(s2);
0093     }
0094 
0095     return discoveredFeedURL;
0096 }