File indexing completed on 2025-04-27 06:57:32
0001 /* 0002 This file is part of the syndication library 0003 SPDX-FileCopyrightText: 2019 Laurent Montel <montel@kde.org> 0004 0005 SPDX-License-Identifier: LGPL-2.0-or-later 0006 */ 0007 0008 #include "loaderutil_p.h" 0009 #include <QDebug> 0010 #include <QRegularExpression> 0011 0012 //#define DEBUG_PARSING_FEED 0013 #ifdef DEBUG_PARSING_FEED 0014 #include <QFile> 0015 #include <QTextStream> 0016 #endif 0017 QUrl Syndication::LoaderUtil::parseFeed(const QByteArray &data, const QUrl &url) 0018 { 0019 #ifdef DEBUG_PARSING_FEED 0020 qDebug() << " QUrl Syndication::LoaderUtil::parseFeed(const QByteArray &data, const QUrl &url)"; 0021 QFile headerFile(QStringLiteral("/tmp/bb.txt")); 0022 headerFile.open(QIODevice::WriteOnly | QIODevice::Text); 0023 QTextStream outHeaderStream(&headerFile); 0024 outHeaderStream << data; 0025 headerFile.close(); 0026 #endif 0027 QUrl discoveredFeedURL; 0028 QString str = QString::fromLatin1(data.constData()).simplified(); 0029 QString s2; 0030 // QTextStream ts( &str, QIODevice::WriteOnly ); 0031 // ts << data.data(); 0032 0033 // "<[\\s]link[^>]*rel[\\s]=[\\s]\\\"[\\s]alternate[\\s]\\\"[^>]*>" 0034 // "type[\\s]=[\\s]\\\"application/rss+xml\\\"" 0035 // "href[\\s]=[\\s]\\\"application/rss+xml\\\"" 0036 0037 QRegularExpression rx(QStringLiteral("(?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)[^sAa]*" 0038 "[\\s]*type[^=]*=\"application/rss\\+xml\"[^s][^s](?:[^>]*)" 0039 "[\\s]*[\\s]*[^s]*(?:HREF)[^=]*=[^A-Z0-9-_~,./$]*([^'\">\\s]*)"), 0040 QRegularExpression::CaseInsensitiveOption); 0041 QRegularExpressionMatch match; 0042 if ((match = rx.match(str)).hasMatch()) { 0043 s2 = match.captured(1); 0044 } else { 0045 const QRegularExpression rx2(QStringLiteral("(?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)" 0046 "[\\s]*[^s][^s](?:[^>]*)(?:HREF)[^=]*=[^A-Z0-9-_~,./$]*([^'\">\\s]*)"), 0047 QRegularExpression::CaseInsensitiveOption); 0048 if ((match = rx2.match(str)).hasMatch()) { 0049 s2 = match.captured(1); 0050 } else { 0051 // does not support Atom/RSS autodiscovery.. try finding feeds by brute force.... 0052 QStringList feeds; 0053 QString host = url.host(); 0054 rx.setPattern(QStringLiteral("(?:<A )[^H]*(?:HREF)[^=]*=[^A-Z0-9-_~,./]*([^'\">\\s]*)")); 0055 QRegularExpressionMatchIterator iter = rx.globalMatch(str); 0056 while (iter.hasNext()) { 0057 match = iter.next(); 0058 s2 = match.captured(1); 0059 if (s2.endsWith(QLatin1String(".rdf")) // 0060 || s2.endsWith(QLatin1String(".rss")) // 0061 || s2.endsWith(QLatin1String(".xml"))) { 0062 feeds.append(s2); 0063 } 0064 } 0065 0066 // Prefer feeds on same host 0067 auto it = std::find_if(feeds.cbegin(), feeds.cend(), [&host](const QString &s) { 0068 return QUrl(s).host() == host; 0069 }); 0070 if (it != feeds.cend()) { 0071 s2 = *it; 0072 } 0073 } 0074 } 0075 0076 if (s2.isNull()) { 0077 return discoveredFeedURL; 0078 } 0079 0080 if (QUrl(s2).isRelative()) { 0081 if (s2.startsWith(QLatin1String("//"))) { 0082 s2.prepend(url.scheme() + QLatin1Char(':')); 0083 discoveredFeedURL = QUrl(s2); 0084 } else if (s2.startsWith(QLatin1Char('/'))) { 0085 discoveredFeedURL = url; 0086 discoveredFeedURL.setPath(s2); 0087 } else { 0088 discoveredFeedURL = url; 0089 discoveredFeedURL.setPath(discoveredFeedURL.path() + QLatin1Char('/') + s2); 0090 } 0091 } else { 0092 discoveredFeedURL = QUrl(s2); 0093 } 0094 0095 return discoveredFeedURL; 0096 }