File indexing completed on 2025-01-05 04:35:08

0001 /* ============================================================
0002 * TLDExtractor, a simple Qt interface to extract TLD part of a host
0003 * Copyright (C) 2014  Razi Alavizadeh <s.r.alavizadeh@gmail.com>
0004 *
0005 * This program is free software: you can redistribute it and/or modify
0006 * it under the terms of the GNU General Public License as published by
0007 * the Free Software Foundation, either version 3 of the License, or
0008 * (at your option) any later version.
0009 *
0010 * This program is distributed in the hope that it will be useful,
0011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
0012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0013 * GNU General Public License for more details.
0014 *
0015 * You should have received a copy of the GNU General Public License
0016 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
0017 * ============================================================ */
0018 #include "tldextractor.h"
0019 
0020 #include <QApplication>
0021 #include <QDebug>
0022 #include <QFileInfo>
0023 #include <QMessageBox>
0024 #include <QUrl>
0025 #include <QRegExp>
0026 
0027 TLDExtractor* TLDExtractor::s_instance = nullptr;
0028 
0029 TLDExtractor::TLDExtractor(QObject* parent)
0030     : QObject(parent)
0031 {
0032     setDataSearchPaths();
0033 }
0034 
0035 QStringList TLDExtractor::defaultDataSearchPaths()
0036 {
0037     return QStringList() << QLatin1String(":/tldextractor/data");
0038 }
0039 
0040 TLDExtractor* TLDExtractor::instance()
0041 {
0042     if(s_instance == nullptr)
0043     {
0044         s_instance = new TLDExtractor(qApp);
0045     }
0046 
0047     return s_instance;
0048 }
0049 
0050 TLDExtractor::~TLDExtractor()
0051 {
0052     s_instance = nullptr;
0053 }
0054 
0055 bool TLDExtractor::isDataLoaded()
0056 {
0057     return !m_tldHash.isEmpty();
0058 }
0059 
0060 QString TLDExtractor::TLD(const QString &host)
0061 {
0062     if (host.isEmpty() || host.startsWith(QLatin1Char('.'))) {
0063         return {};
0064     }
0065 
0066     QString cleanHost = normalizedHost(host);
0067 
0068     QString tldPart = cleanHost.mid(cleanHost.lastIndexOf(QLatin1Char('.')) + 1);
0069     cleanHost = QString::fromUtf8(QUrl::toAce(cleanHost));
0070 
0071     loadData();
0072 
0073     if (!m_tldHash.contains(tldPart)) {
0074         return tldPart;
0075     }
0076 
0077     QStringList tldRules = m_tldHash.values(tldPart);
0078 
0079     if (!tldRules.contains(tldPart)) {
0080         tldRules << tldPart;
0081     }
0082 
0083     int maxLabelCount = 0;
0084     bool isExceptionTLD = false;
0085     bool isWildcardTLD = false;
0086 
0087     for (QString rule : std::as_const(tldRules)) {
0088         const int labelCount = rule.count(QLatin1Char('.')) + 1;
0089 
0090         if (rule.startsWith(QLatin1Char('!'))) {
0091             rule.remove(0, 1);
0092 
0093             rule = QString::fromUtf8(QUrl::toAce(rule));
0094             isExceptionTLD = true;
0095 
0096             // matches with exception TLD
0097             if (cleanHost.endsWith(rule)) {
0098                 tldPart = rule.mid(rule.indexOf(QLatin1Char('.')) + 1);
0099                 break;
0100             }
0101         }
0102         else {
0103             isExceptionTLD = false;
0104         }
0105 
0106         if (rule.startsWith(QLatin1Char('*'))) {
0107             rule.remove(0, 1);
0108 
0109             if (rule.startsWith(QLatin1Char('.'))) {
0110                 rule.remove(0, 1);
0111             }
0112 
0113             isWildcardTLD = true;
0114         }
0115         else {
0116             isWildcardTLD = false;
0117         }
0118 
0119         Q_UNUSED(isExceptionTLD)
0120 
0121         rule = QString::fromUtf8(QUrl::toAce(rule));
0122         const QString testRule = QLatin1Char('.') + rule;
0123         const QString testUrl = QLatin1Char('.') + cleanHost;
0124 
0125         if (labelCount > maxLabelCount && testUrl.endsWith(testRule)) {
0126             tldPart = rule;
0127             maxLabelCount = labelCount;
0128 
0129             if (isWildcardTLD) {
0130                 QString temp = cleanHost;
0131                 temp.remove(temp.lastIndexOf(tldPart), tldPart.size());
0132 
0133                 if (temp.endsWith(QLatin1Char('.'))) {
0134                     temp.remove(temp.size() - 1, 1);
0135                 }
0136 
0137                 temp = temp.mid(temp.lastIndexOf(QLatin1Char('.')) + 1);
0138 
0139                 tldPart = temp.isEmpty() ? rule : (temp + QLatin1Char('.') + rule);
0140             }
0141         }
0142     }
0143 
0144     QString temp = normalizedHost(host);
0145     tldPart = temp.section(QLatin1Char('.'), temp.count(QLatin1Char('.')) - tldPart.count(QLatin1Char('.')));
0146 
0147     return tldPart;
0148 }
0149 
0150 QString TLDExtractor::domain(const QString &host)
0151 {
0152     const QString tldPart = TLD(host);
0153 
0154     return domainHelper(host, tldPart);
0155 }
0156 
0157 QString TLDExtractor::domainHelper(const QString &host, const QString &tldPart)
0158 {
0159     if (host.isEmpty() || tldPart.isEmpty()) {
0160         return {};
0161     }
0162 
0163     QString temp = normalizedHost(host);
0164     temp.remove(temp.lastIndexOf(tldPart), tldPart.size());
0165 
0166     if (temp.endsWith(QLatin1Char('.'))) {
0167         temp.remove(temp.size() - 1, 1);
0168     }
0169 
0170     return temp.mid(temp.lastIndexOf(QLatin1Char('.')) + 1);
0171 }
0172 
0173 QString TLDExtractor::registrableDomainHelper(const QString &domainPart, const QString &tldPart)
0174 {
0175     if (tldPart.isEmpty() || domainPart.isEmpty()) {
0176         return {};
0177     }
0178     else {
0179         return QStringLiteral("%1.%2").arg(domainPart, tldPart);
0180     }
0181 }
0182 
0183 QString TLDExtractor::subdomainHelper(const QString &host, const QString &registrablePart)
0184 {
0185     if (!registrablePart.isEmpty()) {
0186         QString subdomain = normalizedHost(host);
0187 
0188         subdomain.remove(subdomain.lastIndexOf(registrablePart), registrablePart.size());
0189 
0190         if (subdomain.endsWith(QLatin1Char('.'))) {
0191             subdomain.remove(subdomain.size() - 1, 1);
0192         }
0193 
0194         return subdomain;
0195     }
0196 
0197     return {};
0198 }
0199 
0200 QString TLDExtractor::registrableDomain(const QString &host)
0201 {
0202     const QString tldPart = TLD(host);
0203 
0204     return registrableDomainHelper(domainHelper(host, tldPart), tldPart);
0205 }
0206 
0207 QString TLDExtractor::subdomain(const QString &host)
0208 {
0209     return subdomainHelper(host, registrableDomain(host));
0210 }
0211 
0212 // a light function that extract all parts with just one call to TLD()
0213 TLDExtractor::HostParts TLDExtractor::splitParts(const QString &host)
0214 {
0215     HostParts hostParts;
0216 
0217     hostParts.host = host;
0218     hostParts.tld = TLD(host);
0219     hostParts.domain = domainHelper(host, hostParts.tld);
0220     hostParts.registrableDomain = registrableDomainHelper(hostParts.domain, hostParts.tld);
0221     hostParts.subdomain = subdomainHelper(host, hostParts.registrableDomain);
0222 
0223     return hostParts;
0224 }
0225 
0226 QStringList TLDExtractor::dataSearchPaths() const
0227 {
0228     return m_dataSearchPaths;
0229 }
0230 
0231 void TLDExtractor::setDataSearchPaths(const QStringList &searchPaths)
0232 {
0233     m_dataSearchPaths = searchPaths;
0234 
0235     m_dataSearchPaths << TLDExtractor::defaultDataSearchPaths();
0236 
0237     m_dataSearchPaths.removeDuplicates();
0238 }
0239 
0240 void TLDExtractor::loadData()
0241 {
0242     if (isDataLoaded()) {
0243         return;
0244     }
0245 
0246     QString dataFileName;
0247     bool parsedDataFileExist = false;
0248 
0249     for (const QString &path : std::as_const(m_dataSearchPaths)) {
0250         dataFileName = QFileInfo(path + QLatin1String("/effective_tld_names.dat")).absoluteFilePath();
0251 
0252         if (QFileInfo(dataFileName).exists()) {
0253             parsedDataFileExist = true;
0254             break;
0255         }
0256     }
0257 
0258 
0259     if (!parsedDataFileExist) {
0260         const QString tldDataFileDownloadLink = QLatin1String("http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1");
0261         QMessageBox::information(nullptr, tr("File not found!"),
0262                                  tr("File \'effective_tld_names.dat\' was not found!\n"
0263                                     "You can download it from \'<a href=\"%1\"><b>here</b></a>\' to one of the following paths:\n%2")
0264                                  .arg(tldDataFileDownloadLink, m_dataSearchPaths.join(QStringLiteral("\n"))));
0265 
0266         return;
0267     }
0268 
0269     m_dataFileName = dataFileName;
0270 
0271     if (!parseData(dataFileName)) {
0272         qWarning() << "TLDExtractor: There is some parse errors for file:" << dataFileName;
0273     }
0274 }
0275 
0276 bool TLDExtractor::parseData(const QString &dataFile, bool loadPrivateDomains)
0277 {
0278     m_tldHash.clear();
0279 
0280     QFile file(dataFile);
0281 
0282     if (!file.open(QFile::ReadOnly | QFile::Text)) {
0283         return false;
0284     }
0285 
0286     bool seekToEndOfPrivateDomains = false;
0287 
0288     while (!file.atEnd()) {
0289         QString line = QString::fromUtf8(file.readLine().constData()).simplified();
0290 
0291         if (line.isEmpty()) {
0292             continue;
0293         }
0294 
0295         if (line.startsWith(QLatin1Char('.'))) {
0296             line.remove(0, 1);
0297         }
0298 
0299 
0300         if (line.startsWith(QLatin1String("//"))) {
0301             if (line.contains(QLatin1String("===END PRIVATE DOMAINS==="))) {
0302                 seekToEndOfPrivateDomains = false;
0303             }
0304 
0305             if (!loadPrivateDomains && line.contains(QLatin1String("===BEGIN PRIVATE DOMAINS==="))) {
0306                 if (m_tldHash.isEmpty()) {
0307                     seekToEndOfPrivateDomains = true;
0308                 }
0309                 else {
0310                     break;
0311                 }
0312             }
0313 
0314             continue;
0315         }
0316 
0317         if (seekToEndOfPrivateDomains) {
0318             continue;
0319         }
0320 
0321         // Each line is only read up to the first whitespace
0322         line = line.left(line.indexOf(QLatin1Char(' ')));
0323 
0324         if (!line.contains(QLatin1Char('.'))) {
0325             m_tldHash.insert(line, line);
0326         }
0327         else {
0328             QString key = line.mid(line.lastIndexOf(QLatin1Char('.')) + 1);
0329 
0330             m_tldHash.insert(key, line);
0331         }
0332     }
0333 
0334     return isDataLoaded();
0335 }
0336 
0337 QString TLDExtractor::normalizedHost(const QString &host) const
0338 {
0339     return host.toLower();
0340 }
0341 
0342 // methods for testing
0343 bool TLDExtractor::test()
0344 {
0345     if (!parseData(m_dataFileName, true)) {
0346         return false;
0347     }
0348 
0349     QString testDataFileName;
0350     bool testDataFileExist = false;
0351 
0352     for (const QString &path : std::as_const(m_dataSearchPaths)) {
0353         testDataFileName = QFileInfo(path + QLatin1String("/test_psl.txt")).absoluteFilePath();
0354 
0355         if (QFileInfo(testDataFileName).exists()) {
0356             testDataFileExist = true;
0357             break;
0358         }
0359     }
0360 
0361     if (!testDataFileExist) {
0362         const QString testFileDownloadLink = QLatin1String("http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1");
0363 
0364         QMessageBox::information(nullptr, tr("File not found!"),
0365                                  tr("File \'test_psl.txt\' was not found!\n"
0366                                     "You can download it from \'<a href=\"%1\"><b>here</b></a>\' to one of the following paths:\n%2")
0367                                  .arg(testFileDownloadLink, m_dataSearchPaths.join(QStringLiteral("\n"))));
0368 
0369         return false;
0370     }
0371 
0372     QFile file(testDataFileName);
0373 
0374     if (!file.open(QFile::ReadOnly | QFile::Text)) {
0375         return false;
0376     }
0377 
0378     QRegExp testRegExp(QStringLiteral("checkPublicSuffix\\(('([^']+)'|null), ('([^']+)'|null)\\);"));
0379     bool allTestSuccess = true;
0380 
0381     while (!file.atEnd()) {
0382         QString line = QString::fromUtf8(file.readLine().constData()).simplified();
0383 
0384         if (line.startsWith(QLatin1String("//")) || line.isEmpty()) {
0385             continue;
0386         }
0387 
0388         testRegExp.indexIn(line);
0389 
0390         const QString hostName = testRegExp.cap(2);
0391         const QString registrableName = testRegExp.cap(4);
0392 
0393         if (!checkPublicSuffix(hostName, registrableName)) {
0394             allTestSuccess = false;
0395         }
0396     }
0397 
0398     if (allTestSuccess) {
0399         qWarning() << "TLDExtractor: Test passed successfully.";
0400     }
0401     else {
0402         qWarning() << "TLDExtractor: Test finished with some errors!";
0403     }
0404 
0405     // reset cache for normal use
0406     m_tldHash.clear();
0407 
0408     return allTestSuccess;
0409 }
0410 
0411 bool TLDExtractor::checkPublicSuffix(const QString &hostName, const QString &registrableName)
0412 {
0413     if (registrableDomain(hostName) != registrableName) {
0414         qWarning() << "TLDExtractor Test Error: hostName:" << hostName
0415                    << "Correct registrableName:" << registrableName
0416                    << "Wrong registrableName:" << registrableDomain(hostName);
0417 
0418         return false;
0419     }
0420 
0421     return true;
0422 }