File indexing completed on 2025-01-05 04:35:08
0001 /* ============================================================ 0002 * TLDExtractor, a simple Qt interface to extract TLD part of a host 0003 * Copyright (C) 2014 Razi Alavizadeh <s.r.alavizadeh@gmail.com> 0004 * 0005 * This program is free software: you can redistribute it and/or modify 0006 * it under the terms of the GNU General Public License as published by 0007 * the Free Software Foundation, either version 3 of the License, or 0008 * (at your option) any later version. 0009 * 0010 * This program is distributed in the hope that it will be useful, 0011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 0012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 0013 * GNU General Public License for more details. 0014 * 0015 * You should have received a copy of the GNU General Public License 0016 * along with this program. If not, see <http://www.gnu.org/licenses/>. 0017 * ============================================================ */ 0018 #include "tldextractor.h" 0019 0020 #include <QApplication> 0021 #include <QDebug> 0022 #include <QFileInfo> 0023 #include <QMessageBox> 0024 #include <QUrl> 0025 #include <QRegExp> 0026 0027 TLDExtractor* TLDExtractor::s_instance = nullptr; 0028 0029 TLDExtractor::TLDExtractor(QObject* parent) 0030 : QObject(parent) 0031 { 0032 setDataSearchPaths(); 0033 } 0034 0035 QStringList TLDExtractor::defaultDataSearchPaths() 0036 { 0037 return QStringList() << QLatin1String(":/tldextractor/data"); 0038 } 0039 0040 TLDExtractor* TLDExtractor::instance() 0041 { 0042 if(s_instance == nullptr) 0043 { 0044 s_instance = new TLDExtractor(qApp); 0045 } 0046 0047 return s_instance; 0048 } 0049 0050 TLDExtractor::~TLDExtractor() 0051 { 0052 s_instance = nullptr; 0053 } 0054 0055 bool TLDExtractor::isDataLoaded() 0056 { 0057 return !m_tldHash.isEmpty(); 0058 } 0059 0060 QString TLDExtractor::TLD(const QString &host) 0061 { 0062 if (host.isEmpty() || host.startsWith(QLatin1Char('.'))) { 0063 return {}; 0064 } 0065 0066 QString cleanHost = normalizedHost(host); 0067 0068 QString tldPart = cleanHost.mid(cleanHost.lastIndexOf(QLatin1Char('.')) + 1); 0069 cleanHost = QString::fromUtf8(QUrl::toAce(cleanHost)); 0070 0071 loadData(); 0072 0073 if (!m_tldHash.contains(tldPart)) { 0074 return tldPart; 0075 } 0076 0077 QStringList tldRules = m_tldHash.values(tldPart); 0078 0079 if (!tldRules.contains(tldPart)) { 0080 tldRules << tldPart; 0081 } 0082 0083 int maxLabelCount = 0; 0084 bool isExceptionTLD = false; 0085 bool isWildcardTLD = false; 0086 0087 for (QString rule : std::as_const(tldRules)) { 0088 const int labelCount = rule.count(QLatin1Char('.')) + 1; 0089 0090 if (rule.startsWith(QLatin1Char('!'))) { 0091 rule.remove(0, 1); 0092 0093 rule = QString::fromUtf8(QUrl::toAce(rule)); 0094 isExceptionTLD = true; 0095 0096 // matches with exception TLD 0097 if (cleanHost.endsWith(rule)) { 0098 tldPart = rule.mid(rule.indexOf(QLatin1Char('.')) + 1); 0099 break; 0100 } 0101 } 0102 else { 0103 isExceptionTLD = false; 0104 } 0105 0106 if (rule.startsWith(QLatin1Char('*'))) { 0107 rule.remove(0, 1); 0108 0109 if (rule.startsWith(QLatin1Char('.'))) { 0110 rule.remove(0, 1); 0111 } 0112 0113 isWildcardTLD = true; 0114 } 0115 else { 0116 isWildcardTLD = false; 0117 } 0118 0119 Q_UNUSED(isExceptionTLD) 0120 0121 rule = QString::fromUtf8(QUrl::toAce(rule)); 0122 const QString testRule = QLatin1Char('.') + rule; 0123 const QString testUrl = QLatin1Char('.') + cleanHost; 0124 0125 if (labelCount > maxLabelCount && testUrl.endsWith(testRule)) { 0126 tldPart = rule; 0127 maxLabelCount = labelCount; 0128 0129 if (isWildcardTLD) { 0130 QString temp = cleanHost; 0131 temp.remove(temp.lastIndexOf(tldPart), tldPart.size()); 0132 0133 if (temp.endsWith(QLatin1Char('.'))) { 0134 temp.remove(temp.size() - 1, 1); 0135 } 0136 0137 temp = temp.mid(temp.lastIndexOf(QLatin1Char('.')) + 1); 0138 0139 tldPart = temp.isEmpty() ? rule : (temp + QLatin1Char('.') + rule); 0140 } 0141 } 0142 } 0143 0144 QString temp = normalizedHost(host); 0145 tldPart = temp.section(QLatin1Char('.'), temp.count(QLatin1Char('.')) - tldPart.count(QLatin1Char('.'))); 0146 0147 return tldPart; 0148 } 0149 0150 QString TLDExtractor::domain(const QString &host) 0151 { 0152 const QString tldPart = TLD(host); 0153 0154 return domainHelper(host, tldPart); 0155 } 0156 0157 QString TLDExtractor::domainHelper(const QString &host, const QString &tldPart) 0158 { 0159 if (host.isEmpty() || tldPart.isEmpty()) { 0160 return {}; 0161 } 0162 0163 QString temp = normalizedHost(host); 0164 temp.remove(temp.lastIndexOf(tldPart), tldPart.size()); 0165 0166 if (temp.endsWith(QLatin1Char('.'))) { 0167 temp.remove(temp.size() - 1, 1); 0168 } 0169 0170 return temp.mid(temp.lastIndexOf(QLatin1Char('.')) + 1); 0171 } 0172 0173 QString TLDExtractor::registrableDomainHelper(const QString &domainPart, const QString &tldPart) 0174 { 0175 if (tldPart.isEmpty() || domainPart.isEmpty()) { 0176 return {}; 0177 } 0178 else { 0179 return QStringLiteral("%1.%2").arg(domainPart, tldPart); 0180 } 0181 } 0182 0183 QString TLDExtractor::subdomainHelper(const QString &host, const QString ®istrablePart) 0184 { 0185 if (!registrablePart.isEmpty()) { 0186 QString subdomain = normalizedHost(host); 0187 0188 subdomain.remove(subdomain.lastIndexOf(registrablePart), registrablePart.size()); 0189 0190 if (subdomain.endsWith(QLatin1Char('.'))) { 0191 subdomain.remove(subdomain.size() - 1, 1); 0192 } 0193 0194 return subdomain; 0195 } 0196 0197 return {}; 0198 } 0199 0200 QString TLDExtractor::registrableDomain(const QString &host) 0201 { 0202 const QString tldPart = TLD(host); 0203 0204 return registrableDomainHelper(domainHelper(host, tldPart), tldPart); 0205 } 0206 0207 QString TLDExtractor::subdomain(const QString &host) 0208 { 0209 return subdomainHelper(host, registrableDomain(host)); 0210 } 0211 0212 // a light function that extract all parts with just one call to TLD() 0213 TLDExtractor::HostParts TLDExtractor::splitParts(const QString &host) 0214 { 0215 HostParts hostParts; 0216 0217 hostParts.host = host; 0218 hostParts.tld = TLD(host); 0219 hostParts.domain = domainHelper(host, hostParts.tld); 0220 hostParts.registrableDomain = registrableDomainHelper(hostParts.domain, hostParts.tld); 0221 hostParts.subdomain = subdomainHelper(host, hostParts.registrableDomain); 0222 0223 return hostParts; 0224 } 0225 0226 QStringList TLDExtractor::dataSearchPaths() const 0227 { 0228 return m_dataSearchPaths; 0229 } 0230 0231 void TLDExtractor::setDataSearchPaths(const QStringList &searchPaths) 0232 { 0233 m_dataSearchPaths = searchPaths; 0234 0235 m_dataSearchPaths << TLDExtractor::defaultDataSearchPaths(); 0236 0237 m_dataSearchPaths.removeDuplicates(); 0238 } 0239 0240 void TLDExtractor::loadData() 0241 { 0242 if (isDataLoaded()) { 0243 return; 0244 } 0245 0246 QString dataFileName; 0247 bool parsedDataFileExist = false; 0248 0249 for (const QString &path : std::as_const(m_dataSearchPaths)) { 0250 dataFileName = QFileInfo(path + QLatin1String("/effective_tld_names.dat")).absoluteFilePath(); 0251 0252 if (QFileInfo(dataFileName).exists()) { 0253 parsedDataFileExist = true; 0254 break; 0255 } 0256 } 0257 0258 0259 if (!parsedDataFileExist) { 0260 const QString tldDataFileDownloadLink = QLatin1String("http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1"); 0261 QMessageBox::information(nullptr, tr("File not found!"), 0262 tr("File \'effective_tld_names.dat\' was not found!\n" 0263 "You can download it from \'<a href=\"%1\"><b>here</b></a>\' to one of the following paths:\n%2") 0264 .arg(tldDataFileDownloadLink, m_dataSearchPaths.join(QStringLiteral("\n")))); 0265 0266 return; 0267 } 0268 0269 m_dataFileName = dataFileName; 0270 0271 if (!parseData(dataFileName)) { 0272 qWarning() << "TLDExtractor: There is some parse errors for file:" << dataFileName; 0273 } 0274 } 0275 0276 bool TLDExtractor::parseData(const QString &dataFile, bool loadPrivateDomains) 0277 { 0278 m_tldHash.clear(); 0279 0280 QFile file(dataFile); 0281 0282 if (!file.open(QFile::ReadOnly | QFile::Text)) { 0283 return false; 0284 } 0285 0286 bool seekToEndOfPrivateDomains = false; 0287 0288 while (!file.atEnd()) { 0289 QString line = QString::fromUtf8(file.readLine().constData()).simplified(); 0290 0291 if (line.isEmpty()) { 0292 continue; 0293 } 0294 0295 if (line.startsWith(QLatin1Char('.'))) { 0296 line.remove(0, 1); 0297 } 0298 0299 0300 if (line.startsWith(QLatin1String("//"))) { 0301 if (line.contains(QLatin1String("===END PRIVATE DOMAINS==="))) { 0302 seekToEndOfPrivateDomains = false; 0303 } 0304 0305 if (!loadPrivateDomains && line.contains(QLatin1String("===BEGIN PRIVATE DOMAINS==="))) { 0306 if (m_tldHash.isEmpty()) { 0307 seekToEndOfPrivateDomains = true; 0308 } 0309 else { 0310 break; 0311 } 0312 } 0313 0314 continue; 0315 } 0316 0317 if (seekToEndOfPrivateDomains) { 0318 continue; 0319 } 0320 0321 // Each line is only read up to the first whitespace 0322 line = line.left(line.indexOf(QLatin1Char(' '))); 0323 0324 if (!line.contains(QLatin1Char('.'))) { 0325 m_tldHash.insert(line, line); 0326 } 0327 else { 0328 QString key = line.mid(line.lastIndexOf(QLatin1Char('.')) + 1); 0329 0330 m_tldHash.insert(key, line); 0331 } 0332 } 0333 0334 return isDataLoaded(); 0335 } 0336 0337 QString TLDExtractor::normalizedHost(const QString &host) const 0338 { 0339 return host.toLower(); 0340 } 0341 0342 // methods for testing 0343 bool TLDExtractor::test() 0344 { 0345 if (!parseData(m_dataFileName, true)) { 0346 return false; 0347 } 0348 0349 QString testDataFileName; 0350 bool testDataFileExist = false; 0351 0352 for (const QString &path : std::as_const(m_dataSearchPaths)) { 0353 testDataFileName = QFileInfo(path + QLatin1String("/test_psl.txt")).absoluteFilePath(); 0354 0355 if (QFileInfo(testDataFileName).exists()) { 0356 testDataFileExist = true; 0357 break; 0358 } 0359 } 0360 0361 if (!testDataFileExist) { 0362 const QString testFileDownloadLink = QLatin1String("http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1"); 0363 0364 QMessageBox::information(nullptr, tr("File not found!"), 0365 tr("File \'test_psl.txt\' was not found!\n" 0366 "You can download it from \'<a href=\"%1\"><b>here</b></a>\' to one of the following paths:\n%2") 0367 .arg(testFileDownloadLink, m_dataSearchPaths.join(QStringLiteral("\n")))); 0368 0369 return false; 0370 } 0371 0372 QFile file(testDataFileName); 0373 0374 if (!file.open(QFile::ReadOnly | QFile::Text)) { 0375 return false; 0376 } 0377 0378 QRegExp testRegExp(QStringLiteral("checkPublicSuffix\\(('([^']+)'|null), ('([^']+)'|null)\\);")); 0379 bool allTestSuccess = true; 0380 0381 while (!file.atEnd()) { 0382 QString line = QString::fromUtf8(file.readLine().constData()).simplified(); 0383 0384 if (line.startsWith(QLatin1String("//")) || line.isEmpty()) { 0385 continue; 0386 } 0387 0388 testRegExp.indexIn(line); 0389 0390 const QString hostName = testRegExp.cap(2); 0391 const QString registrableName = testRegExp.cap(4); 0392 0393 if (!checkPublicSuffix(hostName, registrableName)) { 0394 allTestSuccess = false; 0395 } 0396 } 0397 0398 if (allTestSuccess) { 0399 qWarning() << "TLDExtractor: Test passed successfully."; 0400 } 0401 else { 0402 qWarning() << "TLDExtractor: Test finished with some errors!"; 0403 } 0404 0405 // reset cache for normal use 0406 m_tldHash.clear(); 0407 0408 return allTestSuccess; 0409 } 0410 0411 bool TLDExtractor::checkPublicSuffix(const QString &hostName, const QString ®istrableName) 0412 { 0413 if (registrableDomain(hostName) != registrableName) { 0414 qWarning() << "TLDExtractor Test Error: hostName:" << hostName 0415 << "Correct registrableName:" << registrableName 0416 << "Wrong registrableName:" << registrableDomain(hostName); 0417 0418 return false; 0419 } 0420 0421 return true; 0422 }