File indexing completed on 2023-05-30 09:09:51
0001 /* This file is part of the KDE project 0002 0003 Copyright (C) 2005 Ivor Hewitt <ivor@kde.org> 0004 Copyright (C) 2008 Maksim Orlovich <maksim@kde.org> 0005 Copyright (C) 2008 Vyacheslav Tokarev <tsjoker@gmail.com> 0006 0007 This library is free software; you can redistribute it and/or 0008 modify it under the terms of the GNU Library General Public 0009 License as published by the Free Software Foundation; either 0010 version 2 of the License, or (at your option) any later version. 0011 0012 This library is distributed in the hope that it will be useful, 0013 but WITHOUT ANY WARRANTY; without even the implied warranty of 0014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0015 Library General Public License for more details. 0016 0017 You should have received a copy of the GNU Library General Public License 0018 along with this library; see the file COPYING.LIB. If not, write to 0019 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 0020 Boston, MA 02110-1301, USA. 0021 */ 0022 0023 #include "khtml_filter_p.h" 0024 #include "khtml_debug.h" 0025 0026 // rolling hash parameters 0027 #define HASH_P (1997) 0028 #define HASH_Q (17509) 0029 // HASH_MOD = (HASH_P^7) % HASH_Q 0030 #define HASH_MOD (523) 0031 0032 namespace khtml 0033 { 0034 0035 // We only want a subset of features of wildcards -- just the 0036 // star, so we escape the rest before passing to QRegExp. 0037 // The \ is escaped due to a QRegExp bug. 0038 // ### we really should rather parse it ourselves, in order to 0039 // handle adblock-special things like | and ^ properly. 0040 static QRegExp fromAdBlockWildcard(const QString &wcStr) 0041 { 0042 QRegExp rx; 0043 rx.setPatternSyntax(QRegExp::Wildcard); 0044 0045 QString out; 0046 for (int p = 0; p < wcStr.length(); ++p) { 0047 QChar c = wcStr[p]; 0048 if (c == QLatin1Char('?')) { 0049 out += QLatin1String("[?]"); 0050 } else if (c == QLatin1Char('[')) { 0051 out += QLatin1String("[[]"); 0052 } else if (c == QLatin1Char('\\')) { 0053 out += QLatin1String("[\\]"); 0054 } else { 0055 out += c; 0056 } 0057 } 0058 0059 rx.setPattern(out); 0060 return rx; 0061 } 0062 0063 void FilterSet::addFilter(const QString &filterStr) 0064 { 0065 QString filter = filterStr.trimmed(); 0066 if (filter.isEmpty()) { 0067 return; 0068 } 0069 0070 /** ignore special lines starting with "[", "!", or "#" or contain "#" (comments or features are not supported by KHTML's AdBlock */ 0071 const QChar firstChar = filter.at(0); 0072 if (firstChar == QLatin1Char('[') || firstChar == QLatin1Char('!') || firstChar == QLatin1Char('#') || filter.contains(QLatin1Char('#'))) { 0073 return; 0074 } 0075 0076 // Strip leading @@ 0077 if (filter.startsWith(QLatin1String("@@"))) { 0078 filter.remove(0, 2); 0079 } 0080 0081 // Strip options, we ignore them for now. 0082 bool hadOptions = false; 0083 int dollar = filter.lastIndexOf(QLatin1Char('$')); 0084 if (dollar != -1) { 0085 // Is it adblock's options delimiter or the special '$' char in a regular expression? 0086 if (!filter.startsWith(QLatin1Char('/')) || !filter.endsWith(QLatin1Char('/'))) { 0087 filter = filter.mid(0, dollar); 0088 hadOptions = true; 0089 } 0090 } 0091 0092 // Is it a regexp filter? 0093 if (filter.length() > 2 && filter.startsWith(QLatin1Char('/')) && filter.endsWith(QLatin1Char('/'))) { 0094 // Ignore regexp that had options as we do not support them 0095 if (hadOptions) { 0096 return; 0097 } 0098 QString inside = filter.mid(1, filter.length() - 2); 0099 QRegExp rx(inside); 0100 reFilters.append(rx); 0101 //qCDebug(KHTML_LOG) << "R:" << inside; 0102 return; 0103 } 0104 // Nope, a wildcard one. 0105 0106 // Disregard the rule if only one char is left or it contains unsupported adblock features ('|', "||", '^') 0107 if (filter.length() < 2 || filter.contains(QLatin1Char('|')) || filter.contains(QLatin1Char('^'))) { 0108 return; 0109 } 0110 0111 // Strip wildcards at the ends 0112 int first = 0; 0113 int last = filter.length() - 1; 0114 0115 while (first < filter.length() && filter.at(first) == QLatin1Char('*')) { 0116 ++first; 0117 } 0118 0119 while (last >= 0 && filter.at(last) == QLatin1Char('*')) { 0120 --last; 0121 } 0122 0123 if (first > last) { 0124 filter = QLatin1String("*"); // erm... Well, they asked for it. 0125 } else { 0126 filter = filter.mid(first, last - first + 1); 0127 } 0128 0129 // Now, do we still have any wildcard stuff left? 0130 int aPos = filter.indexOf('*'); 0131 if (aPos != -1) { 0132 // check if we can use RK first (and then check full RE for the rest) for better performance 0133 if (aPos > 7) { 0134 QRegExp rx = fromAdBlockWildcard(filter.mid(aPos) + QLatin1Char('*')); 0135 // We pad the final r.e. with * so we can check for an exact match 0136 stringFiltersMatcher.addWildedString(filter.mid(0, aPos), rx); 0137 } else { 0138 QRegExp rx = fromAdBlockWildcard(filter); 0139 reFilters.append(rx); 0140 } 0141 } else { 0142 // Fast path 0143 stringFiltersMatcher.addString(filter); 0144 } 0145 0146 } 0147 0148 bool FilterSet::isUrlMatched(const QString &url) 0149 { 0150 if (stringFiltersMatcher.isMatched(url)) { 0151 return true; 0152 } 0153 0154 for (int c = 0; c < reFilters.size(); ++c) { 0155 if (url.contains(reFilters[c])) { 0156 return true; 0157 } 0158 } 0159 0160 return false; 0161 } 0162 0163 QString FilterSet::urlMatchedBy(const QString &url) 0164 { 0165 QString by; 0166 0167 if (stringFiltersMatcher.isMatched(url, &by)) { 0168 return by; 0169 } 0170 0171 for (int c = 0; c < reFilters.size(); ++c) { 0172 if (url.contains(reFilters[c])) { 0173 by = reFilters[c].pattern(); 0174 break; 0175 } 0176 } 0177 0178 return by; 0179 } 0180 0181 void FilterSet::clear() 0182 { 0183 reFilters.clear(); 0184 stringFiltersMatcher.clear(); 0185 } 0186 0187 void StringsMatcher::addString(const QString &pattern) 0188 { 0189 if (pattern.length() < 8) { 0190 // handle short string differently 0191 shortStringFilters.append(pattern); 0192 } else { 0193 // use modified Rabin-Karp's algorithm with 8-length string hash 0194 // i.e. store hash of first 8 chars in the HashMap for fast look-up 0195 stringFilters.append(pattern); 0196 int ind = stringFilters.size() - 1; 0197 int current = 0; 0198 0199 // compute hash using rolling hash 0200 // hash for string: x0,x1,x2...xn-1 will be: 0201 // (p^(n-1)*x0 + p^(n-2)*x1 + ... + p * xn-2 + xn-1) % q 0202 // where p and q some wisely-chosen integers 0203 /*for (int k = 0; k < 8; ++k)*/ 0204 int len = pattern.length(); 0205 for (int k = len - 8; k < len; ++k) { 0206 current = (current * HASH_P + pattern[k].unicode()) % HASH_Q; 0207 } 0208 0209 // insert computed hash value into HashMap 0210 WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1); 0211 if (it == stringFiltersHash.end()) { 0212 QVector<int> list; 0213 list.append(ind); 0214 stringFiltersHash.add(current + 1, list); 0215 fastLookUp.setBit(current); 0216 } else { 0217 it->second.append(ind); 0218 } 0219 } 0220 } 0221 0222 void StringsMatcher::addWildedString(const QString &prefix, const QRegExp &rx) 0223 { 0224 rePrefixes.append(prefix); 0225 reFilters.append(rx); 0226 int index = -rePrefixes.size(); 0227 0228 int current = 0; 0229 for (int k = 0; k < 8; ++k) { 0230 current = (current * HASH_P + prefix[k].unicode()) % HASH_Q; 0231 } 0232 0233 // insert computed hash value into HashMap 0234 WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1); 0235 if (it == stringFiltersHash.end()) { 0236 QVector<int> list; 0237 list.append(index); 0238 stringFiltersHash.add(current + 1, list); 0239 fastLookUp.setBit(current); 0240 } else { 0241 it->second.append(index); 0242 } 0243 } 0244 0245 bool StringsMatcher::isMatched(const QString &str, QString *by) const 0246 { 0247 // check short strings first 0248 for (int i = 0; i < shortStringFilters.size(); ++i) { 0249 if (str.contains(shortStringFilters[i])) { 0250 if (by != nullptr) { 0251 *by = shortStringFilters[i]; 0252 } 0253 return true; 0254 } 0255 } 0256 0257 int len = str.length(); 0258 int k; 0259 0260 int current = 0; 0261 int next = 0; 0262 // compute hash for first 8 characters 0263 for (k = 0; k < 8 && k < len; ++k) { 0264 current = (current * HASH_P + str[k].unicode()) % HASH_Q; 0265 } 0266 0267 WTF::HashMap<int, QVector<int> >::const_iterator hashEnd = stringFiltersHash.end(); 0268 // main Rabin-Karp's algorithm loop 0269 for (k = 7; k < len; ++k, current = next) { 0270 // roll the hash if not at the end 0271 // (calculate hash for the next iteration) 0272 if (k + 1 < len) { 0273 next = (HASH_P * ((current + HASH_Q - ((HASH_MOD * str[k - 7].unicode()) % HASH_Q)) % HASH_Q) + str[k + 1].unicode()) % HASH_Q; 0274 } 0275 0276 if (!fastLookUp.testBit(current)) { 0277 continue; 0278 } 0279 0280 // look-up the hash in the HashMap and check all strings 0281 WTF::HashMap<int, QVector<int> >::const_iterator it = stringFiltersHash.find(current + 1); 0282 0283 // check possible strings 0284 if (it != hashEnd) { 0285 for (int j = 0; j < it->second.size(); ++j) { 0286 int index = it->second[j]; 0287 // check if we got simple string or REs prefix 0288 if (index >= 0) { 0289 int flen = stringFilters[index].length(); 0290 if (k - flen + 1 >= 0 && stringFilters[index] == str.midRef(k - flen + 1, flen)) { 0291 if (by != nullptr) { 0292 *by = stringFilters[index]; 0293 } 0294 return true; 0295 } 0296 } else { 0297 index = -index - 1; 0298 int flen = rePrefixes[index].length(); 0299 if (k - 8 + flen < len && rePrefixes[index] == str.midRef(k - 7, flen)) { 0300 int remStart = k - 7 + flen; 0301 QString remainder = QString::fromRawData(str.unicode() + remStart, 0302 str.length() - remStart); 0303 if (reFilters[index].exactMatch(remainder)) { 0304 if (by != nullptr) { 0305 *by = rePrefixes[index] + reFilters[index].pattern(); 0306 } 0307 return true; 0308 } 0309 } 0310 } 0311 } 0312 } 0313 } 0314 0315 return false; 0316 } 0317 0318 void StringsMatcher::clear() 0319 { 0320 stringFilters.clear(); 0321 shortStringFilters.clear(); 0322 reFilters.clear(); 0323 rePrefixes.clear(); 0324 stringFiltersHash.clear(); 0325 fastLookUp.resize(HASH_Q); 0326 fastLookUp.fill(0, 0, HASH_Q); 0327 } 0328 0329 } 0330