File indexing completed on 2024-05-05 04:44:00
0001 /* 0002 SPDX-FileCopyrightText: 2005 Ivor Hewitt <ivor@kde.org> 0003 SPDX-FileCopyrightText: 2008 Maksim Orlovich <maksim@kde.org> 0004 SPDX-FileCopyrightText: 2008 Vyacheslav Tokarev <tsjoker@gmail.com> 0005 0006 SPDX-License-Identifier: LGPL-2.0-or-later 0007 */ 0008 0009 #include "webkit_filter.h" 0010 #include "kwebkitpart_debug.h" 0011 0012 #include <QHash> 0013 #include <QBitArray> 0014 0015 // rolling hash parameters 0016 #define HASH_P (1997) 0017 #define HASH_Q (17509) 0018 // HASH_MOD = (HASH_P^7) % HASH_Q 0019 #define HASH_MOD (523) 0020 0021 0022 using namespace KDEPrivate; 0023 0024 // Updateable Multi-String Matcher based on Rabin-Karp's algorithm 0025 class StringsMatcher { 0026 public: 0027 // add filter to matching set 0028 void addString(const QString& pattern) 0029 { 0030 if (pattern.length() < 8) { 0031 // handle short string differently 0032 shortStringFilters.append(pattern); 0033 } else { 0034 // use modified Rabin-Karp's algorithm with 8-length string hash 0035 // i.e. store hash of first 8 chars in the HashMap for fast look-up 0036 stringFilters.append(pattern); 0037 int ind = stringFilters.size() - 1; 0038 int current = 0; 0039 0040 // compute hash using rolling hash 0041 // hash for string: x0,x1,x2...xn-1 will be: 0042 // (p^(n-1)*x0 + p^(n-2)*x1 + ... + p * xn-2 + xn-1) % q 0043 // where p and q some wisely-chosen integers 0044 /*for (int k = 0; k < 8; ++k)*/ 0045 int len = pattern.length(); 0046 for (int k = len - 8; k < len; ++k) 0047 current = (current * HASH_P + pattern[k].unicode()) % HASH_Q; 0048 0049 // insert computed hash value into HashMap 0050 QHash<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1); 0051 if (it == stringFiltersHash.end()) { 0052 QVector<int> list; 0053 list.append(ind); 0054 stringFiltersHash.insert(current + 1, list); 0055 fastLookUp.setBit(current); 0056 } else { 0057 it->append(ind); 0058 } 0059 } 0060 } 0061 0062 0063 // check if string match at least one string from matching set 0064 bool isMatched(const QString& str, QString *by = nullptr) const 0065 { 0066 // check short strings first 0067 for (int i = 0; i < shortStringFilters.size(); ++i) { 0068 if (str.contains(shortStringFilters[i])) 0069 { 0070 if (by != nullptr) *by = shortStringFilters[i]; 0071 return true; 0072 } 0073 } 0074 0075 int len = str.length(); 0076 int k; 0077 0078 int current = 0; 0079 int next = 0; 0080 // compute hash for first 8 characters 0081 for (k = 0; k < 8 && k < len; ++k) 0082 current = (current * HASH_P + str[k].unicode()) % HASH_Q; 0083 0084 QHash<int, QVector<int> >::const_iterator hashEnd = stringFiltersHash.end(); 0085 // main Rabin-Karp's algorithm loop 0086 for (k = 7; k < len; ++k, current = next) { 0087 // roll the hash if not at the end 0088 // (calculate hash for the next iteration) 0089 if (k + 1 < len) 0090 next = (HASH_P * ((current + HASH_Q - ((HASH_MOD * str[k - 7].unicode()) % HASH_Q)) % HASH_Q) + str[k + 1].unicode()) % HASH_Q; 0091 0092 if (!fastLookUp.testBit(current)) 0093 continue; 0094 0095 // look-up the hash in the HashMap and check all strings 0096 QHash<int, QVector<int> >::const_iterator it = stringFiltersHash.find(current + 1); 0097 0098 // check possible strings 0099 if (it != hashEnd) { 0100 for (int j = 0; j < it->size(); ++j) { 0101 int index = it->value(j); 0102 // check if we got simple string or REs prefix 0103 if (index >= 0) { 0104 int flen = stringFilters[index].length(); 0105 if (k - flen + 1 >= 0 && stringFilters[index] == str.midRef(k - flen + 1 , flen)) 0106 { 0107 if (by != nullptr) *by = stringFilters[index]; 0108 return true; 0109 } 0110 } else { 0111 index = -index - 1; 0112 int flen = rePrefixes[index].length(); 0113 if (k - 8 + flen < len && rePrefixes[index] == str.midRef(k - 7, flen)) 0114 { 0115 int remStart = k - 7 + flen; 0116 QString remainder = QString::fromRawData(str.unicode() + remStart, 0117 str.length() - remStart); 0118 if (reFilters[index].exactMatch(remainder)) { 0119 if (by != nullptr) *by = rePrefixes[index]+reFilters[index].pattern(); 0120 return true; 0121 } 0122 } 0123 } 0124 } 0125 } 0126 } 0127 0128 return false; 0129 } 0130 0131 // add filter to matching set with wildcards (*,?) in it 0132 void addWildedString(const QString& prefix, const QRegExp& rx) 0133 { 0134 rePrefixes.append(prefix); 0135 reFilters.append(rx); 0136 int index = -rePrefixes.size(); 0137 0138 int current = 0; 0139 for (int k = 0; k < 8; ++k) 0140 current = (current * HASH_P + prefix[k].unicode()) % HASH_Q; 0141 0142 // insert computed hash value into HashMap 0143 QHash<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1); 0144 if (it == stringFiltersHash.end()) { 0145 QVector<int> list; 0146 list.append(index); 0147 stringFiltersHash.insert(current + 1, list); 0148 fastLookUp.setBit(current); 0149 } else { 0150 it->append(index); 0151 } 0152 } 0153 0154 void clear() 0155 { 0156 stringFilters.clear(); 0157 shortStringFilters.clear(); 0158 reFilters.clear(); 0159 rePrefixes.clear(); 0160 stringFiltersHash.clear(); 0161 fastLookUp.resize(HASH_Q); 0162 fastLookUp.fill(0, 0, HASH_Q); 0163 } 0164 0165 private: 0166 QVector<QString> stringFilters; 0167 QVector<QString> shortStringFilters; 0168 QVector<QRegExp> reFilters; 0169 QVector<QString> rePrefixes; 0170 QBitArray fastLookUp; 0171 0172 QHash<int, QVector<int> > stringFiltersHash; 0173 }; 0174 0175 0176 // We only want a subset of features of wildcards -- just the 0177 // star, so we escape the rest before passing to QRegExp. 0178 // The \ is escaped due to a QRegExp bug. 0179 // ### we really should rather parse it ourselves, in order to 0180 // handle adblock-special things like | and ^ properly. 0181 static QRegExp fromAdBlockWildcard(const QString& wcStr) { 0182 QRegExp rx; 0183 rx.setPatternSyntax(QRegExp::Wildcard); 0184 0185 QString out; 0186 for (int p = 0; p < wcStr.length(); ++p) { 0187 QChar c = wcStr[p]; 0188 if (c == QLatin1Char('?')) 0189 out += QLatin1String("[?]"); 0190 else if (c == QLatin1Char('[')) 0191 out += QLatin1String("[[]"); 0192 else if (c == QLatin1Char('\\')) 0193 out += QLatin1String("[\\]"); 0194 else 0195 out += c; 0196 } 0197 0198 rx.setPattern(out); 0199 return rx; 0200 } 0201 0202 FilterSet::FilterSet() 0203 :stringFiltersMatcher(new StringsMatcher) 0204 { 0205 } 0206 0207 FilterSet::~FilterSet() 0208 { 0209 delete stringFiltersMatcher; 0210 } 0211 0212 void FilterSet::addFilter(const QString& filterStr) 0213 { 0214 QString filter = filterStr; 0215 0216 /** ignore special lines starting with "[", "!", "&", or "#" or contain "#" (comments or features are not supported by KHTML's AdBlock */ 0217 QChar firstChar = filter.at(0); 0218 if (firstChar == QLatin1Char('[') || firstChar == QLatin1Char('!') || firstChar == QLatin1Char('&') || firstChar == QLatin1Char('#') || filter.contains(QLatin1Char('#'))) 0219 return; 0220 0221 // Strip leading @@ 0222 int first = 0; 0223 int last = filter.length() - 1; 0224 if (filter.startsWith(QLatin1String("@@"))) 0225 first = 2; 0226 0227 // Strip options, we ignore them for now. 0228 // TODO: Add support for filters with options. See #310230. 0229 int dollar = filter.lastIndexOf(QLatin1Char('$')); 0230 if (dollar != -1) { 0231 return; 0232 } 0233 0234 // Perhaps nothing left? 0235 if (first > last) 0236 return; 0237 0238 filter = filter.mid(first, last - first + 1); 0239 0240 // Is it a regexp filter? 0241 if (filter.length()>2 && filter.startsWith(QLatin1Char('/')) && filter.endsWith(QLatin1Char('/'))) 0242 { 0243 QString inside = filter.mid(1, filter.length()-2); 0244 QRegExp rx(inside); 0245 reFilters.append(rx); 0246 // qCDebug(KWEBKITPART_LOG) << "R:" << inside; 0247 } 0248 else 0249 { 0250 // Nope, a wildcard one. 0251 // Note: For these, we also need to handle |. 0252 0253 // Strip wildcards at the ends 0254 first = 0; 0255 last = filter.length() - 1; 0256 0257 while (first < filter.length() && filter[first] == QLatin1Char('*')) 0258 ++first; 0259 0260 while (last >= 0 && filter[last] == QLatin1Char('*')) 0261 --last; 0262 0263 if (first > last) 0264 filter = QLatin1String("*"); // erm... Well, they asked for it. 0265 else 0266 filter = filter.mid(first, last - first + 1); 0267 0268 // Now, do we still have any wildcard stuff left? 0269 if (filter.contains("*")) 0270 { 0271 // check if we can use RK first (and then check full RE for the rest) for better performance 0272 int aPos = filter.indexOf('*'); 0273 if (aPos < 0) 0274 aPos = filter.length(); 0275 if (aPos > 7) { 0276 QRegExp rx = fromAdBlockWildcard(filter.mid(aPos) + QLatin1Char('*')); 0277 // We pad the final r.e. with * so we can check for an exact match 0278 stringFiltersMatcher->addWildedString(filter.mid(0, aPos), rx); 0279 } else { 0280 QRegExp rx = fromAdBlockWildcard(filter); 0281 reFilters.append(rx); 0282 } 0283 } 0284 else 0285 { 0286 // Fast path 0287 stringFiltersMatcher->addString(filter); 0288 } 0289 } 0290 } 0291 0292 bool FilterSet::isUrlMatched(const QString& url) 0293 { 0294 if (stringFiltersMatcher->isMatched(url)) 0295 return true; 0296 0297 for (int c = 0; c < reFilters.size(); ++c) 0298 { 0299 if (url.contains(reFilters[c])) 0300 return true; 0301 } 0302 0303 return false; 0304 } 0305 0306 QString FilterSet::urlMatchedBy(const QString& url) 0307 { 0308 QString by; 0309 0310 if (stringFiltersMatcher->isMatched(url, &by)) 0311 return by; 0312 0313 for (int c = 0; c < reFilters.size(); ++c) 0314 { 0315 if (url.contains(reFilters[c])) 0316 { 0317 by = reFilters[c].pattern(); 0318 break; 0319 } 0320 } 0321 0322 return by; 0323 } 0324 0325 void FilterSet::clear() 0326 { 0327 reFilters.clear(); 0328 stringFiltersMatcher->clear(); 0329 } 0330 0331 // kate: indent-width 4; replace-tabs on; tab-width 4; space-indent on;