File indexing completed on 2024-03-24 15:32:24

0001 /* This file is part of the KDE project
0002 
0003    Copyright (C) 2005 Ivor Hewitt     <ivor@kde.org>
0004    Copyright (C) 2008 Maksim Orlovich <maksim@kde.org>
0005    Copyright (C) 2008 Vyacheslav Tokarev <tsjoker@gmail.com>
0006 
0007    This library is free software; you can redistribute it and/or
0008    modify it under the terms of the GNU Library General Public
0009    License as published by the Free Software Foundation; either
0010    version 2 of the License, or (at your option) any later version.
0011 
0012    This library is distributed in the hope that it will be useful,
0013    but WITHOUT ANY WARRANTY; without even the implied warranty of
0014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0015    Library General Public License for more details.
0016 
0017    You should have received a copy of the GNU Library General Public License
0018    along with this library; see the file COPYING.LIB.  If not, write to
0019    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0020    Boston, MA 02110-1301, USA.
0021 */
0022 
0023 #include "khtml_filter_p.h"
0024 #include "khtml_debug.h"
0025 
0026 // rolling hash parameters
0027 #define HASH_P (1997)
0028 #define HASH_Q (17509)
0029 // HASH_MOD = (HASH_P^7) % HASH_Q
0030 #define HASH_MOD (523)
0031 
0032 namespace khtml
0033 {
0034 
0035 // We only want a subset of features of wildcards -- just the
0036 // star, so we escape the rest before passing to QRegExp.
0037 // The \ is escaped due to a QRegExp bug.
0038 // ### we really should rather parse it ourselves, in order to
0039 // handle adblock-special things like | and ^ properly.
0040 static QRegExp fromAdBlockWildcard(const QString &wcStr)
0041 {
0042     QRegExp rx;
0043     rx.setPatternSyntax(QRegExp::Wildcard);
0044 
0045     QString out;
0046     for (int p = 0; p < wcStr.length(); ++p) {
0047         QChar c = wcStr[p];
0048         if (c == QLatin1Char('?')) {
0049             out += QLatin1String("[?]");
0050         } else if (c == QLatin1Char('[')) {
0051             out += QLatin1String("[[]");
0052         } else if (c == QLatin1Char('\\')) {
0053             out += QLatin1String("[\\]");
0054         } else {
0055             out += c;
0056         }
0057     }
0058 
0059     rx.setPattern(out);
0060     return rx;
0061 }
0062 
0063 void FilterSet::addFilter(const QString &filterStr)
0064 {
0065     QString filter = filterStr.trimmed();
0066     if (filter.isEmpty()) {
0067         return;
0068     }
0069 
0070     /** ignore special lines starting with "[", "!", or "#" or contain "#" (comments or features are not supported by KHTML's AdBlock */
0071     const QChar firstChar = filter.at(0);
0072     if (firstChar == QLatin1Char('[') || firstChar == QLatin1Char('!') || firstChar == QLatin1Char('#') || filter.contains(QLatin1Char('#'))) {
0073         return;
0074     }
0075 
0076     // Strip leading @@
0077     if (filter.startsWith(QLatin1String("@@"))) {
0078         filter.remove(0, 2);
0079     }
0080 
0081     // Strip options, we ignore them for now.
0082     bool hadOptions = false;
0083     int dollar = filter.lastIndexOf(QLatin1Char('$'));
0084     if (dollar != -1) {
0085         // Is it adblock's options delimiter or the special '$' char in a regular expression?
0086         if (!filter.startsWith(QLatin1Char('/')) || !filter.endsWith(QLatin1Char('/'))) {
0087             filter = filter.mid(0, dollar);
0088             hadOptions = true;
0089         }
0090     }
0091 
0092     // Is it a regexp filter?
0093     if (filter.length() > 2 && filter.startsWith(QLatin1Char('/')) && filter.endsWith(QLatin1Char('/'))) {
0094         // Ignore regexp that had options as we do not support them
0095         if (hadOptions) {
0096             return;
0097         }
0098         QString inside = filter.mid(1, filter.length() - 2);
0099         QRegExp rx(inside);
0100         reFilters.append(rx);
0101         //qCDebug(KHTML_LOG) << "R:" << inside;
0102         return;
0103     }
0104     // Nope, a wildcard one.
0105 
0106     // Disregard the rule if only one char is left or it contains unsupported adblock features ('|', "||", '^')
0107     if (filter.length() < 2 || filter.contains(QLatin1Char('|')) || filter.contains(QLatin1Char('^'))) {
0108         return;
0109     }
0110 
0111     // Strip wildcards at the ends
0112     int first = 0;
0113     int last  = filter.length() - 1;
0114 
0115     while (first < filter.length() && filter.at(first) == QLatin1Char('*')) {
0116         ++first;
0117     }
0118 
0119     while (last >= 0 && filter.at(last) == QLatin1Char('*')) {
0120         --last;
0121     }
0122 
0123     if (first > last) {
0124         filter = QLatin1String("*"); // erm... Well, they asked for it.
0125     } else {
0126         filter = filter.mid(first, last - first + 1);
0127     }
0128 
0129     // Now, do we still have any wildcard stuff left?
0130     int aPos = filter.indexOf('*');
0131     if (aPos != -1) {
0132         // check if we can use RK first (and then check full RE for the rest) for better performance
0133         if (aPos > 7) {
0134             QRegExp rx = fromAdBlockWildcard(filter.mid(aPos) + QLatin1Char('*'));
0135             // We pad the final r.e. with * so we can check for an exact match
0136             stringFiltersMatcher.addWildedString(filter.mid(0, aPos), rx);
0137         } else {
0138             QRegExp rx = fromAdBlockWildcard(filter);
0139             reFilters.append(rx);
0140         }
0141     } else {
0142         // Fast path
0143         stringFiltersMatcher.addString(filter);
0144     }
0145 
0146 }
0147 
0148 bool FilterSet::isUrlMatched(const QString &url)
0149 {
0150     if (stringFiltersMatcher.isMatched(url)) {
0151         return true;
0152     }
0153 
0154     for (int c = 0; c < reFilters.size(); ++c) {
0155         if (url.contains(reFilters[c])) {
0156             return true;
0157         }
0158     }
0159 
0160     return false;
0161 }
0162 
0163 QString FilterSet::urlMatchedBy(const QString &url)
0164 {
0165     QString by;
0166 
0167     if (stringFiltersMatcher.isMatched(url, &by)) {
0168         return by;
0169     }
0170 
0171     for (int c = 0; c < reFilters.size(); ++c) {
0172         if (url.contains(reFilters[c])) {
0173             by = reFilters[c].pattern();
0174             break;
0175         }
0176     }
0177 
0178     return by;
0179 }
0180 
0181 void FilterSet::clear()
0182 {
0183     reFilters.clear();
0184     stringFiltersMatcher.clear();
0185 }
0186 
0187 void StringsMatcher::addString(const QString &pattern)
0188 {
0189     if (pattern.length() < 8) {
0190         // handle short string differently
0191         shortStringFilters.append(pattern);
0192     } else {
0193         // use modified Rabin-Karp's algorithm with 8-length string hash
0194         // i.e. store hash of first 8 chars in the HashMap for fast look-up
0195         stringFilters.append(pattern);
0196         int ind = stringFilters.size() - 1;
0197         int current = 0;
0198 
0199         // compute hash using rolling hash
0200         // hash for string: x0,x1,x2...xn-1 will be:
0201         // (p^(n-1)*x0 + p^(n-2)*x1 + ... + p * xn-2 + xn-1) % q
0202         // where p and q some wisely-chosen integers
0203         /*for (int k = 0; k < 8; ++k)*/
0204         int len = pattern.length();
0205         for (int k = len - 8; k < len; ++k) {
0206             current = (current * HASH_P + pattern[k].unicode()) % HASH_Q;
0207         }
0208 
0209         // insert computed hash value into HashMap
0210         WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
0211         if (it == stringFiltersHash.end()) {
0212             QVector<int> list;
0213             list.append(ind);
0214             stringFiltersHash.add(current + 1, list);
0215             fastLookUp.setBit(current);
0216         } else {
0217             it->second.append(ind);
0218         }
0219     }
0220 }
0221 
0222 void StringsMatcher::addWildedString(const QString &prefix, const QRegExp &rx)
0223 {
0224     rePrefixes.append(prefix);
0225     reFilters.append(rx);
0226     int index = -rePrefixes.size();
0227 
0228     int current = 0;
0229     for (int k = 0; k < 8; ++k) {
0230         current = (current * HASH_P + prefix[k].unicode()) % HASH_Q;
0231     }
0232 
0233     // insert computed hash value into HashMap
0234     WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
0235     if (it == stringFiltersHash.end()) {
0236         QVector<int> list;
0237         list.append(index);
0238         stringFiltersHash.add(current + 1, list);
0239         fastLookUp.setBit(current);
0240     } else {
0241         it->second.append(index);
0242     }
0243 }
0244 
0245 bool StringsMatcher::isMatched(const QString &str, QString *by) const
0246 {
0247     // check short strings first
0248     for (int i = 0; i < shortStringFilters.size(); ++i) {
0249         if (str.contains(shortStringFilters[i])) {
0250             if (by != nullptr) {
0251                 *by = shortStringFilters[i];
0252             }
0253             return true;
0254         }
0255     }
0256 
0257     int len = str.length();
0258     int k;
0259 
0260     int current = 0;
0261     int next = 0;
0262     // compute hash for first 8 characters
0263     for (k = 0; k < 8 && k < len; ++k) {
0264         current = (current * HASH_P + str[k].unicode()) % HASH_Q;
0265     }
0266 
0267     WTF::HashMap<int, QVector<int> >::const_iterator hashEnd = stringFiltersHash.end();
0268     // main Rabin-Karp's algorithm loop
0269     for (k = 7; k < len; ++k, current = next) {
0270         // roll the hash if not at the end
0271         // (calculate hash for the next iteration)
0272         if (k + 1 < len) {
0273             next = (HASH_P * ((current + HASH_Q - ((HASH_MOD * str[k - 7].unicode()) % HASH_Q)) % HASH_Q) + str[k + 1].unicode()) % HASH_Q;
0274         }
0275 
0276         if (!fastLookUp.testBit(current)) {
0277             continue;
0278         }
0279 
0280         // look-up the hash in the HashMap and check all strings
0281         WTF::HashMap<int, QVector<int> >::const_iterator it = stringFiltersHash.find(current + 1);
0282 
0283         // check possible strings
0284         if (it != hashEnd) {
0285             for (int j = 0; j < it->second.size(); ++j) {
0286                 int index = it->second[j];
0287                 // check if we got simple string or REs prefix
0288                 if (index >= 0) {
0289                     int flen = stringFilters[index].length();
0290                     if (k - flen + 1 >= 0 && stringFilters[index] == str.midRef(k - flen + 1, flen)) {
0291                         if (by != nullptr) {
0292                             *by = stringFilters[index];
0293                         }
0294                         return true;
0295                     }
0296                 } else {
0297                     index = -index - 1;
0298                     int flen = rePrefixes[index].length();
0299                     if (k - 8 + flen < len && rePrefixes[index] == str.midRef(k - 7, flen)) {
0300                         int remStart = k - 7 + flen;
0301                         QString remainder = QString::fromRawData(str.unicode() + remStart,
0302                                             str.length() - remStart);
0303                         if (reFilters[index].exactMatch(remainder)) {
0304                             if (by != nullptr) {
0305                                 *by = rePrefixes[index] + reFilters[index].pattern();
0306                             }
0307                             return true;
0308                         }
0309                     }
0310                 }
0311             }
0312         }
0313     }
0314 
0315     return false;
0316 }
0317 
0318 void StringsMatcher::clear()
0319 {
0320     stringFilters.clear();
0321     shortStringFilters.clear();
0322     reFilters.clear();
0323     rePrefixes.clear();
0324     stringFiltersHash.clear();
0325     fastLookUp.resize(HASH_Q);
0326     fastLookUp.fill(0, 0, HASH_Q);
0327 }
0328 
0329 }
0330