File indexing completed on 2024-05-05 04:44:00

0001 /*
0002     SPDX-FileCopyrightText: 2005 Ivor Hewitt <ivor@kde.org>
0003     SPDX-FileCopyrightText: 2008 Maksim Orlovich <maksim@kde.org>
0004     SPDX-FileCopyrightText: 2008 Vyacheslav Tokarev <tsjoker@gmail.com>
0005 
0006     SPDX-License-Identifier: LGPL-2.0-or-later
0007 */
0008 
0009 #include "webkit_filter.h"
0010 #include "kwebkitpart_debug.h"
0011 
0012 #include <QHash>
0013 #include <QBitArray>
0014 
0015 // rolling hash parameters
0016 #define HASH_P (1997)
0017 #define HASH_Q (17509)
0018 // HASH_MOD = (HASH_P^7) % HASH_Q
0019 #define HASH_MOD (523)
0020 
0021 
0022 using namespace KDEPrivate;
0023 
0024 // Updateable Multi-String Matcher based on Rabin-Karp's algorithm
0025 class StringsMatcher {
0026 public:
0027     // add filter to matching set
0028     void addString(const QString& pattern)
0029     {
0030         if (pattern.length() < 8) {
0031             // handle short string differently
0032             shortStringFilters.append(pattern);
0033         } else {
0034             // use modified Rabin-Karp's algorithm with 8-length string hash
0035             // i.e. store hash of first 8 chars in the HashMap for fast look-up
0036             stringFilters.append(pattern);
0037             int ind = stringFilters.size() - 1;
0038             int current = 0;
0039 
0040             // compute hash using rolling hash
0041             // hash for string: x0,x1,x2...xn-1 will be:
0042             // (p^(n-1)*x0 + p^(n-2)*x1 + ... + p * xn-2 + xn-1) % q
0043             // where p and q some wisely-chosen integers
0044             /*for (int k = 0; k < 8; ++k)*/
0045             int len = pattern.length();
0046             for (int k = len - 8; k < len; ++k)
0047                 current = (current * HASH_P + pattern[k].unicode()) % HASH_Q;
0048 
0049             // insert computed hash value into HashMap
0050             QHash<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
0051             if (it == stringFiltersHash.end()) {
0052                 QVector<int> list;
0053                 list.append(ind);
0054                 stringFiltersHash.insert(current + 1, list);
0055                 fastLookUp.setBit(current);
0056             } else {
0057                 it->append(ind);
0058             }
0059         }
0060     }
0061 
0062 
0063     // check if string match at least one string from matching set
0064     bool isMatched(const QString& str, QString *by = nullptr) const
0065     {
0066         // check short strings first
0067         for (int i = 0; i < shortStringFilters.size(); ++i) {
0068             if (str.contains(shortStringFilters[i]))
0069             {
0070                 if (by != nullptr) *by = shortStringFilters[i];
0071                 return true;
0072             }
0073         }
0074 
0075         int len = str.length();
0076         int k;
0077 
0078         int current = 0;
0079         int next = 0;
0080         // compute hash for first 8 characters
0081         for (k = 0; k < 8 && k < len; ++k)
0082             current = (current * HASH_P + str[k].unicode()) % HASH_Q;
0083 
0084         QHash<int, QVector<int> >::const_iterator hashEnd = stringFiltersHash.end();
0085         // main Rabin-Karp's algorithm loop
0086         for (k = 7; k < len; ++k, current = next) {
0087             // roll the hash if not at the end
0088             // (calculate hash for the next iteration)
0089             if (k + 1 < len)
0090                 next = (HASH_P * ((current + HASH_Q - ((HASH_MOD * str[k - 7].unicode()) % HASH_Q)) % HASH_Q) + str[k + 1].unicode()) % HASH_Q;
0091 
0092             if (!fastLookUp.testBit(current))
0093                 continue;
0094 
0095             // look-up the hash in the HashMap and check all strings
0096             QHash<int, QVector<int> >::const_iterator it = stringFiltersHash.find(current + 1);
0097 
0098             // check possible strings
0099             if (it != hashEnd) {
0100                 for (int j = 0; j < it->size(); ++j) {
0101                     int index = it->value(j);
0102                     // check if we got simple string or REs prefix
0103                     if (index >= 0) {
0104                         int flen = stringFilters[index].length();
0105                         if (k - flen + 1 >= 0 && stringFilters[index] == str.midRef(k - flen + 1 , flen))
0106                         {
0107                             if (by != nullptr) *by = stringFilters[index];
0108                             return true;
0109                         }
0110                     } else {
0111                         index = -index - 1;
0112                         int flen = rePrefixes[index].length();
0113                         if (k - 8 + flen < len && rePrefixes[index] == str.midRef(k - 7, flen))
0114                         {
0115                             int remStart = k - 7 + flen;
0116                             QString remainder = QString::fromRawData(str.unicode() + remStart,
0117                                                                     str.length() - remStart);
0118                             if (reFilters[index].exactMatch(remainder)) {
0119                                 if (by != nullptr) *by = rePrefixes[index]+reFilters[index].pattern();
0120                                 return true;
0121                             }
0122                         }
0123                     }
0124                 }
0125             }
0126         }
0127 
0128         return false;
0129     }
0130 
0131     // add filter to matching set with wildcards (*,?) in it
0132     void addWildedString(const QString& prefix, const QRegExp& rx)
0133     {
0134         rePrefixes.append(prefix);
0135         reFilters.append(rx);
0136         int index = -rePrefixes.size();
0137 
0138         int current = 0;
0139         for (int k = 0; k < 8; ++k)
0140             current = (current * HASH_P + prefix[k].unicode()) % HASH_Q;
0141 
0142         // insert computed hash value into HashMap
0143         QHash<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
0144         if (it == stringFiltersHash.end()) {
0145             QVector<int> list;
0146             list.append(index);
0147             stringFiltersHash.insert(current + 1, list);
0148             fastLookUp.setBit(current);
0149         } else {
0150             it->append(index);
0151         }
0152     }
0153 
0154     void clear()
0155     {
0156         stringFilters.clear();
0157         shortStringFilters.clear();
0158         reFilters.clear();
0159         rePrefixes.clear();
0160         stringFiltersHash.clear();
0161         fastLookUp.resize(HASH_Q);
0162         fastLookUp.fill(0, 0, HASH_Q);
0163     }
0164 
0165 private:
0166     QVector<QString> stringFilters;
0167     QVector<QString> shortStringFilters;
0168     QVector<QRegExp> reFilters;
0169     QVector<QString> rePrefixes;
0170     QBitArray fastLookUp;
0171 
0172     QHash<int, QVector<int> > stringFiltersHash;
0173 };
0174 
0175 
0176 // We only want a subset of features of wildcards -- just the 
0177 // star, so we escape the rest before passing to QRegExp. 
0178 // The \ is escaped due to a QRegExp bug.
0179 // ### we really should rather parse it ourselves, in order to 
0180 // handle adblock-special things like | and ^ properly.
0181 static QRegExp fromAdBlockWildcard(const QString& wcStr) {
0182     QRegExp rx;
0183     rx.setPatternSyntax(QRegExp::Wildcard);
0184 
0185     QString out;
0186     for (int p = 0; p < wcStr.length(); ++p) {
0187         QChar c = wcStr[p];
0188         if (c == QLatin1Char('?'))
0189             out += QLatin1String("[?]");
0190         else if (c == QLatin1Char('['))
0191             out += QLatin1String("[[]");
0192         else if (c == QLatin1Char('\\'))
0193             out += QLatin1String("[\\]");
0194         else
0195             out += c;
0196     }
0197 
0198     rx.setPattern(out);
0199     return rx;
0200 }
0201 
0202 FilterSet::FilterSet()
0203     :stringFiltersMatcher(new StringsMatcher)
0204 {
0205 }
0206 
0207 FilterSet::~FilterSet()
0208 {
0209     delete stringFiltersMatcher;
0210 }
0211 
0212 void FilterSet::addFilter(const QString& filterStr)
0213 {
0214     QString filter = filterStr;
0215 
0216     /** ignore special lines starting with "[", "!", "&", or "#" or contain "#" (comments or features are not supported by KHTML's AdBlock */
0217     QChar firstChar = filter.at(0);
0218     if (firstChar == QLatin1Char('[') || firstChar == QLatin1Char('!') || firstChar == QLatin1Char('&') || firstChar == QLatin1Char('#') || filter.contains(QLatin1Char('#')))
0219         return;
0220 
0221     // Strip leading @@
0222     int first = 0;
0223     int last  = filter.length() - 1;
0224     if (filter.startsWith(QLatin1String("@@")))
0225         first = 2;
0226 
0227     // Strip options, we ignore them for now.
0228     // TODO: Add support for filters with options. See #310230.
0229     int dollar = filter.lastIndexOf(QLatin1Char('$'));
0230     if (dollar != -1) {
0231         return;
0232     }
0233 
0234     // Perhaps nothing left?
0235     if (first > last)
0236         return;
0237 
0238     filter = filter.mid(first, last - first + 1);
0239 
0240     // Is it a regexp filter?
0241     if (filter.length()>2 && filter.startsWith(QLatin1Char('/')) && filter.endsWith(QLatin1Char('/')))
0242     {
0243         QString inside = filter.mid(1, filter.length()-2);
0244         QRegExp rx(inside);
0245         reFilters.append(rx);
0246 //         qCDebug(KWEBKITPART_LOG) << "R:" << inside;
0247     }
0248     else
0249     {
0250         // Nope, a wildcard one.
0251         // Note: For these, we also need to handle |.
0252 
0253         // Strip wildcards at the ends
0254         first = 0;
0255         last  = filter.length() - 1;
0256 
0257         while (first < filter.length() && filter[first] == QLatin1Char('*'))
0258             ++first;
0259 
0260         while (last >= 0 && filter[last] == QLatin1Char('*'))
0261             --last;
0262 
0263         if (first > last)
0264             filter = QLatin1String("*"); // erm... Well, they asked for it.
0265         else
0266             filter = filter.mid(first, last - first + 1);
0267 
0268         // Now, do we still have any wildcard stuff left?
0269         if (filter.contains("*"))
0270         {
0271             // check if we can use RK first (and then check full RE for the rest) for better performance
0272             int aPos = filter.indexOf('*');
0273             if (aPos < 0)
0274                 aPos = filter.length();
0275             if (aPos > 7) {
0276                 QRegExp rx = fromAdBlockWildcard(filter.mid(aPos) + QLatin1Char('*'));
0277                 // We pad the final r.e. with * so we can check for an exact match
0278                 stringFiltersMatcher->addWildedString(filter.mid(0, aPos), rx);
0279             } else {
0280                 QRegExp rx = fromAdBlockWildcard(filter);
0281                 reFilters.append(rx);
0282             }
0283         }
0284         else
0285         {
0286             // Fast path
0287             stringFiltersMatcher->addString(filter);
0288         }
0289     }
0290 }
0291 
0292 bool FilterSet::isUrlMatched(const QString& url)
0293 {
0294     if (stringFiltersMatcher->isMatched(url))
0295         return true;
0296 
0297     for (int c = 0; c < reFilters.size(); ++c)
0298     {
0299         if (url.contains(reFilters[c]))
0300             return true;
0301     }
0302 
0303     return false;
0304 }
0305 
0306 QString FilterSet::urlMatchedBy(const QString& url)
0307 {
0308     QString by;
0309 
0310     if (stringFiltersMatcher->isMatched(url, &by))
0311         return by;
0312 
0313     for (int c = 0; c < reFilters.size(); ++c)
0314     {
0315         if (url.contains(reFilters[c]))
0316         {
0317             by = reFilters[c].pattern();
0318             break;
0319         }
0320     }
0321 
0322     return by;
0323 }
0324 
0325 void FilterSet::clear()
0326 {
0327     reFilters.clear();
0328     stringFiltersMatcher->clear();
0329 }
0330 
0331 // kate: indent-width 4; replace-tabs on; tab-width 4; space-indent on;