File indexing completed on 2024-05-12 11:54:37

0001 /*
0002     This file is part of the KDE project
0003     SPDX-FileCopyrightText: 2002, 2003 Dawit Alemayehu <adawit@kde.org>
0004     SPDX-FileCopyrightText: 2000 Yves Arrouye <yves@realnames.com>
0005     SPDX-FileCopyrightText: 1999 Simon Hausmann <hausmann@kde.org>
0006 
0007     Advanced web shortcuts:
0008     SPDX-FileCopyrightText: 2001 Andreas Hochsteger <e9625392@student.tuwien.ac.at>
0009 
0010     SPDX-License-Identifier: GPL-2.0-or-later
0011 */
0012 
0013 #include "kuriikwsfiltereng.h"
0014 #include "searchprovider.h"
0015 
0016 #include <KConfig>
0017 #include <KConfigGroup>
0018 #include <kprotocolinfo.h>
0019 
0020 #include <QLoggingCategory>
0021 #include <QRegularExpression>
0022 #include <QTextCodec>
0023 
namespace
{
// File-local logging category "kf.kio.urifilters.ikws" used by every qCDebug/qCWarning
// call in this file. QtWarningMsg is passed as the category's default message type.
Q_LOGGING_CATEGORY(category, "kf.kio.urifilters.ikws", QtWarningMsg)
}
0028 
/**
 * Debug helper: logs a name/value pair to this file's logging category
 * in the form  n = 'v'.
 *
 * @param n label describing the value
 * @param v the value to print
 */
static void kuriikws_debug(const QString &n, const QString &v)
{
    qCDebug(category) << n << " = '" << v << "'";
}
0033 
0034 /**
0035  * IMPORTANT: If you change anything here, make sure kiowidgets-kurifiltertest-{colon,space}-separator
0036  * unit tests still pass (they're usually run as part of "make test").
0037  */
0038 
/**
 * Constructs the engine and immediately populates its settings by calling loadConfig().
 */
KURISearchFilterEngine::KURISearchFilterEngine()
{
    // The configuration-backed members are assigned in loadConfig().
    loadConfig();
}
0043 
0044 KURISearchFilterEngine::~KURISearchFilterEngine()
0045 {
0046 }
0047 
0048 // static
0049 QStringList KURISearchFilterEngine::defaultSearchProviders()
0050 {
0051     static const QStringList defaultProviders{QStringLiteral("google"),
0052                                               QStringLiteral("youtube"),
0053                                               QStringLiteral("yahoo"),
0054                                               QStringLiteral("wikipedia"),
0055                                               QStringLiteral("wikit")};
0056     return defaultProviders;
0057 }
0058 
0059 SearchProvider *KURISearchFilterEngine::webShortcutQuery(const QString &typedString, QString &searchTerm) const
0060 {
0061     const auto getProviderForKey = [this, &searchTerm](const QString &key) {
0062         SearchProvider *provider = nullptr;
0063         // If the key contains a : an assertion in the isKnownProtocol method would fail. This can be
0064         // the case if the delimiter is switched to space, see kiowidgets_space_separator_test
0065         if (!key.isEmpty() && (key.contains(QLatin1Char(':')) || !KProtocolInfo::isKnownProtocol(key))) {
0066             provider = m_registry.findByKey(key);
0067             if (provider) {
0068                 if (!m_bUseOnlyPreferredWebShortcuts || m_preferredWebShortcuts.contains(provider->desktopEntryName())) {
0069                     qCDebug(category) << "found provider" << provider->desktopEntryName() << "searchTerm=" << searchTerm;
0070                 } else {
0071                     provider = nullptr;
0072                 }
0073             }
0074         }
0075         return provider;
0076     };
0077 
0078     SearchProvider *provider = nullptr;
0079     if (m_bWebShortcutsEnabled) {
0080         QString key;
0081         if (typedString.contains(QLatin1Char('!'))) {
0082             const static QRegularExpression bangRegex(QStringLiteral("!([^ ]+)"));
0083             const auto match = bangRegex.match(typedString);
0084             if (match.hasMatch() && match.lastCapturedIndex() == 1) {
0085                 key = match.captured(1);
0086                 searchTerm = QString(typedString).remove(bangRegex);
0087             }
0088         }
0089 
0090         // If we have found a bang-match it might be unintentionally triggered, because the ! character is contained
0091         // in the query. To avoid not returning any results we check if we can find a provider for the key, if not
0092         // we clear it and try the traditional query syntax, see https://bugs.kde.org/show_bug.cgi?id=437660
0093         if (!key.isEmpty()) {
0094             provider = getProviderForKey(key);
0095             if (!provider) {
0096                 key.clear();
0097             }
0098         }
0099         if (key.isEmpty()) {
0100             const int pos = typedString.indexOf(QLatin1Char(m_cKeywordDelimiter));
0101             if (pos > -1) {
0102                 key = typedString.left(pos).toLower(); // #169801
0103                 searchTerm = typedString.mid(pos + 1);
0104             } else if (!typedString.isEmpty() && m_cKeywordDelimiter == ' ') {
0105                 key = typedString;
0106                 searchTerm = typedString.mid(pos + 1);
0107             }
0108             provider = getProviderForKey(key);
0109         }
0110 
0111         qCDebug(category) << "m_cKeywordDelimiter=" << QLatin1Char(m_cKeywordDelimiter) << "key=" << key << "typedString=" << typedString;
0112     }
0113 
0114     return provider;
0115 }
0116 
0117 SearchProvider *KURISearchFilterEngine::autoWebSearchQuery(const QString &typedString, const QString &defaultShortcut) const
0118 {
0119     SearchProvider *provider = nullptr;
0120     const QString defaultSearchProvider = (m_defaultWebShortcut.isEmpty() ? defaultShortcut : m_defaultWebShortcut);
0121 
0122     if (m_bWebShortcutsEnabled && !defaultSearchProvider.isEmpty()) {
0123         // Make sure we ignore supported protocols, e.g. "smb:", "http:"
0124         const int pos = typedString.indexOf(QLatin1Char(':'));
0125 
0126         if (pos == -1 || !KProtocolInfo::isKnownProtocol(typedString.left(pos))) {
0127             provider = m_registry.findByDesktopName(defaultSearchProvider);
0128         }
0129     }
0130 
0131     return provider;
0132 }
0133 
/**
 * Returns the fixed identifier "kuriikwsfilter". loadConfig() appends "rc" to it
 * to form the name of the configuration file.
 */
QByteArray KURISearchFilterEngine::name() const
{
    return "kuriikwsfilter";
}
0138 
/**
 * Returns the character separating a shortcut key from the search term,
 * as set by loadConfig().
 */
char KURISearchFilterEngine::keywordDelimiter() const
{
    return m_cKeywordDelimiter;
}
0143 
/**
 * Returns the default web shortcut read from the "DefaultWebShortcut"
 * configuration entry by loadConfig().
 */
QString KURISearchFilterEngine::defaultSearchEngine() const
{
    return m_defaultWebShortcut;
}
0148 
/**
 * Returns the list of preferred web shortcuts read from the "PreferredWebShortcuts"
 * configuration entry by loadConfig().
 */
QStringList KURISearchFilterEngine::favoriteEngineList() const
{
    return m_preferredWebShortcuts;
}
0153 
// Global-static holder for the shared engine instance handed out by self().
Q_GLOBAL_STATIC(KURISearchFilterEngine, sSelfPtr)

/**
 * Returns the shared KURISearchFilterEngine instance held by sSelfPtr.
 */
KURISearchFilterEngine *KURISearchFilterEngine::self()
{
    return sSelfPtr;
}
0160 
0161 QStringList KURISearchFilterEngine::modifySubstitutionMap(SubstMap &map, const QString &query) const
0162 {
0163     // Returns the number of query words
0164     QString userquery = query;
0165 
0166     // Do some pre-encoding, before we can start the work:
0167     {
0168         const QRegularExpression qsexpr(QStringLiteral("\\\"[^\\\"]*\\\""));
0169         // Temporarily substitute spaces in quoted strings (" " -> "%20")
0170         // Needed to split user query into StringList correctly.
0171         int start = 0;
0172         QRegularExpressionMatch match;
0173         while ((match = qsexpr.match(userquery, start)).hasMatch()) {
0174             QString str = match.captured(0);
0175             str.replace(QLatin1Char(' '), QLatin1String("%20"));
0176             userquery.replace(match.capturedStart(0), match.capturedLength(0), str);
0177             start = match.capturedStart(0) + str.size(); // Move after last quote
0178         }
0179     }
0180 
0181     // Split user query between spaces:
0182     QStringList l = userquery.simplified().split(QLatin1Char(' '), Qt::SkipEmptyParts);
0183 
0184     // Back-substitute quoted strings (%20 -> " "):
0185     userquery.replace(QLatin1String("%20"), QLatin1String(" "));
0186     l.replaceInStrings(QStringLiteral("%20"), QStringLiteral(" "));
0187 
0188     qCDebug(category) << "Generating substitution map:\n";
0189     // Generate substitution map from user query:
0190     for (int i = 0; i <= l.count(); i++) {
0191         int pos = 0;
0192         QString v;
0193         QString nr = QString::number(i);
0194 
0195         // Add whole user query (\{0}) to substitution map:
0196         if (i == 0) {
0197             v = userquery;
0198         }
0199         // Add partial user query items to substitution map:
0200         else {
0201             v = l[i - 1];
0202         }
0203 
0204         // Insert partial queries (referenced by \1 ... \n) to map:
0205         map.insert(QString::number(i), v);
0206         kuriikws_debug(QLatin1String("  map['") + nr + QLatin1String("']"), map[nr]);
0207 
0208         // Insert named references (referenced by \name) to map:
0209         if ((i > 0) && (pos = v.indexOf(QLatin1Char('='))) > 0) {
0210             QString s = v.mid(pos + 1);
0211             QString k = v.left(pos);
0212 
0213             // Back-substitute references contained in references (e.g. '\refname' substitutes to 'thisquery=\0')
0214             s.replace(QLatin1String("%5C"), QLatin1String("\\"));
0215             map.insert(k, s);
0216             kuriikws_debug(QLatin1String("  map['") + k + QLatin1String("']"), map[k]);
0217         }
0218     }
0219 
0220     return l;
0221 }
0222 
0223 static QString encodeString(const QString &s, QTextCodec *codec)
0224 {
0225     // we encode all characters, including the space character BUG: 304276
0226     QByteArray encoded = codec->fromUnicode(s).toPercentEncoding();
0227     return QString::fromUtf8(encoded);
0228 }
0229 
0230 QString KURISearchFilterEngine::substituteQuery(const QString &url, SubstMap &map, const QString &userquery, QTextCodec *codec) const
0231 {
0232     QString newurl = url;
0233     QStringList ql = modifySubstitutionMap(map, userquery);
0234     const int count = ql.count();
0235 
0236     // Check, if old style '\1' is found and replace it with \{@} (compatibility mode):
0237     {
0238         int pos = -1;
0239         if ((pos = newurl.indexOf(QLatin1String("\\1"))) >= 0) {
0240             qCWarning(category) << "WARNING: Using compatibility mode for newurl='" << newurl
0241                                 << "'. Please replace old style '\\1' with new style '\\{0}' "
0242                                    "in the query definition.\n";
0243             newurl.replace(pos, 2, QStringLiteral("\\{@}"));
0244         }
0245     }
0246 
0247     qCDebug(category) << "Substitute references:\n";
0248     // Substitute references (\{ref1,ref2,...}) with values from user query:
0249     {
0250         const QRegularExpression reflistRe(QStringLiteral("\\\\\\{([^\\}]+)\\}"));
0251         // Substitute reflists (\{ref1,ref2,...}):
0252         int start = 0;
0253         QRegularExpressionMatch match;
0254         while ((match = reflistRe.match(newurl, start)).hasMatch()) {
0255             bool found = false;
0256 
0257             // bool rest = false;
0258             QString v;
0259             const QString rlstring = match.captured(1);
0260             kuriikws_debug(QStringLiteral("  reference list"), rlstring);
0261 
0262             // \{@} gets a special treatment later
0263             if (rlstring == QLatin1String("@")) {
0264                 v = QStringLiteral("\\@");
0265                 found = true;
0266             }
0267 
0268             // TODO: strip whitespaces around commas
0269             const QStringList refList = rlstring.split(QLatin1Char(','), Qt::SkipEmptyParts);
0270 
0271             for (const QString &rlitem : refList) {
0272                 if (found) {
0273                     break;
0274                 }
0275 
0276                 const QRegularExpression rangeRe(QStringLiteral("([0-9]*)\\-([0-9]*)"));
0277                 const QRegularExpressionMatch rangeMatch = rangeRe.match(rlitem);
0278                 // Substitute a range of keywords
0279                 if (rangeMatch.hasMatch()) {
0280                     int first = rangeMatch.captured(1).toInt();
0281                     int last = rangeMatch.captured(2).toInt();
0282 
0283                     if (first == 0) {
0284                         first = 1;
0285                     }
0286 
0287                     if (last == 0) {
0288                         last = count;
0289                     }
0290 
0291                     for (int i = first; i <= last; i++) {
0292                         v += map[QString::number(i)] + QLatin1Char(' ');
0293                         // Remove used value from ql (needed for \{@}):
0294                         ql[i - 1].clear();
0295                     }
0296 
0297                     v = v.trimmed();
0298                     if (!v.isEmpty()) {
0299                         found = true;
0300                     }
0301 
0302                     kuriikws_debug(QStringLiteral("    range"),
0303                                    QString::number(first) + QLatin1Char('-') + QString::number(last) + QLatin1String(" => '") + v + QLatin1Char('\''));
0304                     v = encodeString(v, codec);
0305                 } else if (rlitem.startsWith(QLatin1Char('\"')) && rlitem.endsWith(QLatin1Char('\"'))) {
0306                     // Use default string from query definition:
0307                     found = true;
0308                     QString s = rlitem.mid(1, rlitem.length() - 2);
0309                     v = encodeString(s, codec);
0310                     kuriikws_debug(QStringLiteral("    default"), s);
0311                 } else if (map.contains(rlitem)) {
0312                     // Use value from substitution map:
0313                     found = true;
0314                     kuriikws_debug(QLatin1String("    map['") + rlitem + QLatin1String("']"), map[rlitem]);
0315                     v = encodeString(map[rlitem], codec);
0316 
0317                     // Remove used value from ql (needed for \{@}):
0318                     const QChar c = rlitem.at(0); // rlitem can't be empty at this point
0319                     if (c == QLatin1Char('0')) {
0320                         // It's a numeric reference to '0'
0321                         for (QStringList::Iterator it = ql.begin(); it != ql.end(); ++it) {
0322                             (*it).clear();
0323                         }
0324                     } else if ((c >= QLatin1String("0")) && (c <= QLatin1String("9"))) { // krazy:excludeall=doublequote_chars
0325                         // It's a numeric reference > '0'
0326                         int n = rlitem.toInt();
0327                         ql[n - 1].clear();
0328                     } else {
0329                         // It's a alphanumeric reference
0330                         QStringList::Iterator it = ql.begin();
0331                         while ((it != ql.end()) && !it->startsWith(rlitem + QLatin1Char('='))) {
0332                             ++it;
0333                         }
0334                         if (it != ql.end()) {
0335                             it->clear();
0336                         }
0337                     }
0338 
0339                     // Encode '+', otherwise it would be interpreted as space in the resulting url:
0340                     v.replace(QLatin1Char('+'), QLatin1String("%2B"));
0341                 } else if (rlitem == QLatin1String("@")) {
0342                     v = QStringLiteral("\\@");
0343                     kuriikws_debug(QStringLiteral("    v"), v);
0344                 }
0345             }
0346 
0347             newurl.replace(match.capturedStart(0), match.capturedLength(0), v);
0348             start = match.capturedStart(0) + v.size();
0349         }
0350 
0351         // Special handling for \{@};
0352         {
0353             kuriikws_debug(QStringLiteral("  newurl"), newurl);
0354             // Generate list of unmatched strings:
0355             QString v = ql.join(QLatin1Char(' ')).simplified();
0356 
0357             kuriikws_debug(QStringLiteral("    rest"), v);
0358             v = encodeString(v, codec);
0359 
0360             // Substitute \{@} with list of unmatched query strings
0361             newurl.replace(QLatin1String("\\@"), v);
0362         }
0363     }
0364 
0365     return newurl;
0366 }
0367 
/**
 * Convenience overload: formats the result using a fresh, empty substitution map
 * and forwards all other arguments unchanged to the six-argument overload.
 */
QUrl KURISearchFilterEngine::formatResult(const QString &url, const QString &cset1, const QString &cset2, const QString &query, bool isMalformed) const
{
    SubstMap map;
    return formatResult(url, cset1, cset2, query, isMalformed, map);
}
0373 
0374 QUrl KURISearchFilterEngine::formatResult(const QString &url,
0375                                           const QString &cset1,
0376                                           const QString &cset2,
0377                                           const QString &userquery,
0378                                           bool /* isMalformed */,
0379                                           SubstMap &map) const
0380 {
0381     // Return nothing if userquery is empty and it contains
0382     // substitution strings...
0383     if (userquery.isEmpty() && url.indexOf(QLatin1String("\\{")) > 0) {
0384         return QUrl();
0385     }
0386 
0387     // Debug info of map:
0388     if (!map.isEmpty()) {
0389         qCDebug(category) << "Got non-empty substitution map:\n";
0390         for (SubstMap::Iterator it = map.begin(); it != map.end(); ++it) {
0391             kuriikws_debug(QLatin1String("    map['") + it.key() + QLatin1String("']"), it.value());
0392         }
0393     }
0394 
0395     // Create a codec for the desired encoding so that we can transcode the user's "url".
0396     QString cseta = cset1;
0397     if (cseta.isEmpty()) {
0398         cseta = QStringLiteral("UTF-8");
0399     }
0400 
0401     QTextCodec *csetacodec = QTextCodec::codecForName(cseta.toLatin1());
0402     if (!csetacodec) {
0403         cseta = QStringLiteral("UTF-8");
0404         csetacodec = QTextCodec::codecForName(cseta.toLatin1());
0405     }
0406 
0407     kuriikws_debug(QStringLiteral("user query"), userquery);
0408     kuriikws_debug(QStringLiteral("query definition"), url);
0409 
0410     // Add charset indicator for the query to substitution map:
0411     map.insert(QStringLiteral("ikw_charset"), cseta);
0412 
0413     // Add charset indicator for the fallback query to substitution map:
0414     QString csetb = cset2;
0415     if (csetb.isEmpty()) {
0416         csetb = QStringLiteral("UTF-8");
0417     }
0418     map.insert(QStringLiteral("wsc_charset"), csetb);
0419 
0420     QString newurl = substituteQuery(url, map, userquery, csetacodec);
0421 
0422     kuriikws_debug(QStringLiteral("substituted query"), newurl);
0423 
0424     return QUrl(newurl, QUrl::StrictMode);
0425 }
0426 
0427 void KURISearchFilterEngine::loadConfig()
0428 {
0429     qCDebug(category) << "Keywords Engine: Loading config...";
0430 
0431     // Load the config.
0432     KConfig config(QString::fromUtf8(name()) + QLatin1String("rc"), KConfig::NoGlobals);
0433     KConfigGroup group = config.group("General");
0434 
0435     m_cKeywordDelimiter = QString(group.readEntry("KeywordDelimiter", ":")).at(0).toLatin1();
0436     m_bWebShortcutsEnabled = group.readEntry("EnableWebShortcuts", true);
0437     m_defaultWebShortcut = group.readEntry("DefaultWebShortcut", "duckduckgo");
0438     m_bUseOnlyPreferredWebShortcuts = group.readEntry("UsePreferredWebShortcutsOnly", false);
0439 
0440     QStringList defaultPreferredShortcuts;
0441     if (!group.hasKey("PreferredWebShortcuts")) {
0442         defaultPreferredShortcuts = KURISearchFilterEngine::defaultSearchProviders();
0443     }
0444     m_preferredWebShortcuts = group.readEntry("PreferredWebShortcuts", defaultPreferredShortcuts);
0445 
0446     // Use either a white space or a : as the keyword delimiter...
0447     if (strchr(" :", m_cKeywordDelimiter) == nullptr) {
0448         m_cKeywordDelimiter = ':';
0449     }
0450 
0451     qCDebug(category) << "Web Shortcuts Enabled: " << m_bWebShortcutsEnabled;
0452     qCDebug(category) << "Default Shortcut: " << m_defaultWebShortcut;
0453     qCDebug(category) << "Keyword Delimiter: " << m_cKeywordDelimiter;
0454     m_registry.reload();
0455 }
0456 
/**
 * Returns a pointer to the engine's own search provider registry member.
 */
SearchProviderRegistry *KURISearchFilterEngine::registry()
{
    return &m_registry;
}