File indexing completed on 2024-05-19 05:54:10

0001 /*
0002     SPDX-FileCopyrightText: 2007-2008 Robert Knight <robertknight@gmail.com>
0003     SPDX-FileCopyrightText: 2020 Tomaz Canabrava <tcanabrava@gmail.com>
0004 
0005     SPDX-License-Identifier: GPL-2.0-or-later
0006 */
0007 
0008 #include "UrlFilter.h"
0009 
0010 using namespace Konsole;
0011 
0012 #include "UrlFilterHotspot.h"
0013 
0014 // Note:  Altering these regular expressions can have a major effect on the performance of the filters
0015 // used for finding URLs in the text, especially if they are very general and could match very long
0016 // pieces of text.
0017 // Please be careful when altering them.
0018 
0019 // FullUrlRegExp is implemented based on:
0020 // https://datatracker.ietf.org/doc/html/rfc3986
0021 // See above URL for what "unreserved", "pct-encoded" ...etc mean, also
0022 // for the regex used for each part of the url being matched against.
0023 //
0024 // It deviates from rfc3986:
0025 // - We only recognize URIs with authority (even if it is an empty authority)
0026 // - We match URI suffixes starting with 'www.'
0027 // - We allow IPv6 literals right after 'www.', e.g: www.[dead::beef]
0028 // - We _don't_ match IPvFuture addresses
0029 // - We allow any combination of hex digits, colons and dots as IPv6 addresses,
0030 //   e.g: https://[::::dead:::beef::123.666.666.666::dead::::beef::::]/foo
0031 // - "port" (':1234'), if present, is assumed to be non-empty
0032 // - We don't check the validity of percent-encoded characters
0033 //   (e.g. "www.example.com/foo%XXbar")
0034 // - We do not allow parentheses in host.
0035 // - We don't recognize URIs with unbalanced parens in path, query or fragment.
0036 //   We do this to prevent URIs inside parentheses from getting extended to the closing
0037 //   parenthesis.  We still recognize unbalanced parens in userInfo, but the
0038 //   postfix @ should prevent most ambiguity.
0039 
0040 // All non-recursive () groups are non-capturing (by using the "(?:)" notation),
0041 // which means less bookkeeping on the PCRE engine side.
0042 
// Opening of the URL pattern: either the literal prefix "www." or "scheme://".
// - "\b" anchors the match at a word boundary, so a scheme is only picked up
//   from the start of a word: "http" but not the tail of "mhttp".
// - A scheme starts with an ASCII letter and continues with letters, digits,
//   '+', '-' or '.'; the repetition is possessive ("*+"), so it never backtracks.
// The non-capturing group opened here is left unterminated on purpose:
// scheme_or_www_end closes it, and FullUrlRegExp places userInfo between the
// two so that "user:password@" is only accepted after "scheme://".
static const char scheme_or_www[] =
    "\\b(?:"
    "www\\."
    "|"
    "[a-z][a-z0-9+\\-.]*+://";
static const char scheme_or_www_end[] = ")";
0048 
// Character-class body shared by userInfo and host: the RFC 3986 "unreserved"
// characters, '%' (start of a pct-encoded triplet, not validated any further)
// and the "sub-delims" with the exception of the parentheses.
#define COMMON_1 "a-z0-9\\-._~%!$&'*+,;="

// One unit of text built from CHARS (the body of a character class): a run of
// CHARS optionally followed by a parenthesised group. That group is the only
// capturing group; "(?-1)" recurses into it, so parentheses may nest but are
// only matched when they are balanced.
#define BALANCED_PARENS(CHARS)          \
    "(?:"                               \
    "[" CHARS "]++"                     \
    "(\\((?:[" CHARS "]++|(?-1))*+\\))" \
    "?+"                                \
    ")"

/* clang-format off */
// user:password@  (optional; parentheses are accepted here without balancing)
static const char userInfo[] =
    "(?:"
    "[" COMMON_1 ":()" "]++"
    "@"
    ")?+";

// '[' ... ']' enclosing any mix of hex digits, colons and dots
#define IPv6_literal "\\[[0-9a-fA-F:.]++\\]"

// www.foo.bar or a bracketed IPv6 literal (optional, i.e. may be empty)
static const char host[] =
    "(?:"
    "[" COMMON_1 "]++"
    "|" IPv6_literal
    ")?+";

// :1234  (optional; when present at least one digit is required)
static const char port[] =
    "(?:"
    ":[0-9]+"
    ")?+";

// COMMON_1 extended with ':', '@' and '/', used for path, query and fragment
#define COMMON_2 "a-z0-9\\-._~%!$&'*+,;=:@/"

// /path/to/some/place
static const char path[] =
    "(?:"
    "/" BALANCED_PARENS(COMMON_2) "*+"
    ")?+";

// "?somequery=bar"  ('?' is additionally allowed inside the query)
static const char query[] =
    "(?:"
    "\\?" BALANCED_PARENS(COMMON_2 "?") "*+"
    ")?+";

// "#fragment"  ('?' is additionally allowed inside the fragment)
static const char fragment[] =
    "(?:"
    "#" BALANCED_PARENS(COMMON_2 "?") "*+"
    ")?+";
0063 
0064 using LS1 = QLatin1String;
0065 
0066 const QRegularExpression UrlFilter::FullUrlRegExp(
0067     LS1(scheme_or_www)
0068     + LS1(userInfo)
0069     + LS1(scheme_or_www_end)
0070     + LS1(host)
0071     + LS1(port)
0072     + LS1(path)
0073     + LS1(query)
0074     + LS1(fragment)
0075     , QRegularExpression::CaseInsensitiveOption);
0076 
0077 
0078 /////////////////////////////////////////////
0079 
0080 // email address:
0081 // [word chars, dots or dashes]@[word chars, dots or dashes].[word chars]
0082 const QRegularExpression UrlFilter::EmailAddressRegExp(QStringLiteral("\\b(\\w|\\.|-|\\+)+@(\\w|\\.|-)+\\.\\w+\\b"));
0083 
0084 // matches full url or email address
0085 const QRegularExpression UrlFilter::CompleteUrlRegExp(
0086     QLatin1Char('(') + FullUrlRegExp.pattern() + QLatin1Char('|') + EmailAddressRegExp.pattern()+ QLatin1Char(')'),
0087     QRegularExpression::CaseInsensitiveOption);
0088 
0089 /* clang-format on */
0090 UrlFilter::UrlFilter()
0091 {
0092     setRegExp(CompleteUrlRegExp);
0093 }
0094 
0095 QSharedPointer<HotSpot> UrlFilter::newHotSpot(int startLine, int startColumn, int endLine, int endColumn, const QStringList &capturedTexts)
0096 {
0097     QStringList texts{};
0098 
0099     // remove final single quote
0100     // we want URLs in single quotes like the following to work correctly:
0101     // 'https://en.wikipedia.org/wiki/Earth's_rotation'
0102     for (QString s : capturedTexts) {
0103         QString str{s};
0104         if (s.right(1) == QLatin1String("'")) {
0105             s.chop(1);
0106         }
0107         texts << s;
0108     }
0109     if (capturedTexts[0].right(1) == QLatin1String("'")) {
0110         endColumn--;
0111     }
0112 
0113     return QSharedPointer<HotSpot>(new UrlFilterHotSpot(startLine, startColumn, endLine, endColumn, texts));
0114 }