File indexing completed on 2024-05-12 09:56:58
0001 /* 0002 SPDX-FileCopyrightText: 2007-2008 Robert Knight <robertknight@gmail.com> 0003 SPDX-FileCopyrightText: 2020 Tomaz Canabrava <tcanabrava@gmail.com> 0004 0005 SPDX-License-Identifier: GPL-2.0-or-later 0006 */ 0007 0008 #include "UrlFilter.h" 0009 0010 using namespace Konsole; 0011 0012 #include "UrlFilterHotspot.h" 0013 0014 // Note: Altering these regular expressions can have a major effect on the performance of the filters 0015 // used for finding URLs in the text, especially if they are very general and could match very long 0016 // pieces of text. 0017 // Please be careful when altering them. 0018 0019 // FullUrlRegExp is implemented based on: 0020 // https://datatracker.ietf.org/doc/html/rfc3986 0021 // See above URL for what "unreserved", "pct-encoded" ...etc mean, also 0022 // for the regex used for each part of the url being matched against. 0023 // 0024 // It deviates from rfc3986: 0025 // - We only recognize URIs with authority (even if it is an empty authority) 0026 // - We match URI suffixes starting with 'www.' 0027 // - We allow IPv6 literals right after 'www.', e.g: www.[dead::beef] 0028 // - We _don't_ match IPvFuture addresses 0029 // - We allow any combination of hex digits, colons and dots as IPv6 addresses, 0030 // e.g: https://[::::dead:::beef::123.666.666.666::dead::::beef::::]/foo 0031 // - "port" (':1234'), if present, is assumed to be non-empty 0032 // - We don't check the validity of percent-encoded characters 0033 // (e.g. "www.example.com/foo%XXbar") 0034 // - We do not allow parenthesis in host. 0035 // - We don't recognize URIs with unbalanced parens in path, query or fragment. 0036 // We do this to prevent URIs inside parentheses from getting extended to the closing 0037 // parenthesis. We still recognize unbalanced parens in userInfo, but the 0038 // postfix @ should prevent most ambiguity. 0039 0040 // All non-recursive () groups are non-capturing (by using "(?:)" notation) 0041 // less bookkeeping on the PCRE engine side 0042 0043 // scheme:// 0044 // - Must start with an ASCII letter, preceded by any non-word character, 0045 // so "http" but not "mhttp" 0046 static const char scheme_or_www[] = "\\b(?:www\\.|[a-z][a-z0-9+\\-.]*+://"; 0047 static const char scheme_or_www_end[] = ")"; 0048 0049 // unreserved / pct-encoded / sub-delims 0050 #define COMMON_1 "a-z0-9\\-._~%!$&'*+,;=" 0051 #define BALANCED_PARENS(CHARS) "(?:[" CHARS "]++(\\((?:[" CHARS "]++|(?-1))*+\\))?+)" 0052 0053 /* clang-format off */ 0054 static const char userInfo[] = "(?:[" COMMON_1 ":()" "]++@)?+"; // user:password@ 0055 #define IPv6_literal "\\[[0-9a-fA-F:.]++\\]" 0056 static const char host[] = "(?:[" COMMON_1 "]++|" IPv6_literal ")?+"; // www.foo.bar 0057 static const char port[] = "(?::[0-9]+)?+"; // :1234 0058 0059 #define COMMON_2 "a-z0-9\\-._~%!$&'*+,;=:@/" 0060 static const char path[] = "(?:/" BALANCED_PARENS(COMMON_2) "*+)?+"; // /path/to/some/place 0061 static const char query[] = "(?:\\?" BALANCED_PARENS(COMMON_2 "?") "*+)?+"; // "?somequery=bar" 0062 static const char fragment[] = "(?:#" BALANCED_PARENS(COMMON_2 "?") "*+)?+"; // "#fragment" 0063 0064 using LS1 = QLatin1String; 0065 0066 const QRegularExpression UrlFilter::FullUrlRegExp( 0067 LS1(scheme_or_www) 0068 + LS1(userInfo) 0069 + LS1(scheme_or_www_end) 0070 + LS1(host) 0071 + LS1(port) 0072 + LS1(path) 0073 + LS1(query) 0074 + LS1(fragment) 0075 , QRegularExpression::CaseInsensitiveOption); 0076 0077 0078 ///////////////////////////////////////////// 0079 0080 // email address: 0081 // [word chars, dots or dashes]@[word chars, dots or dashes].[word chars] 0082 const QRegularExpression UrlFilter::EmailAddressRegExp(QStringLiteral("\\b(\\w|\\.|-|\\+)+@(\\w|\\.|-)+\\.\\w+\\b")); 0083 0084 // matches full url or email address 0085 const QRegularExpression UrlFilter::CompleteUrlRegExp( 0086 QLatin1Char('(') + FullUrlRegExp.pattern() + QLatin1Char('|') + EmailAddressRegExp.pattern()+ QLatin1Char(')'), 0087 QRegularExpression::CaseInsensitiveOption); 0088 0089 /* clang-format on */ 0090 UrlFilter::UrlFilter() 0091 { 0092 setRegExp(CompleteUrlRegExp); 0093 } 0094 0095 QSharedPointer<HotSpot> UrlFilter::newHotSpot(int startLine, int startColumn, int endLine, int endColumn, const QStringList &capturedTexts) 0096 { 0097 QStringList texts{}; 0098 0099 // remove final single quote 0100 // we want URLs in single quotes like the following to work correctly: 0101 // 'https://en.wikipedia.org/wiki/Earth's_rotation' 0102 for (QString s : capturedTexts) { 0103 QString str{s}; 0104 if (s.right(1) == QLatin1String("'")) { 0105 s.chop(1); 0106 } 0107 texts << s; 0108 } 0109 if (capturedTexts[0].right(1) == QLatin1String("'")) { 0110 endColumn--; 0111 } 0112 0113 return QSharedPointer<HotSpot>(new UrlFilterHotSpot(startLine, startColumn, endLine, endColumn, texts)); 0114 }