File indexing completed on 2024-04-28 03:53:49
0001 /* 0002 SPDX-FileCopyrightText: 2002 Dave Corrie <kde@davecorrie.com> 0003 SPDX-FileCopyrightText: 2014 Daniel Vrátil <dvratil@redhat.com> 0004 0005 SPDX-License-Identifier: LGPL-2.0-or-later 0006 */ 0007 0008 #include "ktexttohtml.h" 0009 #include "kemoticonsparser_p.h" 0010 #include "ktexttohtml_p.h" 0011 0012 #include <QCoreApplication> 0013 #include <QFile> 0014 #include <QRegularExpression> 0015 #include <QStringList> 0016 0017 #include <limits.h> 0018 0019 KTextToHTMLHelper::KTextToHTMLHelper(const QString &plainText, int pos, int maxUrlLen, int maxAddressLen) 0020 : mText(plainText) 0021 , mMaxUrlLen(maxUrlLen) 0022 , mMaxAddressLen(maxAddressLen) 0023 , mPos(pos) 0024 { 0025 } 0026 0027 QString KTextToHTMLHelper::getEmailAddress() 0028 { 0029 QString address; 0030 0031 if (mPos < mText.length() && mText.at(mPos) == QLatin1Char('@')) { 0032 // the following characters are allowed in a dot-atom (RFC 2822): 0033 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~ 0034 const QString allowedSpecialChars = QStringLiteral(".!#$%&'*+-/=?^_`{|}~"); 0035 0036 // determine the local part of the email address 0037 int start = mPos - 1; 0038 while (start >= 0 && mText.at(start).unicode() < 128 0039 && (mText.at(start).isLetterOrNumber() // 0040 || mText.at(start) == QLatin1Char('@') // allow @ to find invalid email addresses 0041 || allowedSpecialChars.indexOf(mText.at(start)) != -1)) { 0042 if (mText.at(start) == QLatin1Char('@')) { 0043 return QString(); // local part contains '@' -> no email address 0044 } 0045 --start; 0046 } 0047 ++start; 0048 // we assume that an email address starts with a letter or a digit 0049 while ((start < mPos) && !mText.at(start).isLetterOrNumber()) { 0050 ++start; 0051 } 0052 if (start == mPos) { 0053 return QString(); // local part is empty -> no email address 0054 } 0055 0056 // determine the domain part of the email address 0057 int dotPos = INT_MAX; 0058 int end = mPos + 1; 0059 while (end < mText.length() 0060 && (mText.at(end).isLetterOrNumber() // 0061 || mText.at(end) == QLatin1Char('@') // allow @ to find invalid email addresses 0062 || mText.at(end) == QLatin1Char('.') // 0063 || mText.at(end) == QLatin1Char('-'))) { 0064 if (mText.at(end) == QLatin1Char('@')) { 0065 return QString(); // domain part contains '@' -> no email address 0066 } 0067 if (mText.at(end) == QLatin1Char('.')) { 0068 dotPos = qMin(dotPos, end); // remember index of first dot in domain 0069 } 0070 ++end; 0071 } 0072 // we assume that an email address ends with a letter or a digit 0073 while ((end > mPos) && !mText.at(end - 1).isLetterOrNumber()) { 0074 --end; 0075 } 0076 if (end == mPos) { 0077 return QString(); // domain part is empty -> no email address 0078 } 0079 if (dotPos >= end) { 0080 return QString(); // domain part doesn't contain a dot 0081 } 0082 0083 if (end - start > mMaxAddressLen) { 0084 return QString(); // too long -> most likely no email address 0085 } 0086 address = mText.mid(start, end - start); 0087 0088 mPos = end - 1; 0089 } 0090 return address; 0091 } 0092 0093 QString KTextToHTMLHelper::getPhoneNumber() 0094 { 0095 if (!mText.at(mPos).isDigit() && mText.at(mPos) != QLatin1Char('+')) { 0096 return {}; 0097 } 0098 0099 const QString allowedBeginSeparators = QStringLiteral(" \r\t\n:"); 0100 if (mPos > 0 && !allowedBeginSeparators.contains(mText.at(mPos - 1))) { 0101 return {}; 0102 } 0103 0104 // this isn't 100% accurate, we filter stuff below that is too hard to capture with a regexp 0105 static const QRegularExpression telPattern(QStringLiteral(R"([+0](( |( ?[/-] ?)?)\(?\d+\)?+){6,30})")); 0106 const auto match = telPattern.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchorAtOffsetMatchOption); 0107 if (match.hasMatch()) { 0108 QStringView matchedText = match.capturedView(); 0109 // check for maximum number of digits (15), see https://en.wikipedia.org/wiki/Telephone_numbering_plan 0110 const int digitsCount = std::count_if(matchedText.cbegin(), matchedText.cend(), [](const QChar c) { 0111 return c.isDigit(); 0112 }); 0113 0114 if (digitsCount > 15) { 0115 return {}; 0116 } 0117 0118 // only one / is allowed, otherwise we trigger on dates 0119 if (matchedText.count(QLatin1Char('/')) > 1) { 0120 return {}; 0121 } 0122 0123 // parenthesis need to be balanced, and must not be nested 0124 int openIdx = -1; 0125 for (int i = 0, size = matchedText.size(); i < size; ++i) { 0126 const QChar ch = matchedText.at(i); 0127 if ((ch == QLatin1Char('(') && openIdx >= 0) || (ch == QLatin1Char(')') && openIdx < 0)) { 0128 return {}; 0129 } 0130 0131 if (ch == QLatin1Char('(')) { 0132 openIdx = i; 0133 } else if (ch == QLatin1Char(')')) { 0134 openIdx = -1; 0135 } 0136 } 0137 0138 if (openIdx > 0) { 0139 matchedText.truncate(openIdx - 1); 0140 matchedText = matchedText.trimmed(); 0141 } 0142 0143 // check if there's a plausible separator at the end 0144 const int matchedTextLength = matchedText.size(); 0145 const int endIdx = mPos + matchedTextLength; 0146 if (endIdx < mText.size() && !QStringView(u" \r\t\n,.").contains(mText.at(endIdx))) { 0147 return {}; 0148 } 0149 0150 mPos += matchedTextLength - 1; 0151 return matchedText.toString(); 0152 } 0153 return {}; 0154 } 0155 0156 static QString normalizePhoneNumber(const QString &str) 0157 { 0158 QString res; 0159 res.reserve(str.size()); 0160 for (const auto c : str) { 0161 if (c.isDigit() || c == QLatin1Char('+')) { 0162 res.push_back(c); 0163 } 0164 } 0165 return res; 0166 } 0167 0168 // The following characters are allowed in a dot-atom (RFC 2822): 0169 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~ 0170 static const char s_allowedSpecialChars[] = ".!#$%&'*+-/=?^_`{|}~"; 0171 0172 bool KTextToHTMLHelper::atUrl() const 0173 { 0174 // The character directly before the URL must not be a letter, a number or 0175 // any other character allowed in a dot-atom (RFC 2822). 0176 if (mPos > 0) { 0177 const auto chBefore = mText.at(mPos - 1); 0178 if (chBefore.isLetterOrNumber() || QLatin1String(s_allowedSpecialChars).contains(chBefore)) { 0179 return false; 0180 } 0181 } 0182 0183 const auto segment = QStringView(mText).mid(mPos); 0184 /* clang-format off */ 0185 return segment.startsWith(QLatin1String("http://")) 0186 || segment.startsWith(QLatin1String("https://")) 0187 || segment.startsWith(QLatin1String("vnc://")) 0188 || segment.startsWith(QLatin1String("fish://")) 0189 || segment.startsWith(QLatin1String("ftp://")) 0190 || segment.startsWith(QLatin1String("ftps://")) 0191 || segment.startsWith(QLatin1String("sftp://")) 0192 || segment.startsWith(QLatin1String("smb://")) 0193 || segment.startsWith(QLatin1String("irc://")) 0194 || segment.startsWith(QLatin1String("ircs://")) 0195 || segment.startsWith(QLatin1String("mailto:")) 0196 || segment.startsWith(QLatin1String("www.")) 0197 || segment.startsWith(QLatin1String("ftp.")) 0198 || segment.startsWith(QLatin1String("file://")) 0199 || segment.startsWith(QLatin1String("news:")) 0200 || segment.startsWith(QLatin1String("tel:")) 0201 || segment.startsWith(QLatin1String("xmpp:")); 0202 /* clang-format on */ 0203 } 0204 0205 bool KTextToHTMLHelper::isEmptyUrl(const QString &url) const 0206 { 0207 /* clang-format off */ 0208 return url.isEmpty() 0209 || url == QLatin1String("http://") 0210 || url == QLatin1String("https://") 0211 || url == QLatin1String("fish://") 0212 || url == QLatin1String("ftp://") 0213 || url == QLatin1String("ftps://") 0214 || url == QLatin1String("sftp://") 0215 || url == QLatin1String("smb://") 0216 || url == QLatin1String("vnc://") 0217 || url == QLatin1String("irc://") 0218 || url == QLatin1String("ircs://") 0219 || url == QLatin1String("mailto") 0220 || url == QLatin1String("mailto:") 0221 || url == QLatin1String("www") 0222 || url == QLatin1String("ftp") 0223 || url == QLatin1String("news:") 0224 || url == QLatin1String("news://") 0225 || url == QLatin1String("tel") 0226 || url == QLatin1String("tel:") 0227 || url == QLatin1String("xmpp:"); 0228 /* clang-format on */ 0229 } 0230 0231 QString KTextToHTMLHelper::getUrl(bool *badurl) 0232 { 0233 QString url; 0234 if (atUrl()) { 0235 // NOTE: see http://tools.ietf.org/html/rfc3986#appendix-A and especially appendix-C 0236 // Appendix-C mainly says, that when extracting URLs from plain text, line breaks shall 0237 // be allowed and should be ignored when the URI is extracted. 0238 0239 // This implementation follows this recommendation and 0240 // allows the URL to be enclosed within different kind of brackets/quotes 0241 // If an URL is enclosed, whitespace characters are allowed and removed, otherwise 0242 // the URL ends with the first whitespace 0243 // Also, if the URL is enclosed in brackets, the URL itself is not allowed 0244 // to contain the closing bracket, as this would be detected as the end of the URL 0245 0246 QChar beforeUrl; 0247 QChar afterUrl; 0248 0249 // detect if the url has been surrounded by brackets or quotes 0250 if (mPos > 0) { 0251 beforeUrl = mText.at(mPos - 1); 0252 0253 /*if ( beforeUrl == '(' ) { 0254 afterUrl = ')'; 0255 } else */ 0256 if (beforeUrl == QLatin1Char('[')) { 0257 afterUrl = QLatin1Char(']'); 0258 } else if (beforeUrl == QLatin1Char('<')) { 0259 afterUrl = QLatin1Char('>'); 0260 } else if (beforeUrl == QLatin1Char('>')) { // for e.g. <link>http://.....</link> 0261 afterUrl = QLatin1Char('<'); 0262 } else if (beforeUrl == QLatin1Char('"')) { 0263 afterUrl = QLatin1Char('"'); 0264 } 0265 } 0266 url.reserve(mMaxUrlLen); // avoid allocs 0267 int start = mPos; 0268 bool previousCharIsSpace = false; 0269 bool previousCharIsADoubleQuote = false; 0270 bool previousIsAnAnchor = false; 0271 /* clang-format off */ 0272 while (mPos < mText.length() // 0273 && (mText.at(mPos).isPrint() || mText.at(mPos).isSpace()) 0274 && ((afterUrl.isNull() && !mText.at(mPos).isSpace()) 0275 || (!afterUrl.isNull() && mText.at(mPos) != afterUrl))) { 0276 if (!previousCharIsSpace 0277 && mText.at(mPos) == QLatin1Char('<') 0278 && (mPos + 1) < mText.length()) { /* clang-format on */ 0279 // Fix Bug #346132: allow "http://www.foo.bar<http://foo.bar/>" 0280 // < inside a URL is not allowed, however there is a test which 0281 // checks that "http://some<Host>/path" should be allowed 0282 // Therefore: check if what follows is another URL and if so, stop here 0283 mPos++; 0284 if (atUrl()) { 0285 mPos--; 0286 break; 0287 } 0288 mPos--; 0289 } 0290 if (!previousCharIsSpace && (mText.at(mPos) == QLatin1Char(' ')) && ((mPos + 1) < mText.length())) { 0291 // Fix kmail bug: allow "http://www.foo.bar http://foo.bar/" 0292 // Therefore: check if what follows is another URL and if so, stop here 0293 mPos++; 0294 if (atUrl()) { 0295 mPos--; 0296 break; 0297 } 0298 mPos--; 0299 } 0300 if (mText.at(mPos).isSpace()) { 0301 previousCharIsSpace = true; 0302 } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char('[')) { 0303 break; 0304 } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char(']')) { 0305 break; 0306 } else { // skip whitespace 0307 if (previousCharIsSpace && mText.at(mPos) == QLatin1Char('<')) { 0308 url.append(QLatin1Char(' ')); 0309 break; 0310 } 0311 previousCharIsSpace = false; 0312 if (mText.at(mPos) == QLatin1Char('>') && previousCharIsADoubleQuote) { 0313 // it's an invalid url 0314 if (badurl) { 0315 *badurl = true; 0316 } 0317 return QString(); 0318 } 0319 if (mText.at(mPos) == QLatin1Char('"')) { 0320 previousCharIsADoubleQuote = true; 0321 } else { 0322 previousCharIsADoubleQuote = false; 0323 } 0324 if (mText.at(mPos) == QLatin1Char('#')) { 0325 previousIsAnAnchor = true; 0326 } 0327 url.append(mText.at(mPos)); 0328 if (url.length() > mMaxUrlLen) { 0329 break; 0330 } 0331 } 0332 0333 ++mPos; 0334 } 0335 0336 if (isEmptyUrl(url) || (url.length() > mMaxUrlLen)) { 0337 mPos = start; 0338 url.clear(); 0339 return url; 0340 } else { 0341 --mPos; 0342 } 0343 } 0344 0345 // HACK: This is actually against the RFC. However, most people don't properly escape the URL in 0346 // their text with "" or <>. That leads to people writing an url, followed immediately by 0347 // a dot to finish the sentence. That would lead the parser to include the dot in the url, 0348 // even though that is not wanted. So work around that here. 0349 // Most real-life URLs hopefully don't end with dots or commas. 0350 QString wordBoundaries = QStringLiteral(".,:!?>"); 0351 bool hasOpenParenthese = url.contains(QLatin1Char('(')); 0352 if (!hasOpenParenthese) { 0353 wordBoundaries += QLatin1Char(')'); 0354 } 0355 0356 if (url.length() > 1) { 0357 do { 0358 const QChar charact{url.at(url.length() - 1)}; 0359 if (wordBoundaries.contains(charact)) { 0360 url.chop(1); 0361 --mPos; 0362 } else if (hasOpenParenthese && (charact == QLatin1Char(')'))) { 0363 if (url.length() > 2) { 0364 if (url.at(url.length() - 2) == QLatin1Char(')')) { 0365 url.chop(1); 0366 --mPos; 0367 hasOpenParenthese = false; 0368 } else { 0369 break; 0370 } 0371 } else { 0372 break; 0373 } 0374 } else { 0375 break; 0376 } 0377 } while (url.length() > 1); 0378 } 0379 return url; 0380 } 0381 0382 QString KTextToHTMLHelper::highlightedText() 0383 { 0384 // formating symbols must be prepended with a whitespace 0385 if ((mPos > 0) && !mText.at(mPos - 1).isSpace()) { 0386 return QString(); 0387 } 0388 0389 const QChar ch = mText.at(mPos); 0390 if (ch != QLatin1Char('/') && ch != QLatin1Char('*') && ch != QLatin1Char('_') && ch != QLatin1Char('-')) { 0391 return QString(); 0392 } 0393 0394 const QRegularExpression re(QStringLiteral("\\%1([^\\s|^\\%1].*[^\\s|^\\%1])\\%1").arg(ch), QRegularExpression::InvertedGreedinessOption); 0395 const auto match = 0396 re.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchorAtOffsetMatchOption); // clazy:exclude=use-static-qregularexpression 0397 0398 if (match.hasMatch()) { 0399 if (match.capturedStart() == mPos) { 0400 int length = match.capturedLength(); 0401 // there must be a whitespace after the closing formating symbol 0402 if (mPos + length < mText.length() && !mText.at(mPos + length).isSpace()) { 0403 return QString(); 0404 } 0405 mPos += length - 1; 0406 switch (ch.toLatin1()) { 0407 case '*': 0408 return QLatin1String("<b>*") + match.capturedView(1) + QLatin1String("*</b>"); 0409 case '_': 0410 return QLatin1String("<u>_") + match.capturedView(1) + QLatin1String("_</u>"); 0411 case '/': 0412 return QLatin1String("<i>/") + match.capturedView(1) + QLatin1String("/</i>"); 0413 case '-': 0414 return QLatin1String("<s>-") + match.capturedView(1) + QLatin1String("-</s>"); 0415 } 0416 } 0417 } 0418 return QString(); 0419 } 0420 0421 QString KTextToHTML::convertToHtml(const QString &plainText, const KTextToHTML::Options &flags, int maxUrlLen, int maxAddressLen) 0422 { 0423 KTextToHTMLHelper helper(plainText, 0, maxUrlLen, maxAddressLen); 0424 0425 QString str; 0426 QString result(static_cast<QChar *>(nullptr), helper.mText.length() * 2); 0427 QChar ch; 0428 int x; 0429 bool startOfLine = true; 0430 0431 for (helper.mPos = 0, x = 0; helper.mPos < helper.mText.length(); ++helper.mPos, ++x) { 0432 ch = helper.mText.at(helper.mPos); 0433 if (flags & PreserveSpaces) { 0434 if (ch == QLatin1Char(' ')) { 0435 if (helper.mPos + 1 < helper.mText.length()) { 0436 if (helper.mText.at(helper.mPos + 1) != QLatin1Char(' ')) { 0437 // A single space, make it breaking if not at the start or end of the line 0438 const bool endOfLine = helper.mText.at(helper.mPos + 1) == QLatin1Char('\n'); 0439 if (!startOfLine && !endOfLine) { 0440 result += QLatin1Char(' '); 0441 } else { 0442 result += QLatin1String(" "); 0443 } 0444 } else { 0445 // Whitespace of more than one space, make it all non-breaking 0446 while (helper.mPos < helper.mText.length() && helper.mText.at(helper.mPos) == QLatin1Char(' ')) { 0447 result += QLatin1String(" "); 0448 ++helper.mPos; 0449 ++x; 0450 } 0451 0452 // We incremented once to often, undo that 0453 --helper.mPos; 0454 --x; 0455 } 0456 } else { 0457 // Last space in the text, it is non-breaking 0458 result += QLatin1String(" "); 0459 } 0460 0461 if (startOfLine) { 0462 startOfLine = false; 0463 } 0464 continue; 0465 } else if (ch == QLatin1Char('\t')) { 0466 do { 0467 result += QLatin1String(" "); 0468 ++x; 0469 } while ((x & 7) != 0); 0470 --x; 0471 startOfLine = false; 0472 continue; 0473 } 0474 } 0475 if (ch == QLatin1Char('\n')) { 0476 result += QLatin1String("<br />\n"); // Keep the \n, so apps can figure out the quoting levels correctly. 0477 startOfLine = true; 0478 x = -1; 0479 continue; 0480 } 0481 0482 startOfLine = false; 0483 if (ch == QLatin1Char('&')) { 0484 result += QLatin1String("&"); 0485 } else if (ch == QLatin1Char('"')) { 0486 result += QLatin1String("""); 0487 } else if (ch == QLatin1Char('<')) { 0488 result += QLatin1String("<"); 0489 } else if (ch == QLatin1Char('>')) { 0490 result += QLatin1String(">"); 0491 } else { 0492 const int start = helper.mPos; 0493 if (!(flags & IgnoreUrls)) { 0494 bool badUrl = false; 0495 str = helper.getUrl(&badUrl); 0496 if (badUrl) { 0497 QString resultBadUrl; 0498 for (const QChar chBadUrl : std::as_const(helper.mText)) { 0499 if (chBadUrl == QLatin1Char('&')) { 0500 resultBadUrl += QLatin1String("&"); 0501 } else if (chBadUrl == QLatin1Char('"')) { 0502 resultBadUrl += QLatin1String("""); 0503 } else if (chBadUrl == QLatin1Char('<')) { 0504 resultBadUrl += QLatin1String("<"); 0505 } else if (chBadUrl == QLatin1Char('>')) { 0506 resultBadUrl += QLatin1String(">"); 0507 } else { 0508 resultBadUrl += chBadUrl; 0509 } 0510 } 0511 return resultBadUrl; 0512 } 0513 if (!str.isEmpty()) { 0514 QString hyperlink; 0515 if (str.startsWith(QLatin1String("www."))) { 0516 hyperlink = QLatin1String("http://") + str; 0517 } else if (str.startsWith(QLatin1String("ftp."))) { 0518 hyperlink = QLatin1String("ftp://") + str; 0519 } else { 0520 hyperlink = str; 0521 } 0522 result += QLatin1String("<a href=\"") + hyperlink + QLatin1String("\">") + str.toHtmlEscaped() + QLatin1String("</a>"); 0523 x += helper.mPos - start; 0524 continue; 0525 } 0526 str = helper.getEmailAddress(); 0527 if (!str.isEmpty()) { 0528 // len is the length of the local part 0529 int len = str.indexOf(QLatin1Char('@')); 0530 QString localPart = str.left(len); 0531 0532 // remove the local part from the result (as '&'s have been expanded to 0533 // & we have to take care of the 4 additional characters per '&') 0534 result.truncate(result.length() - len - (localPart.count(QLatin1Char('&')) * 4)); 0535 x -= len; 0536 0537 result += QLatin1String("<a href=\"mailto:") + str + QLatin1String("\">") + str + QLatin1String("</a>"); 0538 x += str.length() - 1; 0539 continue; 0540 } 0541 if (flags & ConvertPhoneNumbers) { 0542 str = helper.getPhoneNumber(); 0543 if (!str.isEmpty()) { 0544 result += QLatin1String("<a href=\"tel:") + normalizePhoneNumber(str) + QLatin1String("\">") + str + QLatin1String("</a>"); 0545 x += str.length() - 1; 0546 continue; 0547 } 0548 } 0549 } 0550 if (flags & HighlightText) { 0551 str = helper.highlightedText(); 0552 if (!str.isEmpty()) { 0553 result += str; 0554 x += helper.mPos - start; 0555 continue; 0556 } 0557 } 0558 result += ch; 0559 } 0560 } 0561 0562 if (flags & ReplaceSmileys) { 0563 result = KEmoticonsParser::parseEmoticons(result); 0564 } 0565 0566 return result; 0567 }