File indexing completed on 2025-01-12 13:02:49
0001 /* 0002 SPDX-FileCopyrightText: 2002 Dave Corrie <kde@davecorrie.com> 0003 SPDX-FileCopyrightText: 2014 Daniel Vrátil <dvratil@redhat.com> 0004 0005 SPDX-License-Identifier: LGPL-2.0-or-later 0006 */ 0007 0008 #include "ruqolaktexttohtml.h" 0009 #include "ruqolaktexttohtml_p.h" 0010 0011 #include <QRegularExpression> 0012 0013 #include <climits> 0014 0015 KTextToHTMLHelper::KTextToHTMLHelper(const QString &plainText, int pos, int maxUrlLen, int maxAddressLen) 0016 : mText(plainText) 0017 , mMaxUrlLen(maxUrlLen) 0018 , mMaxAddressLen(maxAddressLen) 0019 , mPos(pos) 0020 { 0021 } 0022 0023 QString KTextToHTMLHelper::getEmailAddress() 0024 { 0025 QString address; 0026 0027 if (mPos < mText.length() && mText.at(mPos) == QLatin1Char('@')) { 0028 // the following characters are allowed in a dot-atom (RFC 2822): 0029 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~ 0030 const QString allowedSpecialChars = QStringLiteral(".!#$%&'*+-/=?^_`{|}~"); 0031 0032 // determine the local part of the email address 0033 int start = mPos - 1; 0034 while (start >= 0 && mText.at(start).unicode() < 128 0035 && (mText.at(start).isLetterOrNumber() || mText.at(start) == QLatin1Char('@') // allow @ to find invalid email addresses 0036 || allowedSpecialChars.indexOf(mText.at(start)) != -1)) { 0037 if (mText.at(start) == QLatin1Char('@')) { 0038 return {}; // local part contains '@' -> no email address 0039 } 0040 --start; 0041 } 0042 ++start; 0043 // we assume that an email address starts with a letter or a digit 0044 while ((start < mPos) && !mText.at(start).isLetterOrNumber()) { 0045 ++start; 0046 } 0047 if (start == mPos) { 0048 return {}; // local part is empty -> no email address 0049 } 0050 0051 // determine the domain part of the email address 0052 int dotPos = INT_MAX; 0053 int end = mPos + 1; 0054 while (end < mText.length() 0055 && (mText.at(end).isLetterOrNumber() || mText.at(end) == QLatin1Char('@') // allow @ to find invalid email addresses 0056 || mText.at(end) == QLatin1Char('.') || mText.at(end) == QLatin1Char('-'))) { 0057 if (mText.at(end) == QLatin1Char('@')) { 0058 return {}; // domain part contains '@' -> no email address 0059 } 0060 if (mText.at(end) == QLatin1Char('.')) { 0061 dotPos = qMin(dotPos, end); // remember index of first dot in domain 0062 } 0063 ++end; 0064 } 0065 // we assume that an email address ends with a letter or a digit 0066 while ((end > mPos) && !mText.at(end - 1).isLetterOrNumber()) { 0067 --end; 0068 } 0069 if (end == mPos) { 0070 return {}; // domain part is empty -> no email address 0071 } 0072 if (dotPos >= end) { 0073 return {}; // domain part doesn't contain a dot 0074 } 0075 0076 if (end - start > mMaxAddressLen) { 0077 return {}; // too long -> most likely no email address 0078 } 0079 address = mText.mid(start, end - start); 0080 0081 mPos = end - 1; 0082 } 0083 return address; 0084 } 0085 0086 QString KTextToHTMLHelper::getPhoneNumber() 0087 { 0088 if (!mText.at(mPos).isDigit() && mText.at(mPos) != QLatin1Char('+')) { 0089 return {}; 0090 } 0091 0092 const QString allowedBeginSeparators = QStringLiteral(" \r\t\n:"); 0093 if (mPos > 0 && !allowedBeginSeparators.contains(mText.at(mPos - 1))) { 0094 return {}; 0095 } 0096 0097 // this isn't 100% accurate, we filter stuff below that is too hard to capture with a regexp 0098 static const QRegularExpression telPattern(QStringLiteral(R"([+0](( |( ?[/-] ?)?)\(?\d+\)?+){6,30})")); 0099 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0) 0100 const auto match = telPattern.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchoredMatchOption); 0101 #else 0102 const auto match = telPattern.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchorAtOffsetMatchOption); 0103 #endif 0104 if (match.hasMatch()) { 0105 auto m = match.captured(); 0106 // check for maximum number of digits (15), see https://en.wikipedia.org/wiki/Telephone_numbering_plan 0107 if (std::count_if(m.begin(), 0108 m.end(), 0109 [](QChar c) { 0110 return c.isDigit(); 0111 }) 0112 > 15) { 0113 return {}; 0114 } 0115 // only one / is allowed, otherwise we trigger on dates 0116 if (std::count(m.begin(), m.end(), QLatin1Char('/')) > 1) { 0117 return {}; 0118 } 0119 0120 // parenthesis need to be balanced, and must not be nested 0121 int openIdx = -1; 0122 for (int i = 0; i < m.size(); ++i) { 0123 if ((m[i] == QLatin1Char('(') && openIdx >= 0) || (m[i] == QLatin1Char(')') && openIdx < 0)) { 0124 return {}; 0125 } 0126 if (m[i] == QLatin1Char('(')) { 0127 openIdx = i; 0128 } else if (m[i] == QLatin1Char(')')) { 0129 openIdx = -1; 0130 } 0131 } 0132 if (openIdx > 0) { 0133 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0) 0134 m = m.leftRef(openIdx - 1).trimmed().toString(); 0135 #else 0136 m = QStringView(m).left(openIdx - 1).trimmed().toString(); 0137 #endif 0138 } 0139 0140 // check if there's a plausible separator at the end 0141 const QString allowedEndSeparators = QStringLiteral(" \r\t\n,."); 0142 const auto l = m.size(); 0143 if (mText.size() > mPos + l && !allowedEndSeparators.contains(mText.at(mPos + l))) { 0144 return {}; 0145 } 0146 0147 mPos += l - 1; 0148 return m; 0149 } 0150 return {}; 0151 } 0152 0153 static QString normalizePhoneNumber(const QString &str) 0154 { 0155 QString res; 0156 res.reserve(str.size()); 0157 for (const auto c : str) { 0158 if (c.isDigit() || c == QLatin1Char('+')) { 0159 res.push_back(c); 0160 } 0161 } 0162 return res; 0163 } 0164 0165 // The following characters are allowed in a dot-atom (RFC 2822): 0166 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~ 0167 static const char s_allowedSpecialChars[] = ".!#$%&'*+-/=?^_`{|}~"; 0168 0169 bool KTextToHTMLHelper::atUrl() const 0170 { 0171 // The character directly before the URL must not be a letter, a number or 0172 // any other character allowed in a dot-atom (RFC 2822). 0173 if (mPos > 0) { 0174 const auto chBefore = mText.at(mPos - 1); 0175 if (chBefore.isLetterOrNumber() || QString::fromLatin1(s_allowedSpecialChars).contains(chBefore)) { 0176 return false; 0177 } 0178 } 0179 0180 const auto segment = QStringView(mText).mid(mPos); 0181 return segment.startsWith(QLatin1String("http://")) || segment.startsWith(QLatin1String("https://")) || segment.startsWith(QLatin1String("vnc://")) 0182 || segment.startsWith(QLatin1String("fish://")) || segment.startsWith(QLatin1String("ftp://")) || segment.startsWith(QLatin1String("ftps://")) 0183 || segment.startsWith(QLatin1String("sftp://")) || segment.startsWith(QLatin1String("smb://")) || segment.startsWith(QLatin1String("mailto:")) 0184 || segment.startsWith(QLatin1String("www.")) || segment.startsWith(QLatin1String("ftp.")) || segment.startsWith(QLatin1String("file://")) 0185 || segment.startsWith(QLatin1String("news:")) || segment.startsWith(QLatin1String("tel:")) || segment.startsWith(QLatin1String("xmpp:")) 0186 || segment.startsWith(QLatin1String("irc://")) || segment.startsWith(QLatin1String("ircs://")); 0187 } 0188 0189 bool KTextToHTMLHelper::isEmptyUrl(const QString &url) const 0190 { 0191 return url.isEmpty() || url == QLatin1String("http://") || url == QLatin1String("https://") || url == QLatin1String("fish://") 0192 || url == QLatin1String("ftp://") || url == QLatin1String("ftps://") || url == QLatin1String("sftp://") || url == QLatin1String("smb://") 0193 || url == QLatin1String("vnc://") || url == QLatin1String("mailto") || url == QLatin1String("mailto:") || url == QLatin1String("www") 0194 || url == QLatin1String("ftp") || url == QLatin1String("news:") || url == QLatin1String("news://") || url == QLatin1String("tel") 0195 || url == QLatin1String("tel:") || url == QLatin1String("xmpp:") || url == QLatin1String("irc://") || url == QLatin1String("ircs://"); 0196 } 0197 0198 QString KTextToHTMLHelper::getUrl(bool *badurl) 0199 { 0200 QString url; 0201 if (atUrl()) { 0202 // NOTE: see http://tools.ietf.org/html/rfc3986#appendix-A and especially appendix-C 0203 // Appendix-C mainly says, that when extracting URLs from plain text, line breaks shall 0204 // be allowed and should be ignored when the URI is extracted. 0205 0206 // This implementation follows this recommendation and 0207 // allows the URL to be enclosed within different kind of brackets/quotes 0208 // If an URL is enclosed, whitespace characters are allowed and removed, otherwise 0209 // the URL ends with the first whitespace 0210 // Also, if the URL is enclosed in brackets, the URL itself is not allowed 0211 // to contain the closing bracket, as this would be detected as the end of the URL 0212 0213 QChar beforeUrl; 0214 QChar afterUrl; 0215 0216 // detect if the url has been surrounded by brackets or quotes 0217 if (mPos > 0) { 0218 beforeUrl = mText.at(mPos - 1); 0219 0220 /* if ( beforeUrl == QLatin1Char('(') ) { 0221 afterUrl = QLatin1Char(')'); 0222 } else */ 0223 if (beforeUrl == QLatin1Char('[')) { 0224 afterUrl = QLatin1Char(']'); 0225 } else if (beforeUrl == QLatin1Char('<')) { 0226 afterUrl = QLatin1Char('>'); 0227 } else if (beforeUrl == QLatin1Char('>')) { // for e.g. <link>http://.....</link> 0228 afterUrl = QLatin1Char('<'); 0229 } else if (beforeUrl == QLatin1Char('"')) { 0230 afterUrl = QLatin1Char('"'); 0231 } 0232 } 0233 url.reserve(mMaxUrlLen); // avoid allocs 0234 int start = mPos; 0235 bool previousCharIsSpace = false; 0236 bool previousCharIsADoubleQuote = false; 0237 bool previousIsAnAnchor = false; 0238 while ((mPos < mText.length()) && (mText.at(mPos).isPrint() || mText.at(mPos).isSpace()) 0239 && ((afterUrl.isNull() && !mText.at(mPos).isSpace()) || (!afterUrl.isNull() && mText.at(mPos) != afterUrl))) { 0240 if (!previousCharIsSpace && (mText.at(mPos) == QLatin1Char('<')) && ((mPos + 1) < mText.length())) { 0241 // Fix Bug #346132: allow "http://www.foo.bar<http://foo.bar/>" 0242 // < inside a URL is not allowed, however there is a test which 0243 // checks that "http://some<Host>/path" should be allowed 0244 // Therefore: check if what follows is another URL and if so, stop here 0245 mPos++; 0246 if (atUrl()) { 0247 mPos--; 0248 break; 0249 } 0250 mPos--; 0251 } 0252 if (!previousCharIsSpace && (mText.at(mPos) == QLatin1Char(' ')) && ((mPos + 1) < mText.length())) { 0253 // Fix kmail bug: allow "http://www.foo.bar http://foo.bar/" 0254 // Therefore: check if what follows is another URL and if so, stop here 0255 mPos++; 0256 if (atUrl()) { 0257 mPos--; 0258 break; 0259 } 0260 mPos--; 0261 } 0262 if (mText.at(mPos).isSpace()) { 0263 previousCharIsSpace = true; 0264 } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char('[')) { 0265 break; 0266 } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char(']')) { 0267 break; 0268 } else { // skip whitespace 0269 if (previousCharIsSpace && mText.at(mPos) == QLatin1Char('<')) { 0270 url.append(QLatin1Char(' ')); 0271 break; 0272 } 0273 previousCharIsSpace = false; 0274 if (mText.at(mPos) == QLatin1Char('>') && previousCharIsADoubleQuote) { 0275 // it's an invalid url 0276 if (badurl) { 0277 *badurl = true; 0278 } 0279 return {}; 0280 } 0281 if (mText.at(mPos) == QLatin1Char('"')) { 0282 previousCharIsADoubleQuote = true; 0283 } else { 0284 previousCharIsADoubleQuote = false; 0285 } 0286 if (mText.at(mPos) == QLatin1Char('#')) { 0287 previousIsAnAnchor = true; 0288 } 0289 url.append(mText.at(mPos)); 0290 if (url.length() > mMaxUrlLen) { 0291 break; 0292 } 0293 } 0294 0295 ++mPos; 0296 } 0297 0298 if (isEmptyUrl(url) || (url.length() > mMaxUrlLen)) { 0299 mPos = start; 0300 url.clear(); 0301 return url; 0302 } else { 0303 --mPos; 0304 } 0305 } 0306 0307 // HACK: This is actually against the RFC. However, most people don't properly escape the URL in 0308 // their text with "" or <>. That leads to people writing an url, followed immediately by 0309 // a dot to finish the sentence. That would lead the parser to include the dot in the url, 0310 // even though that is not wanted. So work around that here. 0311 // Most real-life URLs hopefully don't end with dots or commas. 0312 QString wordBoundaries = QStringLiteral(".,:!?>"); 0313 bool hasOpenParenthese = url.contains(QLatin1Char('(')); 0314 if (!hasOpenParenthese) { 0315 wordBoundaries += QLatin1Char(')'); 0316 } 0317 if (url.length() > 1) { 0318 do { 0319 const QChar charact{url.at(url.length() - 1)}; 0320 if (wordBoundaries.contains(charact)) { 0321 url.chop(1); 0322 --mPos; 0323 } else if (hasOpenParenthese && (charact == QLatin1Char(')'))) { 0324 if (url.length() > 2) { 0325 if (url.at(url.length() - 2) == QLatin1Char(')')) { 0326 url.chop(1); 0327 --mPos; 0328 hasOpenParenthese = false; 0329 } else { 0330 break; 0331 } 0332 } else { 0333 break; 0334 } 0335 } else { 0336 break; 0337 } 0338 } while (url.length() > 1); 0339 } 0340 return url; 0341 } 0342 0343 QString KTextToHTMLHelper::highlightedText() 0344 { 0345 // formatting symbols must be prepended with a whitespace 0346 if ((mPos > 0) && !mText.at(mPos - 1).isSpace()) { 0347 return {}; 0348 } 0349 0350 const QChar ch = mText.at(mPos); 0351 if (ch != QLatin1Char('~') && ch != QLatin1Char('*') && ch != QLatin1Char('_')) { 0352 return {}; 0353 } 0354 0355 QRegularExpression re(QStringLiteral("\\%1+\\s*([^\\s|^\\%1].*[^\\s|^\\%1])\\s*\\%1+").arg(ch)); 0356 re.setPatternOptions(QRegularExpression::InvertedGreedinessOption); 0357 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0) 0358 const auto match = re.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchoredMatchOption); 0359 #else 0360 const auto match = re.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchorAtOffsetMatchOption); 0361 #endif 0362 0363 if (match.hasMatch()) { 0364 if (match.capturedStart() == mPos) { 0365 const int length = match.capturedLength(); 0366 // there must be a whitespace after the closing formatting symbol 0367 if (mPos + length < mText.length() && !mText.at(mPos + length).isSpace()) { 0368 return {}; 0369 } 0370 mPos += length - 1; 0371 switch (ch.toLatin1()) { 0372 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0) 0373 case '*': 0374 return QStringLiteral("<b>") + match.capturedRef(1) + QStringLiteral("</b>"); 0375 case '_': 0376 return QStringLiteral("<i>") + match.capturedRef(1) + QStringLiteral("</i>"); 0377 case '~': 0378 return QStringLiteral("<s>") + match.capturedRef(1) + QStringLiteral("</s>"); 0379 } 0380 #else 0381 case '*': 0382 return QStringLiteral("<b>") + match.capturedView(1).toString() + QStringLiteral("</b>"); 0383 case '_': 0384 return QStringLiteral("<i>") + match.capturedView(1).toString() + QStringLiteral("</i>"); 0385 case '~': 0386 return QStringLiteral("<s>") + match.capturedView(1).toString() + QStringLiteral("</s>"); 0387 } 0388 #endif 0389 } 0390 } 0391 return {}; 0392 } 0393 0394 QString RuqolaKTextToHTML::convertToHtml(const QString &plainText, RuqolaKTextToHTML::Options flags, int maxUrlLen, int maxAddressLen) 0395 { 0396 KTextToHTMLHelper helper(plainText, 0, maxUrlLen, maxAddressLen); 0397 0398 QString str; 0399 QString result(static_cast<QChar *>(nullptr), helper.mText.length() * 2); 0400 QChar ch; 0401 int x; 0402 bool startOfLine = true; 0403 0404 for (helper.mPos = 0, x = 0; helper.mPos < helper.mText.length(); ++helper.mPos, ++x) { 0405 ch = helper.mText.at(helper.mPos); 0406 if (flags & PreserveSpaces) { 0407 if (ch == QLatin1Char(' ')) { 0408 if (helper.mPos + 1 < helper.mText.length()) { 0409 if (helper.mText.at(helper.mPos + 1) != QLatin1Char(' ')) { 0410 // A single space, make it breaking if not at the start or end of the line 0411 const bool endOfLine = helper.mText.at(helper.mPos + 1) == QLatin1Char('\n'); 0412 if (!startOfLine && !endOfLine) { 0413 result += QLatin1Char(' '); 0414 } else { 0415 result += QLatin1String(" "); 0416 } 0417 } else { 0418 // Whitespace of more than one space, make it all non-breaking 0419 while (helper.mPos < helper.mText.length() && helper.mText.at(helper.mPos) == QLatin1Char(' ')) { 0420 result += QLatin1String(" "); 0421 ++helper.mPos; 0422 ++x; 0423 } 0424 0425 // We incremented once to often, undo that 0426 --helper.mPos; 0427 --x; 0428 } 0429 } else { 0430 // Last space in the text, it is non-breaking 0431 result += QLatin1String(" "); 0432 } 0433 0434 if (startOfLine) { 0435 startOfLine = false; 0436 } 0437 continue; 0438 } else if (ch == QLatin1Char('\t')) { 0439 do { 0440 result += QLatin1String(" "); 0441 ++x; 0442 } while ((x & 7) != 0); 0443 --x; 0444 startOfLine = false; 0445 continue; 0446 } 0447 } 0448 if (ch == QLatin1Char('\n')) { 0449 result += QLatin1String("<br />\n"); // Keep the \n, so apps can figure out the quoting levels correctly. 0450 startOfLine = true; 0451 x = -1; 0452 continue; 0453 } 0454 0455 startOfLine = false; 0456 if (ch == QLatin1Char('&')) { 0457 result += QLatin1String("&"); 0458 } else if (ch == QLatin1Char('"')) { 0459 result += QLatin1String("""); 0460 } else if (ch == QLatin1Char('<')) { 0461 result += QLatin1String("<"); 0462 } else if (ch == QLatin1Char('>')) { 0463 result += QLatin1String(">"); 0464 } else { 0465 const int start = helper.mPos; 0466 if (!(flags & IgnoreUrls)) { 0467 bool badUrl = false; 0468 str = helper.getUrl(&badUrl); 0469 if (badUrl) { 0470 QString resultBadUrl; 0471 const int helperTextSize(helper.mText.length()); 0472 for (int i = 0; i < helperTextSize; ++i) { 0473 const QChar chBadUrl = helper.mText.at(i); 0474 if (chBadUrl == QLatin1Char('&')) { 0475 resultBadUrl += QLatin1String("&"); 0476 } else if (chBadUrl == QLatin1Char('"')) { 0477 resultBadUrl += QLatin1String("""); 0478 } else if (chBadUrl == QLatin1Char('<')) { 0479 resultBadUrl += QLatin1String("<"); 0480 } else if (chBadUrl == QLatin1Char('>')) { 0481 resultBadUrl += QLatin1String(">"); 0482 } else { 0483 resultBadUrl += chBadUrl; 0484 } 0485 } 0486 return resultBadUrl; 0487 } 0488 if (!str.isEmpty()) { 0489 QString hyperlink; 0490 if (str.startsWith(QLatin1String("www."))) { 0491 hyperlink = QLatin1String("http://") + str; 0492 } else if (str.startsWith(QLatin1String("ftp."))) { 0493 hyperlink = QLatin1String("ftp://") + str; 0494 } else { 0495 hyperlink = str; 0496 } 0497 if (hyperlink.endsWith(QLatin1Char('"'))) { 0498 hyperlink.chop(1); 0499 } 0500 if (str.endsWith(QLatin1Char('"'))) { 0501 str.chop(1); 0502 } 0503 result += QLatin1String("<a href=\"") + hyperlink + QLatin1String("\">") + str.toHtmlEscaped() + QLatin1String("</a>"); 0504 x += helper.mPos - start; 0505 continue; 0506 } 0507 str = helper.getEmailAddress(); 0508 if (!str.isEmpty()) { 0509 // len is the length of the local part 0510 int len = str.indexOf(QLatin1Char('@')); 0511 QString localPart = str.left(len); 0512 0513 // remove the local part from the result (as '&'s have been expanded to 0514 // & we have to take care of the 4 additional characters per '&') 0515 result.truncate(result.length() - len - (localPart.count(QLatin1Char('&')) * 4)); 0516 x -= len; 0517 0518 result += QLatin1String("<a href=\"mailto:") + str + QLatin1String("\">") + str + QLatin1String("</a>"); 0519 x += str.length() - 1; 0520 continue; 0521 } 0522 if (flags & ConvertPhoneNumbers) { 0523 str = helper.getPhoneNumber(); 0524 if (!str.isEmpty()) { 0525 result += QLatin1String("<a href=\"tel:") + normalizePhoneNumber(str) + QLatin1String("\">") + str + QLatin1String("</a>"); 0526 x += str.length() - 1; 0527 continue; 0528 } 0529 } 0530 } 0531 if (flags & HighlightText) { 0532 str = helper.highlightedText(); 0533 if (!str.isEmpty()) { 0534 result += str; 0535 x += helper.mPos - start; 0536 continue; 0537 } 0538 } 0539 result += ch; 0540 } 0541 } 0542 0543 return result; 0544 }