File indexing completed on 2024-12-08 04:33:11
0001 /* 0002 SPDX-FileCopyrightText: 2002 Dave Corrie <kde@davecorrie.com> 0003 SPDX-FileCopyrightText: 2014 Daniel Vrátil <dvratil@redhat.com> 0004 0005 SPDX-License-Identifier: LGPL-2.0-or-later 0006 */ 0007 0008 #include "ruqolaktexttohtml.h" 0009 #include "ruqolaktexttohtml_p.h" 0010 0011 #include <QRegularExpression> 0012 0013 #include <climits> 0014 0015 KTextToHTMLHelper::KTextToHTMLHelper(const QString &plainText, int pos, int maxUrlLen, int maxAddressLen) 0016 : mText(plainText) 0017 , mMaxUrlLen(maxUrlLen) 0018 , mMaxAddressLen(maxAddressLen) 0019 , mPos(pos) 0020 { 0021 } 0022 0023 QString KTextToHTMLHelper::getEmailAddress() 0024 { 0025 QString address; 0026 0027 if (mPos < mText.length() && mText.at(mPos) == QLatin1Char('@')) { 0028 // the following characters are allowed in a dot-atom (RFC 2822): 0029 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~ 0030 const QString allowedSpecialChars = QStringLiteral(".!#$%&'*+-/=?^_`{|}~"); 0031 0032 // determine the local part of the email address 0033 int start = mPos - 1; 0034 while (start >= 0 && mText.at(start).unicode() < 128 0035 && (mText.at(start).isLetterOrNumber() || mText.at(start) == QLatin1Char('@') // allow @ to find invalid email addresses 0036 || allowedSpecialChars.indexOf(mText.at(start)) != -1)) { 0037 if (mText.at(start) == QLatin1Char('@')) { 0038 return {}; // local part contains '@' -> no email address 0039 } 0040 --start; 0041 } 0042 ++start; 0043 // we assume that an email address starts with a letter or a digit 0044 while ((start < mPos) && !mText.at(start).isLetterOrNumber()) { 0045 ++start; 0046 } 0047 if (start == mPos) { 0048 return {}; // local part is empty -> no email address 0049 } 0050 0051 // determine the domain part of the email address 0052 int dotPos = INT_MAX; 0053 int end = mPos + 1; 0054 while (end < mText.length() 0055 && (mText.at(end).isLetterOrNumber() || mText.at(end) == QLatin1Char('@') // allow @ to find invalid email addresses 0056 || mText.at(end) == QLatin1Char('.') || mText.at(end) == QLatin1Char('-'))) { 0057 if (mText.at(end) == QLatin1Char('@')) { 0058 return {}; // domain part contains '@' -> no email address 0059 } 0060 if (mText.at(end) == QLatin1Char('.')) { 0061 dotPos = qMin(dotPos, end); // remember index of first dot in domain 0062 } 0063 ++end; 0064 } 0065 // we assume that an email address ends with a letter or a digit 0066 while ((end > mPos) && !mText.at(end - 1).isLetterOrNumber()) { 0067 --end; 0068 } 0069 if (end == mPos) { 0070 return {}; // domain part is empty -> no email address 0071 } 0072 if (dotPos >= end) { 0073 return {}; // domain part doesn't contain a dot 0074 } 0075 0076 if (end - start > mMaxAddressLen) { 0077 return {}; // too long -> most likely no email address 0078 } 0079 address = mText.mid(start, end - start); 0080 0081 mPos = end - 1; 0082 } 0083 return address; 0084 } 0085 0086 QString KTextToHTMLHelper::getPhoneNumber() 0087 { 0088 if (!mText.at(mPos).isDigit() && mText.at(mPos) != QLatin1Char('+')) { 0089 return {}; 0090 } 0091 0092 const QString allowedBeginSeparators = QStringLiteral(" \r\t\n:"); 0093 if (mPos > 0 && !allowedBeginSeparators.contains(mText.at(mPos - 1))) { 0094 return {}; 0095 } 0096 0097 // this isn't 100% accurate, we filter stuff below that is too hard to capture with a regexp 0098 static const QRegularExpression telPattern(QStringLiteral(R"([+0](( |( ?[/-] ?)?)\(?\d+\)?+){6,30})")); 0099 const auto match = telPattern.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchorAtOffsetMatchOption); 0100 if (match.hasMatch()) { 0101 auto m = match.captured(); 0102 // check for maximum number of digits (15), see https://en.wikipedia.org/wiki/Telephone_numbering_plan 0103 if (std::count_if(m.begin(), 0104 m.end(), 0105 [](QChar c) { 0106 return c.isDigit(); 0107 }) 0108 > 15) { 0109 return {}; 0110 } 0111 // only one / is allowed, otherwise we trigger on dates 0112 if (std::count(m.begin(), m.end(), QLatin1Char('/')) > 1) { 0113 return {}; 0114 } 0115 0116 // parenthesis need to be balanced, and must not be nested 0117 int openIdx = -1; 0118 for (int i = 0; i < m.size(); ++i) { 0119 if ((m[i] == QLatin1Char('(') && openIdx >= 0) || (m[i] == QLatin1Char(')') && openIdx < 0)) { 0120 return {}; 0121 } 0122 if (m[i] == QLatin1Char('(')) { 0123 openIdx = i; 0124 } else if (m[i] == QLatin1Char(')')) { 0125 openIdx = -1; 0126 } 0127 } 0128 if (openIdx > 0) { 0129 m = QStringView(m).left(openIdx - 1).trimmed().toString(); 0130 } 0131 0132 // check if there's a plausible separator at the end 0133 const QString allowedEndSeparators = QStringLiteral(" \r\t\n,."); 0134 const auto l = m.size(); 0135 if (mText.size() > mPos + l && !allowedEndSeparators.contains(mText.at(mPos + l))) { 0136 return {}; 0137 } 0138 0139 mPos += l - 1; 0140 return m; 0141 } 0142 return {}; 0143 } 0144 0145 static QString normalizePhoneNumber(const QString &str) 0146 { 0147 QString res; 0148 res.reserve(str.size()); 0149 for (const auto c : str) { 0150 if (c.isDigit() || c == QLatin1Char('+')) { 0151 res.push_back(c); 0152 } 0153 } 0154 return res; 0155 } 0156 0157 // The following characters are allowed in a dot-atom (RFC 2822): 0158 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~ 0159 static const char s_allowedSpecialChars[] = ".!#$%&'*+-/=?^_`{|}~"; 0160 0161 bool KTextToHTMLHelper::atUrl() const 0162 { 0163 // The character directly before the URL must not be a letter, a number or 0164 // any other character allowed in a dot-atom (RFC 2822). 0165 if (mPos > 0) { 0166 const auto chBefore = mText.at(mPos - 1); 0167 if (chBefore.isLetterOrNumber() || QString::fromLatin1(s_allowedSpecialChars).contains(chBefore)) { 0168 return false; 0169 } 0170 } 0171 0172 const auto segment = QStringView(mText).mid(mPos); 0173 return segment.startsWith(QLatin1String("http://")) || segment.startsWith(QLatin1String("https://")) || segment.startsWith(QLatin1String("vnc://")) 0174 || segment.startsWith(QLatin1String("fish://")) || segment.startsWith(QLatin1String("ftp://")) || segment.startsWith(QLatin1String("ftps://")) 0175 || segment.startsWith(QLatin1String("sftp://")) || segment.startsWith(QLatin1String("smb://")) || segment.startsWith(QLatin1String("mailto:")) 0176 || segment.startsWith(QLatin1String("www.")) || segment.startsWith(QLatin1String("ftp.")) || segment.startsWith(QLatin1String("file://")) 0177 || segment.startsWith(QLatin1String("news:")) || segment.startsWith(QLatin1String("tel:")) || segment.startsWith(QLatin1String("xmpp:")) 0178 || segment.startsWith(QLatin1String("irc://")) || segment.startsWith(QLatin1String("ircs://")); 0179 } 0180 0181 bool KTextToHTMLHelper::isEmptyUrl(const QString &url) const 0182 { 0183 return url.isEmpty() || url == QLatin1String("http://") || url == QLatin1String("https://") || url == QLatin1String("fish://") 0184 || url == QLatin1String("ftp://") || url == QLatin1String("ftps://") || url == QLatin1String("sftp://") || url == QLatin1String("smb://") 0185 || url == QLatin1String("vnc://") || url == QLatin1String("mailto") || url == QLatin1String("mailto:") || url == QLatin1String("www") 0186 || url == QLatin1String("ftp") || url == QLatin1String("news:") || url == QLatin1String("news://") || url == QLatin1String("tel") 0187 || url == QLatin1String("tel:") || url == QLatin1String("xmpp:") || url == QLatin1String("irc://") || url == QLatin1String("ircs://"); 0188 } 0189 0190 QString KTextToHTMLHelper::getUrl(bool *badurl) 0191 { 0192 QString url; 0193 if (atUrl()) { 0194 // NOTE: see http://tools.ietf.org/html/rfc3986#appendix-A and especially appendix-C 0195 // Appendix-C mainly says, that when extracting URLs from plain text, line breaks shall 0196 // be allowed and should be ignored when the URI is extracted. 0197 0198 // This implementation follows this recommendation and 0199 // allows the URL to be enclosed within different kind of brackets/quotes 0200 // If an URL is enclosed, whitespace characters are allowed and removed, otherwise 0201 // the URL ends with the first whitespace 0202 // Also, if the URL is enclosed in brackets, the URL itself is not allowed 0203 // to contain the closing bracket, as this would be detected as the end of the URL 0204 0205 QChar beforeUrl; 0206 QChar afterUrl; 0207 0208 // detect if the url has been surrounded by brackets or quotes 0209 if (mPos > 0) { 0210 beforeUrl = mText.at(mPos - 1); 0211 0212 /* if ( beforeUrl == QLatin1Char('(') ) { 0213 afterUrl = QLatin1Char(')'); 0214 } else */ 0215 if (beforeUrl == QLatin1Char('[')) { 0216 afterUrl = QLatin1Char(']'); 0217 } else if (beforeUrl == QLatin1Char('<')) { 0218 afterUrl = QLatin1Char('>'); 0219 } else if (beforeUrl == QLatin1Char('>')) { // for e.g. <link>http://.....</link> 0220 afterUrl = QLatin1Char('<'); 0221 } else if (beforeUrl == QLatin1Char('"')) { 0222 afterUrl = QLatin1Char('"'); 0223 } 0224 } 0225 url.reserve(mMaxUrlLen); // avoid allocs 0226 int start = mPos; 0227 bool previousCharIsSpace = false; 0228 bool previousCharIsADoubleQuote = false; 0229 bool previousIsAnAnchor = false; 0230 while ((mPos < mText.length()) && (mText.at(mPos).isPrint() || mText.at(mPos).isSpace()) 0231 && ((afterUrl.isNull() && !mText.at(mPos).isSpace()) || (!afterUrl.isNull() && mText.at(mPos) != afterUrl))) { 0232 if (!previousCharIsSpace && (mText.at(mPos) == QLatin1Char('<')) && ((mPos + 1) < mText.length())) { 0233 // Fix Bug #346132: allow "http://www.foo.bar<http://foo.bar/>" 0234 // < inside a URL is not allowed, however there is a test which 0235 // checks that "http://some<Host>/path" should be allowed 0236 // Therefore: check if what follows is another URL and if so, stop here 0237 mPos++; 0238 if (atUrl()) { 0239 mPos--; 0240 break; 0241 } 0242 mPos--; 0243 } 0244 if (!previousCharIsSpace && (mText.at(mPos) == QLatin1Char(' ')) && ((mPos + 1) < mText.length())) { 0245 // Fix kmail bug: allow "http://www.foo.bar http://foo.bar/" 0246 // Therefore: check if what follows is another URL and if so, stop here 0247 mPos++; 0248 if (atUrl()) { 0249 mPos--; 0250 break; 0251 } 0252 mPos--; 0253 } 0254 if (mText.at(mPos).isSpace()) { 0255 previousCharIsSpace = true; 0256 } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char('[')) { 0257 break; 0258 } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char(']')) { 0259 break; 0260 } else { // skip whitespace 0261 if (previousCharIsSpace && mText.at(mPos) == QLatin1Char('<')) { 0262 url.append(QLatin1Char(' ')); 0263 break; 0264 } 0265 previousCharIsSpace = false; 0266 if (mText.at(mPos) == QLatin1Char('>') && previousCharIsADoubleQuote) { 0267 // it's an invalid url 0268 if (badurl) { 0269 *badurl = true; 0270 } 0271 return {}; 0272 } 0273 if (mText.at(mPos) == QLatin1Char('"')) { 0274 previousCharIsADoubleQuote = true; 0275 } else { 0276 previousCharIsADoubleQuote = false; 0277 } 0278 if (mText.at(mPos) == QLatin1Char('#')) { 0279 previousIsAnAnchor = true; 0280 } 0281 url.append(mText.at(mPos)); 0282 if (url.length() > mMaxUrlLen) { 0283 break; 0284 } 0285 } 0286 0287 ++mPos; 0288 } 0289 0290 if (isEmptyUrl(url) || (url.length() > mMaxUrlLen)) { 0291 mPos = start; 0292 url.clear(); 0293 return url; 0294 } else { 0295 --mPos; 0296 } 0297 } 0298 0299 // HACK: This is actually against the RFC. However, most people don't properly escape the URL in 0300 // their text with "" or <>. That leads to people writing an url, followed immediately by 0301 // a dot to finish the sentence. That would lead the parser to include the dot in the url, 0302 // even though that is not wanted. So work around that here. 0303 // Most real-life URLs hopefully don't end with dots or commas. 0304 QString wordBoundaries = QStringLiteral(".,:!?>"); 0305 bool hasOpenParenthese = url.contains(QLatin1Char('(')); 0306 if (!hasOpenParenthese) { 0307 wordBoundaries += QLatin1Char(')'); 0308 } 0309 if (url.length() > 1) { 0310 do { 0311 const QChar charact{url.at(url.length() - 1)}; 0312 if (wordBoundaries.contains(charact)) { 0313 url.chop(1); 0314 --mPos; 0315 } else if (hasOpenParenthese && (charact == QLatin1Char(')'))) { 0316 if (url.length() > 2) { 0317 if (url.at(url.length() - 2) == QLatin1Char(')')) { 0318 url.chop(1); 0319 --mPos; 0320 hasOpenParenthese = false; 0321 } else { 0322 break; 0323 } 0324 } else { 0325 break; 0326 } 0327 } else { 0328 break; 0329 } 0330 } while (url.length() > 1); 0331 } 0332 return url; 0333 } 0334 0335 QString KTextToHTMLHelper::highlightedText() 0336 { 0337 // formatting symbols must be prepended with a whitespace 0338 if ((mPos > 0) && !mText.at(mPos - 1).isSpace()) { 0339 return {}; 0340 } 0341 0342 const QChar ch = mText.at(mPos); 0343 if (ch != QLatin1Char('~') && ch != QLatin1Char('*') && ch != QLatin1Char('_')) { 0344 return {}; 0345 } 0346 0347 QRegularExpression re(QStringLiteral("\\%1+\\s*([^\\s|^\\%1].*[^\\s|^\\%1])\\s*\\%1+").arg(ch)); 0348 re.setPatternOptions(QRegularExpression::InvertedGreedinessOption); 0349 const auto match = re.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchorAtOffsetMatchOption); 0350 0351 if (match.hasMatch()) { 0352 if (match.capturedStart() == mPos) { 0353 const int length = match.capturedLength(); 0354 // there must be a whitespace after the closing formatting symbol 0355 if (mPos + length < mText.length() && !mText.at(mPos + length).isSpace()) { 0356 return {}; 0357 } 0358 mPos += length - 1; 0359 switch (ch.toLatin1()) { 0360 case '*': 0361 return QStringLiteral("<b>") + match.capturedView(1).toString() + QStringLiteral("</b>"); 0362 case '_': 0363 return QStringLiteral("<i>") + match.capturedView(1).toString() + QStringLiteral("</i>"); 0364 case '~': 0365 return QStringLiteral("<s>") + match.capturedView(1).toString() + QStringLiteral("</s>"); 0366 } 0367 } 0368 } 0369 return {}; 0370 } 0371 0372 QString RuqolaKTextToHTML::convertToHtml(const QString &plainText, RuqolaKTextToHTML::Options flags, int maxUrlLen, int maxAddressLen) 0373 { 0374 KTextToHTMLHelper helper(plainText, 0, maxUrlLen, maxAddressLen); 0375 0376 QString str; 0377 QString result(static_cast<QChar *>(nullptr), helper.mText.length() * 2); 0378 QChar ch; 0379 int x; 0380 bool startOfLine = true; 0381 0382 for (helper.mPos = 0, x = 0; helper.mPos < helper.mText.length(); ++helper.mPos, ++x) { 0383 ch = helper.mText.at(helper.mPos); 0384 if (flags & PreserveSpaces) { 0385 if (ch == QLatin1Char(' ')) { 0386 if (helper.mPos + 1 < helper.mText.length()) { 0387 if (helper.mText.at(helper.mPos + 1) != QLatin1Char(' ')) { 0388 // A single space, make it breaking if not at the start or end of the line 0389 const bool endOfLine = helper.mText.at(helper.mPos + 1) == QLatin1Char('\n'); 0390 if (!startOfLine && !endOfLine) { 0391 result += QLatin1Char(' '); 0392 } else { 0393 result += QLatin1String(" "); 0394 } 0395 } else { 0396 // Whitespace of more than one space, make it all non-breaking 0397 while (helper.mPos < helper.mText.length() && helper.mText.at(helper.mPos) == QLatin1Char(' ')) { 0398 result += QLatin1String(" "); 0399 ++helper.mPos; 0400 ++x; 0401 } 0402 0403 // We incremented once to often, undo that 0404 --helper.mPos; 0405 --x; 0406 } 0407 } else { 0408 // Last space in the text, it is non-breaking 0409 result += QLatin1String(" "); 0410 } 0411 0412 if (startOfLine) { 0413 startOfLine = false; 0414 } 0415 continue; 0416 } else if (ch == QLatin1Char('\t')) { 0417 do { 0418 result += QLatin1String(" "); 0419 ++x; 0420 } while ((x & 7) != 0); 0421 --x; 0422 startOfLine = false; 0423 continue; 0424 } 0425 } 0426 if (ch == QLatin1Char('\n')) { 0427 result += QLatin1String("<br />\n"); // Keep the \n, so apps can figure out the quoting levels correctly. 0428 startOfLine = true; 0429 x = -1; 0430 continue; 0431 } 0432 0433 startOfLine = false; 0434 if (ch == QLatin1Char('&')) { 0435 result += QLatin1String("&"); 0436 } else if (ch == QLatin1Char('"')) { 0437 result += QLatin1String("""); 0438 } else if (ch == QLatin1Char('<')) { 0439 result += QLatin1String("<"); 0440 } else if (ch == QLatin1Char('>')) { 0441 result += QLatin1String(">"); 0442 } else { 0443 const int start = helper.mPos; 0444 if (!(flags & IgnoreUrls)) { 0445 bool badUrl = false; 0446 str = helper.getUrl(&badUrl); 0447 if (badUrl) { 0448 QString resultBadUrl; 0449 const int helperTextSize(helper.mText.length()); 0450 for (int i = 0; i < helperTextSize; ++i) { 0451 const QChar chBadUrl = helper.mText.at(i); 0452 if (chBadUrl == QLatin1Char('&')) { 0453 resultBadUrl += QLatin1String("&"); 0454 } else if (chBadUrl == QLatin1Char('"')) { 0455 resultBadUrl += QLatin1String("""); 0456 } else if (chBadUrl == QLatin1Char('<')) { 0457 resultBadUrl += QLatin1String("<"); 0458 } else if (chBadUrl == QLatin1Char('>')) { 0459 resultBadUrl += QLatin1String(">"); 0460 } else { 0461 resultBadUrl += chBadUrl; 0462 } 0463 } 0464 return resultBadUrl; 0465 } 0466 if (!str.isEmpty()) { 0467 QString hyperlink; 0468 if (str.startsWith(QLatin1String("www."))) { 0469 hyperlink = QLatin1String("http://") + str; 0470 } else if (str.startsWith(QLatin1String("ftp."))) { 0471 hyperlink = QLatin1String("ftp://") + str; 0472 } else { 0473 hyperlink = str; 0474 } 0475 if (hyperlink.endsWith(QLatin1Char('"'))) { 0476 hyperlink.chop(1); 0477 } 0478 if (str.endsWith(QLatin1Char('"'))) { 0479 str.chop(1); 0480 } 0481 result += QLatin1String("<a href=\"") + hyperlink + QLatin1String("\">") + str.toHtmlEscaped() + QLatin1String("</a>"); 0482 x += helper.mPos - start; 0483 continue; 0484 } 0485 str = helper.getEmailAddress(); 0486 if (!str.isEmpty()) { 0487 // len is the length of the local part 0488 int len = str.indexOf(QLatin1Char('@')); 0489 QString localPart = str.left(len); 0490 0491 // remove the local part from the result (as '&'s have been expanded to 0492 // & we have to take care of the 4 additional characters per '&') 0493 result.truncate(result.length() - len - (localPart.count(QLatin1Char('&')) * 4)); 0494 x -= len; 0495 0496 result += QLatin1String("<a href=\"mailto:") + str + QLatin1String("\">") + str + QLatin1String("</a>"); 0497 x += str.length() - 1; 0498 continue; 0499 } 0500 if (flags & ConvertPhoneNumbers) { 0501 str = helper.getPhoneNumber(); 0502 if (!str.isEmpty()) { 0503 result += QLatin1String("<a href=\"tel:") + normalizePhoneNumber(str) + QLatin1String("\">") + str + QLatin1String("</a>"); 0504 x += str.length() - 1; 0505 continue; 0506 } 0507 } 0508 } 0509 if (flags & HighlightText) { 0510 str = helper.highlightedText(); 0511 if (!str.isEmpty()) { 0512 result += str; 0513 x += helper.mPos - start; 0514 continue; 0515 } 0516 } 0517 result += ch; 0518 } 0519 } 0520 0521 return result; 0522 }