File indexing completed on 2024-04-28 03:53:49

0001 /*
0002     SPDX-FileCopyrightText: 2002 Dave Corrie <kde@davecorrie.com>
0003     SPDX-FileCopyrightText: 2014 Daniel Vrátil <dvratil@redhat.com>
0004 
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0007 
0008 #include "ktexttohtml.h"
0009 #include "kemoticonsparser_p.h"
0010 #include "ktexttohtml_p.h"
0011 
0012 #include <QCoreApplication>
0013 #include <QFile>
0014 #include <QRegularExpression>
0015 #include <QStringList>
0016 
0017 #include <limits.h>
0018 
0019 KTextToHTMLHelper::KTextToHTMLHelper(const QString &plainText, int pos, int maxUrlLen, int maxAddressLen)
0020     : mText(plainText)
0021     , mMaxUrlLen(maxUrlLen)
0022     , mMaxAddressLen(maxAddressLen)
0023     , mPos(pos)
0024 {
0025 }
0026 
0027 QString KTextToHTMLHelper::getEmailAddress()
0028 {
0029     QString address;
0030 
0031     if (mPos < mText.length() && mText.at(mPos) == QLatin1Char('@')) {
0032         // the following characters are allowed in a dot-atom (RFC 2822):
0033         // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
0034         const QString allowedSpecialChars = QStringLiteral(".!#$%&'*+-/=?^_`{|}~");
0035 
0036         // determine the local part of the email address
0037         int start = mPos - 1;
0038         while (start >= 0 && mText.at(start).unicode() < 128
0039                && (mText.at(start).isLetterOrNumber() //
0040                    || mText.at(start) == QLatin1Char('@') // allow @ to find invalid email addresses
0041                    || allowedSpecialChars.indexOf(mText.at(start)) != -1)) {
0042             if (mText.at(start) == QLatin1Char('@')) {
0043                 return QString(); // local part contains '@' -> no email address
0044             }
0045             --start;
0046         }
0047         ++start;
0048         // we assume that an email address starts with a letter or a digit
0049         while ((start < mPos) && !mText.at(start).isLetterOrNumber()) {
0050             ++start;
0051         }
0052         if (start == mPos) {
0053             return QString(); // local part is empty -> no email address
0054         }
0055 
0056         // determine the domain part of the email address
0057         int dotPos = INT_MAX;
0058         int end = mPos + 1;
0059         while (end < mText.length()
0060                && (mText.at(end).isLetterOrNumber() //
0061                    || mText.at(end) == QLatin1Char('@') // allow @ to find invalid email addresses
0062                    || mText.at(end) == QLatin1Char('.') //
0063                    || mText.at(end) == QLatin1Char('-'))) {
0064             if (mText.at(end) == QLatin1Char('@')) {
0065                 return QString(); // domain part contains '@' -> no email address
0066             }
0067             if (mText.at(end) == QLatin1Char('.')) {
0068                 dotPos = qMin(dotPos, end); // remember index of first dot in domain
0069             }
0070             ++end;
0071         }
0072         // we assume that an email address ends with a letter or a digit
0073         while ((end > mPos) && !mText.at(end - 1).isLetterOrNumber()) {
0074             --end;
0075         }
0076         if (end == mPos) {
0077             return QString(); // domain part is empty -> no email address
0078         }
0079         if (dotPos >= end) {
0080             return QString(); // domain part doesn't contain a dot
0081         }
0082 
0083         if (end - start > mMaxAddressLen) {
0084             return QString(); // too long -> most likely no email address
0085         }
0086         address = mText.mid(start, end - start);
0087 
0088         mPos = end - 1;
0089     }
0090     return address;
0091 }
0092 
0093 QString KTextToHTMLHelper::getPhoneNumber()
0094 {
0095     if (!mText.at(mPos).isDigit() && mText.at(mPos) != QLatin1Char('+')) {
0096         return {};
0097     }
0098 
0099     const QString allowedBeginSeparators = QStringLiteral(" \r\t\n:");
0100     if (mPos > 0 && !allowedBeginSeparators.contains(mText.at(mPos - 1))) {
0101         return {};
0102     }
0103 
0104     // this isn't 100% accurate, we filter stuff below that is too hard to capture with a regexp
0105     static const QRegularExpression telPattern(QStringLiteral(R"([+0](( |( ?[/-] ?)?)\(?\d+\)?+){6,30})"));
0106     const auto match = telPattern.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchorAtOffsetMatchOption);
0107     if (match.hasMatch()) {
0108         QStringView matchedText = match.capturedView();
0109         // check for maximum number of digits (15), see https://en.wikipedia.org/wiki/Telephone_numbering_plan
0110         const int digitsCount = std::count_if(matchedText.cbegin(), matchedText.cend(), [](const QChar c) {
0111             return c.isDigit();
0112         });
0113 
0114         if (digitsCount > 15) {
0115             return {};
0116         }
0117 
0118         // only one / is allowed, otherwise we trigger on dates
0119         if (matchedText.count(QLatin1Char('/')) > 1) {
0120             return {};
0121         }
0122 
0123         // parenthesis need to be balanced, and must not be nested
0124         int openIdx = -1;
0125         for (int i = 0, size = matchedText.size(); i < size; ++i) {
0126             const QChar ch = matchedText.at(i);
0127             if ((ch == QLatin1Char('(') && openIdx >= 0) || (ch == QLatin1Char(')') && openIdx < 0)) {
0128                 return {};
0129             }
0130 
0131             if (ch == QLatin1Char('(')) {
0132                 openIdx = i;
0133             } else if (ch == QLatin1Char(')')) {
0134                 openIdx = -1;
0135             }
0136         }
0137 
0138         if (openIdx > 0) {
0139             matchedText.truncate(openIdx - 1);
0140             matchedText = matchedText.trimmed();
0141         }
0142 
0143         // check if there's a plausible separator at the end
0144         const int matchedTextLength = matchedText.size();
0145         const int endIdx = mPos + matchedTextLength;
0146         if (endIdx < mText.size() && !QStringView(u" \r\t\n,.").contains(mText.at(endIdx))) {
0147             return {};
0148         }
0149 
0150         mPos += matchedTextLength - 1;
0151         return matchedText.toString();
0152     }
0153     return {};
0154 }
0155 
0156 static QString normalizePhoneNumber(const QString &str)
0157 {
0158     QString res;
0159     res.reserve(str.size());
0160     for (const auto c : str) {
0161         if (c.isDigit() || c == QLatin1Char('+')) {
0162             res.push_back(c);
0163         }
0164     }
0165     return res;
0166 }
0167 
0168 // The following characters are allowed in a dot-atom (RFC 2822):
0169 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
0170 static const char s_allowedSpecialChars[] = ".!#$%&'*+-/=?^_`{|}~";
0171 
0172 bool KTextToHTMLHelper::atUrl() const
0173 {
0174     // The character directly before the URL must not be a letter, a number or
0175     // any other character allowed in a dot-atom (RFC 2822).
0176     if (mPos > 0) {
0177         const auto chBefore = mText.at(mPos - 1);
0178         if (chBefore.isLetterOrNumber() || QLatin1String(s_allowedSpecialChars).contains(chBefore)) {
0179             return false;
0180         }
0181     }
0182 
0183     const auto segment = QStringView(mText).mid(mPos);
0184     /* clang-format off */
0185     return segment.startsWith(QLatin1String("http://"))
0186         || segment.startsWith(QLatin1String("https://"))
0187         || segment.startsWith(QLatin1String("vnc://"))
0188         || segment.startsWith(QLatin1String("fish://"))
0189         || segment.startsWith(QLatin1String("ftp://"))
0190         || segment.startsWith(QLatin1String("ftps://"))
0191         || segment.startsWith(QLatin1String("sftp://"))
0192         || segment.startsWith(QLatin1String("smb://"))
0193         || segment.startsWith(QLatin1String("irc://"))
0194         || segment.startsWith(QLatin1String("ircs://"))
0195         || segment.startsWith(QLatin1String("mailto:"))
0196         || segment.startsWith(QLatin1String("www."))
0197         || segment.startsWith(QLatin1String("ftp."))
0198         || segment.startsWith(QLatin1String("file://"))
0199         || segment.startsWith(QLatin1String("news:"))
0200         || segment.startsWith(QLatin1String("tel:"))
0201         || segment.startsWith(QLatin1String("xmpp:"));
0202     /* clang-format on */
0203 }
0204 
0205 bool KTextToHTMLHelper::isEmptyUrl(const QString &url) const
0206 {
0207     /* clang-format off */
0208     return url.isEmpty()
0209         || url == QLatin1String("http://")
0210         || url == QLatin1String("https://")
0211         || url == QLatin1String("fish://")
0212         || url == QLatin1String("ftp://")
0213         || url == QLatin1String("ftps://")
0214         || url == QLatin1String("sftp://")
0215         || url == QLatin1String("smb://")
0216         || url == QLatin1String("vnc://")
0217         || url == QLatin1String("irc://")
0218         || url == QLatin1String("ircs://")
0219         || url == QLatin1String("mailto")
0220         || url == QLatin1String("mailto:")
0221         || url == QLatin1String("www")
0222         || url == QLatin1String("ftp")
0223         || url == QLatin1String("news:")
0224         || url == QLatin1String("news://")
0225         || url == QLatin1String("tel")
0226         || url == QLatin1String("tel:")
0227         || url == QLatin1String("xmpp:");
0228     /* clang-format on */
0229 }
0230 
0231 QString KTextToHTMLHelper::getUrl(bool *badurl)
0232 {
0233     QString url;
0234     if (atUrl()) {
0235         // NOTE: see http://tools.ietf.org/html/rfc3986#appendix-A and especially appendix-C
0236         // Appendix-C mainly says, that when extracting URLs from plain text, line breaks shall
0237         // be allowed and should be ignored when the URI is extracted.
0238 
0239         // This implementation follows this recommendation and
0240         // allows the URL to be enclosed within different kind of brackets/quotes
0241         // If an URL is enclosed, whitespace characters are allowed and removed, otherwise
0242         // the URL ends with the first whitespace
0243         // Also, if the URL is enclosed in brackets, the URL itself is not allowed
0244         // to contain the closing bracket, as this would be detected as the end of the URL
0245 
0246         QChar beforeUrl;
0247         QChar afterUrl;
0248 
0249         // detect if the url has been surrounded by brackets or quotes
0250         if (mPos > 0) {
0251             beforeUrl = mText.at(mPos - 1);
0252 
0253             /*if ( beforeUrl == '(' ) {
0254               afterUrl = ')';
0255             } else */
0256             if (beforeUrl == QLatin1Char('[')) {
0257                 afterUrl = QLatin1Char(']');
0258             } else if (beforeUrl == QLatin1Char('<')) {
0259                 afterUrl = QLatin1Char('>');
0260             } else if (beforeUrl == QLatin1Char('>')) { // for e.g. <link>http://.....</link>
0261                 afterUrl = QLatin1Char('<');
0262             } else if (beforeUrl == QLatin1Char('"')) {
0263                 afterUrl = QLatin1Char('"');
0264             }
0265         }
0266         url.reserve(mMaxUrlLen); // avoid allocs
0267         int start = mPos;
0268         bool previousCharIsSpace = false;
0269         bool previousCharIsADoubleQuote = false;
0270         bool previousIsAnAnchor = false;
0271         /* clang-format off */
0272         while (mPos < mText.length() //
0273                && (mText.at(mPos).isPrint() || mText.at(mPos).isSpace())
0274                && ((afterUrl.isNull() && !mText.at(mPos).isSpace())
0275                    || (!afterUrl.isNull() && mText.at(mPos) != afterUrl))) {
0276             if (!previousCharIsSpace
0277                 && mText.at(mPos) == QLatin1Char('<')
0278                 && (mPos + 1) < mText.length()) { /* clang-format on */
0279                 // Fix Bug #346132: allow "http://www.foo.bar<http://foo.bar/>"
0280                 // < inside a URL is not allowed, however there is a test which
0281                 // checks that "http://some<Host>/path" should be allowed
0282                 // Therefore: check if what follows is another URL and if so, stop here
0283                 mPos++;
0284                 if (atUrl()) {
0285                     mPos--;
0286                     break;
0287                 }
0288                 mPos--;
0289             }
0290             if (!previousCharIsSpace && (mText.at(mPos) == QLatin1Char(' ')) && ((mPos + 1) < mText.length())) {
0291                 // Fix kmail bug: allow "http://www.foo.bar http://foo.bar/"
0292                 // Therefore: check if what follows is another URL and if so, stop here
0293                 mPos++;
0294                 if (atUrl()) {
0295                     mPos--;
0296                     break;
0297                 }
0298                 mPos--;
0299             }
0300             if (mText.at(mPos).isSpace()) {
0301                 previousCharIsSpace = true;
0302             } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char('[')) {
0303                 break;
0304             } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char(']')) {
0305                 break;
0306             } else { // skip whitespace
0307                 if (previousCharIsSpace && mText.at(mPos) == QLatin1Char('<')) {
0308                     url.append(QLatin1Char(' '));
0309                     break;
0310                 }
0311                 previousCharIsSpace = false;
0312                 if (mText.at(mPos) == QLatin1Char('>') && previousCharIsADoubleQuote) {
0313                     // it's an invalid url
0314                     if (badurl) {
0315                         *badurl = true;
0316                     }
0317                     return QString();
0318                 }
0319                 if (mText.at(mPos) == QLatin1Char('"')) {
0320                     previousCharIsADoubleQuote = true;
0321                 } else {
0322                     previousCharIsADoubleQuote = false;
0323                 }
0324                 if (mText.at(mPos) == QLatin1Char('#')) {
0325                     previousIsAnAnchor = true;
0326                 }
0327                 url.append(mText.at(mPos));
0328                 if (url.length() > mMaxUrlLen) {
0329                     break;
0330                 }
0331             }
0332 
0333             ++mPos;
0334         }
0335 
0336         if (isEmptyUrl(url) || (url.length() > mMaxUrlLen)) {
0337             mPos = start;
0338             url.clear();
0339             return url;
0340         } else {
0341             --mPos;
0342         }
0343     }
0344 
0345     // HACK: This is actually against the RFC. However, most people don't properly escape the URL in
0346     //       their text with "" or <>. That leads to people writing an url, followed immediately by
0347     //       a dot to finish the sentence. That would lead the parser to include the dot in the url,
0348     //       even though that is not wanted. So work around that here.
0349     //       Most real-life URLs hopefully don't end with dots or commas.
0350     QString wordBoundaries = QStringLiteral(".,:!?>");
0351     bool hasOpenParenthese = url.contains(QLatin1Char('('));
0352     if (!hasOpenParenthese) {
0353         wordBoundaries += QLatin1Char(')');
0354     }
0355 
0356     if (url.length() > 1) {
0357         do {
0358             const QChar charact{url.at(url.length() - 1)};
0359             if (wordBoundaries.contains(charact)) {
0360                 url.chop(1);
0361                 --mPos;
0362             } else if (hasOpenParenthese && (charact == QLatin1Char(')'))) {
0363                 if (url.length() > 2) {
0364                     if (url.at(url.length() - 2) == QLatin1Char(')')) {
0365                         url.chop(1);
0366                         --mPos;
0367                         hasOpenParenthese = false;
0368                     } else {
0369                         break;
0370                     }
0371                 } else {
0372                     break;
0373                 }
0374             } else {
0375                 break;
0376             }
0377         } while (url.length() > 1);
0378     }
0379     return url;
0380 }
0381 
0382 QString KTextToHTMLHelper::highlightedText()
0383 {
0384     // formating symbols must be prepended with a whitespace
0385     if ((mPos > 0) && !mText.at(mPos - 1).isSpace()) {
0386         return QString();
0387     }
0388 
0389     const QChar ch = mText.at(mPos);
0390     if (ch != QLatin1Char('/') && ch != QLatin1Char('*') && ch != QLatin1Char('_') && ch != QLatin1Char('-')) {
0391         return QString();
0392     }
0393 
0394     const QRegularExpression re(QStringLiteral("\\%1([^\\s|^\\%1].*[^\\s|^\\%1])\\%1").arg(ch), QRegularExpression::InvertedGreedinessOption);
0395     const auto match =
0396         re.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchorAtOffsetMatchOption); // clazy:exclude=use-static-qregularexpression
0397 
0398     if (match.hasMatch()) {
0399         if (match.capturedStart() == mPos) {
0400             int length = match.capturedLength();
0401             // there must be a whitespace after the closing formating symbol
0402             if (mPos + length < mText.length() && !mText.at(mPos + length).isSpace()) {
0403                 return QString();
0404             }
0405             mPos += length - 1;
0406             switch (ch.toLatin1()) {
0407             case '*':
0408                 return QLatin1String("<b>*") + match.capturedView(1) + QLatin1String("*</b>");
0409             case '_':
0410                 return QLatin1String("<u>_") + match.capturedView(1) + QLatin1String("_</u>");
0411             case '/':
0412                 return QLatin1String("<i>/") + match.capturedView(1) + QLatin1String("/</i>");
0413             case '-':
0414                 return QLatin1String("<s>-") + match.capturedView(1) + QLatin1String("-</s>");
0415             }
0416         }
0417     }
0418     return QString();
0419 }
0420 
0421 QString KTextToHTML::convertToHtml(const QString &plainText, const KTextToHTML::Options &flags, int maxUrlLen, int maxAddressLen)
0422 {
0423     KTextToHTMLHelper helper(plainText, 0, maxUrlLen, maxAddressLen);
0424 
0425     QString str;
0426     QString result(static_cast<QChar *>(nullptr), helper.mText.length() * 2);
0427     QChar ch;
0428     int x;
0429     bool startOfLine = true;
0430 
0431     for (helper.mPos = 0, x = 0; helper.mPos < helper.mText.length(); ++helper.mPos, ++x) {
0432         ch = helper.mText.at(helper.mPos);
0433         if (flags & PreserveSpaces) {
0434             if (ch == QLatin1Char(' ')) {
0435                 if (helper.mPos + 1 < helper.mText.length()) {
0436                     if (helper.mText.at(helper.mPos + 1) != QLatin1Char(' ')) {
0437                         // A single space, make it breaking if not at the start or end of the line
0438                         const bool endOfLine = helper.mText.at(helper.mPos + 1) == QLatin1Char('\n');
0439                         if (!startOfLine && !endOfLine) {
0440                             result += QLatin1Char(' ');
0441                         } else {
0442                             result += QLatin1String("&nbsp;");
0443                         }
0444                     } else {
0445                         // Whitespace of more than one space, make it all non-breaking
0446                         while (helper.mPos < helper.mText.length() && helper.mText.at(helper.mPos) == QLatin1Char(' ')) {
0447                             result += QLatin1String("&nbsp;");
0448                             ++helper.mPos;
0449                             ++x;
0450                         }
0451 
0452                         // We incremented once to often, undo that
0453                         --helper.mPos;
0454                         --x;
0455                     }
0456                 } else {
0457                     // Last space in the text, it is non-breaking
0458                     result += QLatin1String("&nbsp;");
0459                 }
0460 
0461                 if (startOfLine) {
0462                     startOfLine = false;
0463                 }
0464                 continue;
0465             } else if (ch == QLatin1Char('\t')) {
0466                 do {
0467                     result += QLatin1String("&nbsp;");
0468                     ++x;
0469                 } while ((x & 7) != 0);
0470                 --x;
0471                 startOfLine = false;
0472                 continue;
0473             }
0474         }
0475         if (ch == QLatin1Char('\n')) {
0476             result += QLatin1String("<br />\n"); // Keep the \n, so apps can figure out the quoting levels correctly.
0477             startOfLine = true;
0478             x = -1;
0479             continue;
0480         }
0481 
0482         startOfLine = false;
0483         if (ch == QLatin1Char('&')) {
0484             result += QLatin1String("&amp;");
0485         } else if (ch == QLatin1Char('"')) {
0486             result += QLatin1String("&quot;");
0487         } else if (ch == QLatin1Char('<')) {
0488             result += QLatin1String("&lt;");
0489         } else if (ch == QLatin1Char('>')) {
0490             result += QLatin1String("&gt;");
0491         } else {
0492             const int start = helper.mPos;
0493             if (!(flags & IgnoreUrls)) {
0494                 bool badUrl = false;
0495                 str = helper.getUrl(&badUrl);
0496                 if (badUrl) {
0497                     QString resultBadUrl;
0498                     for (const QChar chBadUrl : std::as_const(helper.mText)) {
0499                         if (chBadUrl == QLatin1Char('&')) {
0500                             resultBadUrl += QLatin1String("&amp;");
0501                         } else if (chBadUrl == QLatin1Char('"')) {
0502                             resultBadUrl += QLatin1String("&quot;");
0503                         } else if (chBadUrl == QLatin1Char('<')) {
0504                             resultBadUrl += QLatin1String("&lt;");
0505                         } else if (chBadUrl == QLatin1Char('>')) {
0506                             resultBadUrl += QLatin1String("&gt;");
0507                         } else {
0508                             resultBadUrl += chBadUrl;
0509                         }
0510                     }
0511                     return resultBadUrl;
0512                 }
0513                 if (!str.isEmpty()) {
0514                     QString hyperlink;
0515                     if (str.startsWith(QLatin1String("www."))) {
0516                         hyperlink = QLatin1String("http://") + str;
0517                     } else if (str.startsWith(QLatin1String("ftp."))) {
0518                         hyperlink = QLatin1String("ftp://") + str;
0519                     } else {
0520                         hyperlink = str;
0521                     }
0522                     result += QLatin1String("<a href=\"") + hyperlink + QLatin1String("\">") + str.toHtmlEscaped() + QLatin1String("</a>");
0523                     x += helper.mPos - start;
0524                     continue;
0525                 }
0526                 str = helper.getEmailAddress();
0527                 if (!str.isEmpty()) {
0528                     // len is the length of the local part
0529                     int len = str.indexOf(QLatin1Char('@'));
0530                     QString localPart = str.left(len);
0531 
0532                     // remove the local part from the result (as '&'s have been expanded to
0533                     // &amp; we have to take care of the 4 additional characters per '&')
0534                     result.truncate(result.length() - len - (localPart.count(QLatin1Char('&')) * 4));
0535                     x -= len;
0536 
0537                     result += QLatin1String("<a href=\"mailto:") + str + QLatin1String("\">") + str + QLatin1String("</a>");
0538                     x += str.length() - 1;
0539                     continue;
0540                 }
0541                 if (flags & ConvertPhoneNumbers) {
0542                     str = helper.getPhoneNumber();
0543                     if (!str.isEmpty()) {
0544                         result += QLatin1String("<a href=\"tel:") + normalizePhoneNumber(str) + QLatin1String("\">") + str + QLatin1String("</a>");
0545                         x += str.length() - 1;
0546                         continue;
0547                     }
0548                 }
0549             }
0550             if (flags & HighlightText) {
0551                 str = helper.highlightedText();
0552                 if (!str.isEmpty()) {
0553                     result += str;
0554                     x += helper.mPos - start;
0555                     continue;
0556                 }
0557             }
0558             result += ch;
0559         }
0560     }
0561 
0562     if (flags & ReplaceSmileys) {
0563         result = KEmoticonsParser::parseEmoticons(result);
0564     }
0565 
0566     return result;
0567 }