core/ktexttohtmlfork/ruqolaktexttohtml.cpp

0001 /*
0002     SPDX-FileCopyrightText: 2002 Dave Corrie <kde@davecorrie.com>
0003     SPDX-FileCopyrightText: 2014 Daniel Vrátil <dvratil@redhat.com>
0004
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0007
0008 #include "ruqolaktexttohtml.h"
0009 #include "ruqolaktexttohtml_p.h"
0010
0011 #include <QRegularExpression>
0012
0013 #include <climits>
0014
0015 KTextToHTMLHelper::KTextToHTMLHelper(const QString &plainText, int pos, int maxUrlLen, int maxAddressLen)
0016     : mText(plainText)
0017     , mMaxUrlLen(maxUrlLen)
0018     , mMaxAddressLen(maxAddressLen)
0019     , mPos(pos)
0020 {
0021 }
0022
0023 QString KTextToHTMLHelper::getEmailAddress()
0024 {
0025     QString address;
0026
0027     if (mPos < mText.length() && mText.at(mPos) == QLatin1Char('@')) {
0028         // the following characters are allowed in a dot-atom (RFC 2822):
0029         // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
0030         const QString allowedSpecialChars = QStringLiteral(".!#$%&'*+-/=?^_`{|}~");
0031
0032         // determine the local part of the email address
0033         int start = mPos - 1;
0034         while (start >= 0 && mText.at(start).unicode() < 128
0035                && (mText.at(start).isLetterOrNumber() || mText.at(start) == QLatin1Char('@') // allow @ to find invalid email addresses
0036                    || allowedSpecialChars.indexOf(mText.at(start)) != -1)) {
0037             if (mText.at(start) == QLatin1Char('@')) {
0038                 return {}; // local part contains '@' -> no email address
0039             }
0040             --start;
0041         }
0042         ++start;
0043         // we assume that an email address starts with a letter or a digit
0044         while ((start < mPos) && !mText.at(start).isLetterOrNumber()) {
0045             ++start;
0046         }
0047         if (start == mPos) {
0048             return {}; // local part is empty -> no email address
0049         }
0050
0051         // determine the domain part of the email address
0052         int dotPos = INT_MAX;
0053         int end = mPos + 1;
0054         while (end < mText.length()
0055                && (mText.at(end).isLetterOrNumber() || mText.at(end) == QLatin1Char('@') // allow @ to find invalid email addresses
0056                    || mText.at(end) == QLatin1Char('.') || mText.at(end) == QLatin1Char('-'))) {
0057             if (mText.at(end) == QLatin1Char('@')) {
0058                 return {}; // domain part contains '@' -> no email address
0059             }
0060             if (mText.at(end) == QLatin1Char('.')) {
0061                 dotPos = qMin(dotPos, end); // remember index of first dot in domain
0062             }
0063             ++end;
0064         }
0065         // we assume that an email address ends with a letter or a digit
0066         while ((end > mPos) && !mText.at(end - 1).isLetterOrNumber()) {
0067             --end;
0068         }
0069         if (end == mPos) {
0070             return {}; // domain part is empty -> no email address
0071         }
0072         if (dotPos >= end) {
0073             return {}; // domain part doesn't contain a dot
0074         }
0075
0076         if (end - start > mMaxAddressLen) {
0077             return {}; // too long -> most likely no email address
0078         }
0079         address = mText.mid(start, end - start);
0080
0081         mPos = end - 1;
0082     }
0083     return address;
0084 }
0085
0086 QString KTextToHTMLHelper::getPhoneNumber()
0087 {
0088     if (!mText.at(mPos).isDigit() && mText.at(mPos) != QLatin1Char('+')) {
0089         return {};
0090     }
0091
0092     const QString allowedBeginSeparators = QStringLiteral(" \r\t\n:");
0093     if (mPos > 0 && !allowedBeginSeparators.contains(mText.at(mPos - 1))) {
0094         return {};
0095     }
0096
0097     // this isn't 100% accurate, we filter stuff below that is too hard to capture with a regexp
0098     static const QRegularExpression telPattern(QStringLiteral(R"([+0](( |( ?[/-] ?)?)\(?\d+\)?+){6,30})"));
0099 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
0100     const auto match = telPattern.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchoredMatchOption);
0101 #else
0102     const auto match = telPattern.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchorAtOffsetMatchOption);
0103 #endif
0104     if (match.hasMatch()) {
0105         auto m = match.captured();
0106         // check for maximum number of digits (15), see https://en.wikipedia.org/wiki/Telephone_numbering_plan
0107         if (std::count_if(m.begin(),
0108                           m.end(),
0109                           [](QChar c) {
0110                               return c.isDigit();
0111                           })
0112             > 15) {
0113             return {};
0114         }
0115         // only one / is allowed, otherwise we trigger on dates
0116         if (std::count(m.begin(), m.end(), QLatin1Char('/')) > 1) {
0117             return {};
0118         }
0119
0120         // parenthesis need to be balanced, and must not be nested
0121         int openIdx = -1;
0122         for (int i = 0; i < m.size(); ++i) {
0123             if ((m[i] == QLatin1Char('(') && openIdx >= 0) || (m[i] == QLatin1Char(')') && openIdx < 0)) {
0124                 return {};
0125             }
0126             if (m[i] == QLatin1Char('(')) {
0127                 openIdx = i;
0128             } else if (m[i] == QLatin1Char(')')) {
0129                 openIdx = -1;
0130             }
0131         }
0132         if (openIdx > 0) {
0133 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
0134             m = m.leftRef(openIdx - 1).trimmed().toString();
0135 #else
0136             m = QStringView(m).left(openIdx - 1).trimmed().toString();
0137 #endif
0138         }
0139
0140         // check if there's a plausible separator at the end
0141         const QString allowedEndSeparators = QStringLiteral(" \r\t\n,.");
0142         const auto l = m.size();
0143         if (mText.size() > mPos + l && !allowedEndSeparators.contains(mText.at(mPos + l))) {
0144             return {};
0145         }
0146
0147         mPos += l - 1;
0148         return m;
0149     }
0150     return {};
0151 }
0152
0153 static QString normalizePhoneNumber(const QString &str)
0154 {
0155     QString res;
0156     res.reserve(str.size());
0157     for (const auto c : str) {
0158         if (c.isDigit() || c == QLatin1Char('+')) {
0159             res.push_back(c);
0160         }
0161     }
0162     return res;
0163 }
0164
0165 // The following characters are allowed in a dot-atom (RFC 2822):
0166 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
0167 static const char s_allowedSpecialChars[] = ".!#$%&'*+-/=?^_`{|}~";
0168
0169 bool KTextToHTMLHelper::atUrl() const
0170 {
0171     // The character directly before the URL must not be a letter, a number or
0172     // any other character allowed in a dot-atom (RFC 2822).
0173     if (mPos > 0) {
0174         const auto chBefore = mText.at(mPos - 1);
0175         if (chBefore.isLetterOrNumber() || QString::fromLatin1(s_allowedSpecialChars).contains(chBefore)) {
0176             return false;
0177         }
0178     }
0179
0180     const auto segment = QStringView(mText).mid(mPos);
0181     return segment.startsWith(QLatin1String("http://")) || segment.startsWith(QLatin1String("https://")) || segment.startsWith(QLatin1String("vnc://"))
0182         || segment.startsWith(QLatin1String("fish://")) || segment.startsWith(QLatin1String("ftp://")) || segment.startsWith(QLatin1String("ftps://"))
0183         || segment.startsWith(QLatin1String("sftp://")) || segment.startsWith(QLatin1String("smb://")) || segment.startsWith(QLatin1String("mailto:"))
0184         || segment.startsWith(QLatin1String("www.")) || segment.startsWith(QLatin1String("ftp.")) || segment.startsWith(QLatin1String("file://"))
0185         || segment.startsWith(QLatin1String("news:")) || segment.startsWith(QLatin1String("tel:")) || segment.startsWith(QLatin1String("xmpp:"))
0186         || segment.startsWith(QLatin1String("irc://")) || segment.startsWith(QLatin1String("ircs://"));
0187 }
0188
0189 bool KTextToHTMLHelper::isEmptyUrl(const QString &url) const
0190 {
0191     return url.isEmpty() || url == QLatin1String("http://") || url == QLatin1String("https://") || url == QLatin1String("fish://")
0192         || url == QLatin1String("ftp://") || url == QLatin1String("ftps://") || url == QLatin1String("sftp://") || url == QLatin1String("smb://")
0193         || url == QLatin1String("vnc://") || url == QLatin1String("mailto") || url == QLatin1String("mailto:") || url == QLatin1String("www")
0194         || url == QLatin1String("ftp") || url == QLatin1String("news:") || url == QLatin1String("news://") || url == QLatin1String("tel")
0195         || url == QLatin1String("tel:") || url == QLatin1String("xmpp:") || url == QLatin1String("irc://") || url == QLatin1String("ircs://");
0196 }
0197
0198 QString KTextToHTMLHelper::getUrl(bool *badurl)
0199 {
0200     QString url;
0201     if (atUrl()) {
0202         // NOTE: see http://tools.ietf.org/html/rfc3986#appendix-A and especially appendix-C
0203         // Appendix-C mainly says, that when extracting URLs from plain text, line breaks shall
0204         // be allowed and should be ignored when the URI is extracted.
0205
0206         // This implementation follows this recommendation and
0207         // allows the URL to be enclosed within different kind of brackets/quotes
0208         // If an URL is enclosed, whitespace characters are allowed and removed, otherwise
0209         // the URL ends with the first whitespace
0210         // Also, if the URL is enclosed in brackets, the URL itself is not allowed
0211         // to contain the closing bracket, as this would be detected as the end of the URL
0212
0213         QChar beforeUrl;
0214         QChar afterUrl;
0215
0216         // detect if the url has been surrounded by brackets or quotes
0217         if (mPos > 0) {
0218             beforeUrl = mText.at(mPos - 1);
0219
0220             /* if ( beforeUrl == QLatin1Char('(') ) {
0221               afterUrl = QLatin1Char(')');
0222             } else */
0223             if (beforeUrl == QLatin1Char('[')) {
0224                 afterUrl = QLatin1Char(']');
0225             } else if (beforeUrl == QLatin1Char('<')) {
0226                 afterUrl = QLatin1Char('>');
0227             } else if (beforeUrl == QLatin1Char('>')) { // for e.g. <link>http://.....</link>
0228                 afterUrl = QLatin1Char('<');
0229             } else if (beforeUrl == QLatin1Char('"')) {
0230                 afterUrl = QLatin1Char('"');
0231             }
0232         }
0233         url.reserve(mMaxUrlLen); // avoid allocs
0234         int start = mPos;
0235         bool previousCharIsSpace = false;
0236         bool previousCharIsADoubleQuote = false;
0237         bool previousIsAnAnchor = false;
0238         while ((mPos < mText.length()) && (mText.at(mPos).isPrint() || mText.at(mPos).isSpace())
0239                && ((afterUrl.isNull() && !mText.at(mPos).isSpace()) || (!afterUrl.isNull() && mText.at(mPos) != afterUrl))) {
0240             if (!previousCharIsSpace && (mText.at(mPos) == QLatin1Char('<')) && ((mPos + 1) < mText.length())) {
0241                 // Fix Bug #346132: allow "http://www.foo.bar<http://foo.bar/>"
0242                 // < inside a URL is not allowed, however there is a test which
0243                 // checks that "http://some<Host>/path" should be allowed
0244                 // Therefore: check if what follows is another URL and if so, stop here
0245                 mPos++;
0246                 if (atUrl()) {
0247                     mPos--;
0248                     break;
0249                 }
0250                 mPos--;
0251             }
0252             if (!previousCharIsSpace && (mText.at(mPos) == QLatin1Char(' ')) && ((mPos + 1) < mText.length())) {
0253                 // Fix kmail bug: allow "http://www.foo.bar http://foo.bar/"
0254                 // Therefore: check if what follows is another URL and if so, stop here
0255                 mPos++;
0256                 if (atUrl()) {
0257                     mPos--;
0258                     break;
0259                 }
0260                 mPos--;
0261             }
0262             if (mText.at(mPos).isSpace()) {
0263                 previousCharIsSpace = true;
0264             } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char('[')) {
0265                 break;
0266             } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char(']')) {
0267                 break;
0268             } else { // skip whitespace
0269                 if (previousCharIsSpace && mText.at(mPos) == QLatin1Char('<')) {
0270                     url.append(QLatin1Char(' '));
0271                     break;
0272                 }
0273                 previousCharIsSpace = false;
0274                 if (mText.at(mPos) == QLatin1Char('>') && previousCharIsADoubleQuote) {
0275                     // it's an invalid url
0276                     if (badurl) {
0277                         *badurl = true;
0278                     }
0279                     return {};
0280                 }
0281                 if (mText.at(mPos) == QLatin1Char('"')) {
0282                     previousCharIsADoubleQuote = true;
0283                 } else {
0284                     previousCharIsADoubleQuote = false;
0285                 }
0286                 if (mText.at(mPos) == QLatin1Char('#')) {
0287                     previousIsAnAnchor = true;
0288                 }
0289                 url.append(mText.at(mPos));
0290                 if (url.length() > mMaxUrlLen) {
0291                     break;
0292                 }
0293             }
0294
0295             ++mPos;
0296         }
0297
0298         if (isEmptyUrl(url) || (url.length() > mMaxUrlLen)) {
0299             mPos = start;
0300             url.clear();
0301             return url;
0302         } else {
0303             --mPos;
0304         }
0305     }
0306
0307     // HACK: This is actually against the RFC. However, most people don't properly escape the URL in
0308     //       their text with "" or <>. That leads to people writing an url, followed immediately by
0309     //       a dot to finish the sentence. That would lead the parser to include the dot in the url,
0310     //       even though that is not wanted. So work around that here.
0311     //       Most real-life URLs hopefully don't end with dots or commas.
0312     QString wordBoundaries = QStringLiteral(".,:!?>");
0313     bool hasOpenParenthese = url.contains(QLatin1Char('('));
0314     if (!hasOpenParenthese) {
0315         wordBoundaries += QLatin1Char(')');
0316     }
0317     if (url.length() > 1) {
0318         do {
0319             const QChar charact{url.at(url.length() - 1)};
0320             if (wordBoundaries.contains(charact)) {
0321                 url.chop(1);
0322                 --mPos;
0323             } else if (hasOpenParenthese && (charact == QLatin1Char(')'))) {
0324                 if (url.length() > 2) {
0325                     if (url.at(url.length() - 2) == QLatin1Char(')')) {
0326                         url.chop(1);
0327                         --mPos;
0328                         hasOpenParenthese = false;
0329                     } else {
0330                         break;
0331                     }
0332                 } else {
0333                     break;
0334                 }
0335             } else {
0336                 break;
0337             }
0338         } while (url.length() > 1);
0339     }
0340     return url;
0341 }
0342
0343 QString KTextToHTMLHelper::highlightedText()
0344 {
0345     // formatting symbols must be prepended with a whitespace
0346     if ((mPos > 0) && !mText.at(mPos - 1).isSpace()) {
0347         return {};
0348     }
0349
0350     const QChar ch = mText.at(mPos);
0351     if (ch != QLatin1Char('~') && ch != QLatin1Char('*') && ch != QLatin1Char('_')) {
0352         return {};
0353     }
0354
0355     QRegularExpression re(QStringLiteral("\\%1+\\s*([^\\s|^\\%1].*[^\\s|^\\%1])\\s*\\%1+").arg(ch));
0356     re.setPatternOptions(QRegularExpression::InvertedGreedinessOption);
0357 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
0358     const auto match = re.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchoredMatchOption);
0359 #else
0360     const auto match = re.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchorAtOffsetMatchOption);
0361 #endif
0362
0363     if (match.hasMatch()) {
0364         if (match.capturedStart() == mPos) {
0365             const int length = match.capturedLength();
0366             // there must be a whitespace after the closing formatting symbol
0367             if (mPos + length < mText.length() && !mText.at(mPos + length).isSpace()) {
0368                 return {};
0369             }
0370             mPos += length - 1;
0371             switch (ch.toLatin1()) {
0372 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
0373             case '*':
0374                 return QStringLiteral("<b>") + match.capturedRef(1) + QStringLiteral("</b>");
0375             case '_':
0376                 return QStringLiteral("<i>") + match.capturedRef(1) + QStringLiteral("</i>");
0377             case '~':
0378                 return QStringLiteral("<s>") + match.capturedRef(1) + QStringLiteral("</s>");
0379             }
0380 #else
0381             case '*':
0382                 return QStringLiteral("<b>") + match.capturedView(1).toString() + QStringLiteral("</b>");
0383             case '_':
0384                 return QStringLiteral("<i>") + match.capturedView(1).toString() + QStringLiteral("</i>");
0385             case '~':
0386                 return QStringLiteral("<s>") + match.capturedView(1).toString() + QStringLiteral("</s>");
0387             }
0388 #endif
0389         }
0390     }
0391     return {};
0392 }
0393
0394 QString RuqolaKTextToHTML::convertToHtml(const QString &plainText, RuqolaKTextToHTML::Options flags, int maxUrlLen, int maxAddressLen)
0395 {
0396     KTextToHTMLHelper helper(plainText, 0, maxUrlLen, maxAddressLen);
0397
0398     QString str;
0399     QString result(static_cast<QChar *>(nullptr), helper.mText.length() * 2);
0400     QChar ch;
0401     int x;
0402     bool startOfLine = true;
0403
0404     for (helper.mPos = 0, x = 0; helper.mPos < helper.mText.length(); ++helper.mPos, ++x) {
0405         ch = helper.mText.at(helper.mPos);
0406         if (flags & PreserveSpaces) {
0407             if (ch == QLatin1Char(' ')) {
0408                 if (helper.mPos + 1 < helper.mText.length()) {
0409                     if (helper.mText.at(helper.mPos + 1) != QLatin1Char(' ')) {
0410                         // A single space, make it breaking if not at the start or end of the line
0411                         const bool endOfLine = helper.mText.at(helper.mPos + 1) == QLatin1Char('\n');
0412                         if (!startOfLine && !endOfLine) {
0413                             result += QLatin1Char(' ');
0414                         } else {
0415                             result += QLatin1String("&nbsp;");
0416                         }
0417                     } else {
0418                         // Whitespace of more than one space, make it all non-breaking
0419                         while (helper.mPos < helper.mText.length() && helper.mText.at(helper.mPos) == QLatin1Char(' ')) {
0420                             result += QLatin1String("&nbsp;");
0421                             ++helper.mPos;
0422                             ++x;
0423                         }
0424
0425                         // We incremented once to often, undo that
0426                         --helper.mPos;
0427                         --x;
0428                     }
0429                 } else {
0430                     // Last space in the text, it is non-breaking
0431                     result += QLatin1String("&nbsp;");
0432                 }
0433
0434                 if (startOfLine) {
0435                     startOfLine = false;
0436                 }
0437                 continue;
0438             } else if (ch == QLatin1Char('\t')) {
0439                 do {
0440                     result += QLatin1String("&nbsp;");
0441                     ++x;
0442                 } while ((x & 7) != 0);
0443                 --x;
0444                 startOfLine = false;
0445                 continue;
0446             }
0447         }
0448         if (ch == QLatin1Char('\n')) {
0449             result += QLatin1String("<br />\n"); // Keep the \n, so apps can figure out the quoting levels correctly.
0450             startOfLine = true;
0451             x = -1;
0452             continue;
0453         }
0454
0455         startOfLine = false;
0456         if (ch == QLatin1Char('&')) {
0457             result += QLatin1String("&amp;");
0458         } else if (ch == QLatin1Char('"')) {
0459             result += QLatin1String("&quot;");
0460         } else if (ch == QLatin1Char('<')) {
0461             result += QLatin1String("&lt;");
0462         } else if (ch == QLatin1Char('>')) {
0463             result += QLatin1String("&gt;");
0464         } else {
0465             const int start = helper.mPos;
0466             if (!(flags & IgnoreUrls)) {
0467                 bool badUrl = false;
0468                 str = helper.getUrl(&badUrl);
0469                 if (badUrl) {
0470                     QString resultBadUrl;
0471                     const int helperTextSize(helper.mText.length());
0472                     for (int i = 0; i < helperTextSize; ++i) {
0473                         const QChar chBadUrl = helper.mText.at(i);
0474                         if (chBadUrl == QLatin1Char('&')) {
0475                             resultBadUrl += QLatin1String("&amp;");
0476                         } else if (chBadUrl == QLatin1Char('"')) {
0477                             resultBadUrl += QLatin1String("&quot;");
0478                         } else if (chBadUrl == QLatin1Char('<')) {
0479                             resultBadUrl += QLatin1String("&lt;");
0480                         } else if (chBadUrl == QLatin1Char('>')) {
0481                             resultBadUrl += QLatin1String("&gt;");
0482                         } else {
0483                             resultBadUrl += chBadUrl;
0484                         }
0485                     }
0486                     return resultBadUrl;
0487                 }
0488                 if (!str.isEmpty()) {
0489                     QString hyperlink;
0490                     if (str.startsWith(QLatin1String("www."))) {
0491                         hyperlink = QLatin1String("http://") + str;
0492                     } else if (str.startsWith(QLatin1String("ftp."))) {
0493                         hyperlink = QLatin1String("ftp://") + str;
0494                     } else {
0495                         hyperlink = str;
0496                     }
0497                     if (hyperlink.endsWith(QLatin1Char('"'))) {
0498                         hyperlink.chop(1);
0499                     }
0500                     if (str.endsWith(QLatin1Char('"'))) {
0501                         str.chop(1);
0502                     }
0503                     result += QLatin1String("<a href=\"") + hyperlink + QLatin1String("\">") + str.toHtmlEscaped() + QLatin1String("</a>");
0504                     x += helper.mPos - start;
0505                     continue;
0506                 }
0507                 str = helper.getEmailAddress();
0508                 if (!str.isEmpty()) {
0509                     // len is the length of the local part
0510                     int len = str.indexOf(QLatin1Char('@'));
0511                     QString localPart = str.left(len);
0512
0513                     // remove the local part from the result (as '&'s have been expanded to
0514                     // &amp; we have to take care of the 4 additional characters per '&')
0515                     result.truncate(result.length() - len - (localPart.count(QLatin1Char('&')) * 4));
0516                     x -= len;
0517
0518                     result += QLatin1String("<a href=\"mailto:") + str + QLatin1String("\">") + str + QLatin1String("</a>");
0519                     x += str.length() - 1;
0520                     continue;
0521                 }
0522                 if (flags & ConvertPhoneNumbers) {
0523                     str = helper.getPhoneNumber();
0524                     if (!str.isEmpty()) {
0525                         result += QLatin1String("<a href=\"tel:") + normalizePhoneNumber(str) + QLatin1String("\">") + str + QLatin1String("</a>");
0526                         x += str.length() - 1;
0527                         continue;
0528                     }
0529                 }
0530             }
0531             if (flags & HighlightText) {
0532                 str = helper.highlightedText();
0533                 if (!str.isEmpty()) {
0534                     result += str;
0535                     x += helper.mPos - start;
0536                     continue;
0537                 }
0538             }
0539             result += ch;
0540         }
0541     }
0542
0543     return result;
0544 }