core/ktexttohtmlfork/ruqolaktexttohtml.cpp

0001 /*
0002     SPDX-FileCopyrightText: 2002 Dave Corrie <kde@davecorrie.com>
0003     SPDX-FileCopyrightText: 2014 Daniel Vrátil <dvratil@redhat.com>
0004
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0007
0008 #include "ruqolaktexttohtml.h"
0009 #include "ruqolaktexttohtml_p.h"
0010
0011 #include <QRegularExpression>
0012
0013 #include <climits>
0014
0015 KTextToHTMLHelper::KTextToHTMLHelper(const QString &plainText, int pos, int maxUrlLen, int maxAddressLen)
0016     : mText(plainText)
0017     , mMaxUrlLen(maxUrlLen)
0018     , mMaxAddressLen(maxAddressLen)
0019     , mPos(pos)
0020 {
0021 }
0022
0023 QString KTextToHTMLHelper::getEmailAddress()
0024 {
0025     QString address;
0026
0027     if (mPos < mText.length() && mText.at(mPos) == QLatin1Char('@')) {
0028         // the following characters are allowed in a dot-atom (RFC 2822):
0029         // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
0030         const QString allowedSpecialChars = QStringLiteral(".!#$%&'*+-/=?^_`{|}~");
0031
0032         // determine the local part of the email address
0033         int start = mPos - 1;
0034         while (start >= 0 && mText.at(start).unicode() < 128
0035                && (mText.at(start).isLetterOrNumber() || mText.at(start) == QLatin1Char('@') // allow @ to find invalid email addresses
0036                    || allowedSpecialChars.indexOf(mText.at(start)) != -1)) {
0037             if (mText.at(start) == QLatin1Char('@')) {
0038                 return {}; // local part contains '@' -> no email address
0039             }
0040             --start;
0041         }
0042         ++start;
0043         // we assume that an email address starts with a letter or a digit
0044         while ((start < mPos) && !mText.at(start).isLetterOrNumber()) {
0045             ++start;
0046         }
0047         if (start == mPos) {
0048             return {}; // local part is empty -> no email address
0049         }
0050
0051         // determine the domain part of the email address
0052         int dotPos = INT_MAX;
0053         int end = mPos + 1;
0054         while (end < mText.length()
0055                && (mText.at(end).isLetterOrNumber() || mText.at(end) == QLatin1Char('@') // allow @ to find invalid email addresses
0056                    || mText.at(end) == QLatin1Char('.') || mText.at(end) == QLatin1Char('-'))) {
0057             if (mText.at(end) == QLatin1Char('@')) {
0058                 return {}; // domain part contains '@' -> no email address
0059             }
0060             if (mText.at(end) == QLatin1Char('.')) {
0061                 dotPos = qMin(dotPos, end); // remember index of first dot in domain
0062             }
0063             ++end;
0064         }
0065         // we assume that an email address ends with a letter or a digit
0066         while ((end > mPos) && !mText.at(end - 1).isLetterOrNumber()) {
0067             --end;
0068         }
0069         if (end == mPos) {
0070             return {}; // domain part is empty -> no email address
0071         }
0072         if (dotPos >= end) {
0073             return {}; // domain part doesn't contain a dot
0074         }
0075
0076         if (end - start > mMaxAddressLen) {
0077             return {}; // too long -> most likely no email address
0078         }
0079         address = mText.mid(start, end - start);
0080
0081         mPos = end - 1;
0082     }
0083     return address;
0084 }
0085
0086 QString KTextToHTMLHelper::getPhoneNumber()
0087 {
0088     if (!mText.at(mPos).isDigit() && mText.at(mPos) != QLatin1Char('+')) {
0089         return {};
0090     }
0091
0092     const QString allowedBeginSeparators = QStringLiteral(" \r\t\n:");
0093     if (mPos > 0 && !allowedBeginSeparators.contains(mText.at(mPos - 1))) {
0094         return {};
0095     }
0096
0097     // this isn't 100% accurate, we filter stuff below that is too hard to capture with a regexp
0098     static const QRegularExpression telPattern(QStringLiteral(R"([+0](( |( ?[/-] ?)?)\(?\d+\)?+){6,30})"));
0099     const auto match = telPattern.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchorAtOffsetMatchOption);
0100     if (match.hasMatch()) {
0101         auto m = match.captured();
0102         // check for maximum number of digits (15), see https://en.wikipedia.org/wiki/Telephone_numbering_plan
0103         if (std::count_if(m.begin(),
0104                           m.end(),
0105                           [](QChar c) {
0106                               return c.isDigit();
0107                           })
0108             > 15) {
0109             return {};
0110         }
0111         // only one / is allowed, otherwise we trigger on dates
0112         if (std::count(m.begin(), m.end(), QLatin1Char('/')) > 1) {
0113             return {};
0114         }
0115
0116         // parenthesis need to be balanced, and must not be nested
0117         int openIdx = -1;
0118         for (int i = 0; i < m.size(); ++i) {
0119             if ((m[i] == QLatin1Char('(') && openIdx >= 0) || (m[i] == QLatin1Char(')') && openIdx < 0)) {
0120                 return {};
0121             }
0122             if (m[i] == QLatin1Char('(')) {
0123                 openIdx = i;
0124             } else if (m[i] == QLatin1Char(')')) {
0125                 openIdx = -1;
0126             }
0127         }
0128         if (openIdx > 0) {
0129             m = QStringView(m).left(openIdx - 1).trimmed().toString();
0130         }
0131
0132         // check if there's a plausible separator at the end
0133         const QString allowedEndSeparators = QStringLiteral(" \r\t\n,.");
0134         const auto l = m.size();
0135         if (mText.size() > mPos + l && !allowedEndSeparators.contains(mText.at(mPos + l))) {
0136             return {};
0137         }
0138
0139         mPos += l - 1;
0140         return m;
0141     }
0142     return {};
0143 }
0144
0145 static QString normalizePhoneNumber(const QString &str)
0146 {
0147     QString res;
0148     res.reserve(str.size());
0149     for (const auto c : str) {
0150         if (c.isDigit() || c == QLatin1Char('+')) {
0151             res.push_back(c);
0152         }
0153     }
0154     return res;
0155 }
0156
0157 // The following characters are allowed in a dot-atom (RFC 2822):
0158 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
0159 static const char s_allowedSpecialChars[] = ".!#$%&'*+-/=?^_`{|}~";
0160
0161 bool KTextToHTMLHelper::atUrl() const
0162 {
0163     // The character directly before the URL must not be a letter, a number or
0164     // any other character allowed in a dot-atom (RFC 2822).
0165     if (mPos > 0) {
0166         const auto chBefore = mText.at(mPos - 1);
0167         if (chBefore.isLetterOrNumber() || QString::fromLatin1(s_allowedSpecialChars).contains(chBefore)) {
0168             return false;
0169         }
0170     }
0171
0172     const auto segment = QStringView(mText).mid(mPos);
0173     return segment.startsWith(QLatin1String("http://")) || segment.startsWith(QLatin1String("https://")) || segment.startsWith(QLatin1String("vnc://"))
0174         || segment.startsWith(QLatin1String("fish://")) || segment.startsWith(QLatin1String("ftp://")) || segment.startsWith(QLatin1String("ftps://"))
0175         || segment.startsWith(QLatin1String("sftp://")) || segment.startsWith(QLatin1String("smb://")) || segment.startsWith(QLatin1String("mailto:"))
0176         || segment.startsWith(QLatin1String("www.")) || segment.startsWith(QLatin1String("ftp.")) || segment.startsWith(QLatin1String("file://"))
0177         || segment.startsWith(QLatin1String("news:")) || segment.startsWith(QLatin1String("tel:")) || segment.startsWith(QLatin1String("xmpp:"))
0178         || segment.startsWith(QLatin1String("irc://")) || segment.startsWith(QLatin1String("ircs://"));
0179 }
0180
0181 bool KTextToHTMLHelper::isEmptyUrl(const QString &url) const
0182 {
0183     return url.isEmpty() || url == QLatin1String("http://") || url == QLatin1String("https://") || url == QLatin1String("fish://")
0184         || url == QLatin1String("ftp://") || url == QLatin1String("ftps://") || url == QLatin1String("sftp://") || url == QLatin1String("smb://")
0185         || url == QLatin1String("vnc://") || url == QLatin1String("mailto") || url == QLatin1String("mailto:") || url == QLatin1String("www")
0186         || url == QLatin1String("ftp") || url == QLatin1String("news:") || url == QLatin1String("news://") || url == QLatin1String("tel")
0187         || url == QLatin1String("tel:") || url == QLatin1String("xmpp:") || url == QLatin1String("irc://") || url == QLatin1String("ircs://");
0188 }
0189
0190 QString KTextToHTMLHelper::getUrl(bool *badurl)
0191 {
0192     QString url;
0193     if (atUrl()) {
0194         // NOTE: see http://tools.ietf.org/html/rfc3986#appendix-A and especially appendix-C
0195         // Appendix-C mainly says, that when extracting URLs from plain text, line breaks shall
0196         // be allowed and should be ignored when the URI is extracted.
0197
0198         // This implementation follows this recommendation and
0199         // allows the URL to be enclosed within different kind of brackets/quotes
0200         // If an URL is enclosed, whitespace characters are allowed and removed, otherwise
0201         // the URL ends with the first whitespace
0202         // Also, if the URL is enclosed in brackets, the URL itself is not allowed
0203         // to contain the closing bracket, as this would be detected as the end of the URL
0204
0205         QChar beforeUrl;
0206         QChar afterUrl;
0207
0208         // detect if the url has been surrounded by brackets or quotes
0209         if (mPos > 0) {
0210             beforeUrl = mText.at(mPos - 1);
0211
0212             /* if ( beforeUrl == QLatin1Char('(') ) {
0213               afterUrl = QLatin1Char(')');
0214             } else */
0215             if (beforeUrl == QLatin1Char('[')) {
0216                 afterUrl = QLatin1Char(']');
0217             } else if (beforeUrl == QLatin1Char('<')) {
0218                 afterUrl = QLatin1Char('>');
0219             } else if (beforeUrl == QLatin1Char('>')) { // for e.g. <link>http://.....</link>
0220                 afterUrl = QLatin1Char('<');
0221             } else if (beforeUrl == QLatin1Char('"')) {
0222                 afterUrl = QLatin1Char('"');
0223             }
0224         }
0225         url.reserve(mMaxUrlLen); // avoid allocs
0226         int start = mPos;
0227         bool previousCharIsSpace = false;
0228         bool previousCharIsADoubleQuote = false;
0229         bool previousIsAnAnchor = false;
0230         while ((mPos < mText.length()) && (mText.at(mPos).isPrint() || mText.at(mPos).isSpace())
0231                && ((afterUrl.isNull() && !mText.at(mPos).isSpace()) || (!afterUrl.isNull() && mText.at(mPos) != afterUrl))) {
0232             if (!previousCharIsSpace && (mText.at(mPos) == QLatin1Char('<')) && ((mPos + 1) < mText.length())) {
0233                 // Fix Bug #346132: allow "http://www.foo.bar<http://foo.bar/>"
0234                 // < inside a URL is not allowed, however there is a test which
0235                 // checks that "http://some<Host>/path" should be allowed
0236                 // Therefore: check if what follows is another URL and if so, stop here
0237                 mPos++;
0238                 if (atUrl()) {
0239                     mPos--;
0240                     break;
0241                 }
0242                 mPos--;
0243             }
0244             if (!previousCharIsSpace && (mText.at(mPos) == QLatin1Char(' ')) && ((mPos + 1) < mText.length())) {
0245                 // Fix kmail bug: allow "http://www.foo.bar http://foo.bar/"
0246                 // Therefore: check if what follows is another URL and if so, stop here
0247                 mPos++;
0248                 if (atUrl()) {
0249                     mPos--;
0250                     break;
0251                 }
0252                 mPos--;
0253             }
0254             if (mText.at(mPos).isSpace()) {
0255                 previousCharIsSpace = true;
0256             } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char('[')) {
0257                 break;
0258             } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char(']')) {
0259                 break;
0260             } else { // skip whitespace
0261                 if (previousCharIsSpace && mText.at(mPos) == QLatin1Char('<')) {
0262                     url.append(QLatin1Char(' '));
0263                     break;
0264                 }
0265                 previousCharIsSpace = false;
0266                 if (mText.at(mPos) == QLatin1Char('>') && previousCharIsADoubleQuote) {
0267                     // it's an invalid url
0268                     if (badurl) {
0269                         *badurl = true;
0270                     }
0271                     return {};
0272                 }
0273                 if (mText.at(mPos) == QLatin1Char('"')) {
0274                     previousCharIsADoubleQuote = true;
0275                 } else {
0276                     previousCharIsADoubleQuote = false;
0277                 }
0278                 if (mText.at(mPos) == QLatin1Char('#')) {
0279                     previousIsAnAnchor = true;
0280                 }
0281                 url.append(mText.at(mPos));
0282                 if (url.length() > mMaxUrlLen) {
0283                     break;
0284                 }
0285             }
0286
0287             ++mPos;
0288         }
0289
0290         if (isEmptyUrl(url) || (url.length() > mMaxUrlLen)) {
0291             mPos = start;
0292             url.clear();
0293             return url;
0294         } else {
0295             --mPos;
0296         }
0297     }
0298
0299     // HACK: This is actually against the RFC. However, most people don't properly escape the URL in
0300     //       their text with "" or <>. That leads to people writing an url, followed immediately by
0301     //       a dot to finish the sentence. That would lead the parser to include the dot in the url,
0302     //       even though that is not wanted. So work around that here.
0303     //       Most real-life URLs hopefully don't end with dots or commas.
0304     QString wordBoundaries = QStringLiteral(".,:!?>");
0305     bool hasOpenParenthese = url.contains(QLatin1Char('('));
0306     if (!hasOpenParenthese) {
0307         wordBoundaries += QLatin1Char(')');
0308     }
0309     if (url.length() > 1) {
0310         do {
0311             const QChar charact{url.at(url.length() - 1)};
0312             if (wordBoundaries.contains(charact)) {
0313                 url.chop(1);
0314                 --mPos;
0315             } else if (hasOpenParenthese && (charact == QLatin1Char(')'))) {
0316                 if (url.length() > 2) {
0317                     if (url.at(url.length() - 2) == QLatin1Char(')')) {
0318                         url.chop(1);
0319                         --mPos;
0320                         hasOpenParenthese = false;
0321                     } else {
0322                         break;
0323                     }
0324                 } else {
0325                     break;
0326                 }
0327             } else {
0328                 break;
0329             }
0330         } while (url.length() > 1);
0331     }
0332     return url;
0333 }
0334
0335 QString KTextToHTMLHelper::highlightedText()
0336 {
0337     // formatting symbols must be prepended with a whitespace
0338     if ((mPos > 0) && !mText.at(mPos - 1).isSpace()) {
0339         return {};
0340     }
0341
0342     const QChar ch = mText.at(mPos);
0343     if (ch != QLatin1Char('~') && ch != QLatin1Char('*') && ch != QLatin1Char('_')) {
0344         return {};
0345     }
0346
0347     QRegularExpression re(QStringLiteral("\\%1+\\s*([^\\s|^\\%1].*[^\\s|^\\%1])\\s*\\%1+").arg(ch));
0348     re.setPatternOptions(QRegularExpression::InvertedGreedinessOption);
0349     const auto match = re.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchorAtOffsetMatchOption);
0350
0351     if (match.hasMatch()) {
0352         if (match.capturedStart() == mPos) {
0353             const int length = match.capturedLength();
0354             // there must be a whitespace after the closing formatting symbol
0355             if (mPos + length < mText.length() && !mText.at(mPos + length).isSpace()) {
0356                 return {};
0357             }
0358             mPos += length - 1;
0359             switch (ch.toLatin1()) {
0360             case '*':
0361                 return QStringLiteral("<b>") + match.capturedView(1).toString() + QStringLiteral("</b>");
0362             case '_':
0363                 return QStringLiteral("<i>") + match.capturedView(1).toString() + QStringLiteral("</i>");
0364             case '~':
0365                 return QStringLiteral("<s>") + match.capturedView(1).toString() + QStringLiteral("</s>");
0366             }
0367         }
0368     }
0369     return {};
0370 }
0371
0372 QString RuqolaKTextToHTML::convertToHtml(const QString &plainText, RuqolaKTextToHTML::Options flags, int maxUrlLen, int maxAddressLen)
0373 {
0374     KTextToHTMLHelper helper(plainText, 0, maxUrlLen, maxAddressLen);
0375
0376     QString str;
0377     QString result(static_cast<QChar *>(nullptr), helper.mText.length() * 2);
0378     QChar ch;
0379     int x;
0380     bool startOfLine = true;
0381
0382     for (helper.mPos = 0, x = 0; helper.mPos < helper.mText.length(); ++helper.mPos, ++x) {
0383         ch = helper.mText.at(helper.mPos);
0384         if (flags & PreserveSpaces) {
0385             if (ch == QLatin1Char(' ')) {
0386                 if (helper.mPos + 1 < helper.mText.length()) {
0387                     if (helper.mText.at(helper.mPos + 1) != QLatin1Char(' ')) {
0388                         // A single space, make it breaking if not at the start or end of the line
0389                         const bool endOfLine = helper.mText.at(helper.mPos + 1) == QLatin1Char('\n');
0390                         if (!startOfLine && !endOfLine) {
0391                             result += QLatin1Char(' ');
0392                         } else {
0393                             result += QLatin1String("&nbsp;");
0394                         }
0395                     } else {
0396                         // Whitespace of more than one space, make it all non-breaking
0397                         while (helper.mPos < helper.mText.length() && helper.mText.at(helper.mPos) == QLatin1Char(' ')) {
0398                             result += QLatin1String("&nbsp;");
0399                             ++helper.mPos;
0400                             ++x;
0401                         }
0402
0403                         // We incremented once to often, undo that
0404                         --helper.mPos;
0405                         --x;
0406                     }
0407                 } else {
0408                     // Last space in the text, it is non-breaking
0409                     result += QLatin1String("&nbsp;");
0410                 }
0411
0412                 if (startOfLine) {
0413                     startOfLine = false;
0414                 }
0415                 continue;
0416             } else if (ch == QLatin1Char('\t')) {
0417                 do {
0418                     result += QLatin1String("&nbsp;");
0419                     ++x;
0420                 } while ((x & 7) != 0);
0421                 --x;
0422                 startOfLine = false;
0423                 continue;
0424             }
0425         }
0426         if (ch == QLatin1Char('\n')) {
0427             result += QLatin1String("<br />\n"); // Keep the \n, so apps can figure out the quoting levels correctly.
0428             startOfLine = true;
0429             x = -1;
0430             continue;
0431         }
0432
0433         startOfLine = false;
0434         if (ch == QLatin1Char('&')) {
0435             result += QLatin1String("&amp;");
0436         } else if (ch == QLatin1Char('"')) {
0437             result += QLatin1String("&quot;");
0438         } else if (ch == QLatin1Char('<')) {
0439             result += QLatin1String("&lt;");
0440         } else if (ch == QLatin1Char('>')) {
0441             result += QLatin1String("&gt;");
0442         } else {
0443             const int start = helper.mPos;
0444             if (!(flags & IgnoreUrls)) {
0445                 bool badUrl = false;
0446                 str = helper.getUrl(&badUrl);
0447                 if (badUrl) {
0448                     QString resultBadUrl;
0449                     const int helperTextSize(helper.mText.length());
0450                     for (int i = 0; i < helperTextSize; ++i) {
0451                         const QChar chBadUrl = helper.mText.at(i);
0452                         if (chBadUrl == QLatin1Char('&')) {
0453                             resultBadUrl += QLatin1String("&amp;");
0454                         } else if (chBadUrl == QLatin1Char('"')) {
0455                             resultBadUrl += QLatin1String("&quot;");
0456                         } else if (chBadUrl == QLatin1Char('<')) {
0457                             resultBadUrl += QLatin1String("&lt;");
0458                         } else if (chBadUrl == QLatin1Char('>')) {
0459                             resultBadUrl += QLatin1String("&gt;");
0460                         } else {
0461                             resultBadUrl += chBadUrl;
0462                         }
0463                     }
0464                     return resultBadUrl;
0465                 }
0466                 if (!str.isEmpty()) {
0467                     QString hyperlink;
0468                     if (str.startsWith(QLatin1String("www."))) {
0469                         hyperlink = QLatin1String("http://") + str;
0470                     } else if (str.startsWith(QLatin1String("ftp."))) {
0471                         hyperlink = QLatin1String("ftp://") + str;
0472                     } else {
0473                         hyperlink = str;
0474                     }
0475                     if (hyperlink.endsWith(QLatin1Char('"'))) {
0476                         hyperlink.chop(1);
0477                     }
0478                     if (str.endsWith(QLatin1Char('"'))) {
0479                         str.chop(1);
0480                     }
0481                     result += QLatin1String("<a href=\"") + hyperlink + QLatin1String("\">") + str.toHtmlEscaped() + QLatin1String("</a>");
0482                     x += helper.mPos - start;
0483                     continue;
0484                 }
0485                 str = helper.getEmailAddress();
0486                 if (!str.isEmpty()) {
0487                     // len is the length of the local part
0488                     int len = str.indexOf(QLatin1Char('@'));
0489                     QString localPart = str.left(len);
0490
0491                     // remove the local part from the result (as '&'s have been expanded to
0492                     // &amp; we have to take care of the 4 additional characters per '&')
0493                     result.truncate(result.length() - len - (localPart.count(QLatin1Char('&')) * 4));
0494                     x -= len;
0495
0496                     result += QLatin1String("<a href=\"mailto:") + str + QLatin1String("\">") + str + QLatin1String("</a>");
0497                     x += str.length() - 1;
0498                     continue;
0499                 }
0500                 if (flags & ConvertPhoneNumbers) {
0501                     str = helper.getPhoneNumber();
0502                     if (!str.isEmpty()) {
0503                         result += QLatin1String("<a href=\"tel:") + normalizePhoneNumber(str) + QLatin1String("\">") + str + QLatin1String("</a>");
0504                         x += str.length() - 1;
0505                         continue;
0506                     }
0507                 }
0508             }
0509             if (flags & HighlightText) {
0510                 str = helper.highlightedText();
0511                 if (!str.isEmpty()) {
0512                     result += str;
0513                     x += helper.mPos - start;
0514                     continue;
0515                 }
0516             }
0517             result += ch;
0518         }
0519     }
0520
0521     return result;
0522 }