File indexing completed on 2024-12-22 05:05:19

0001 // SPDX-FileCopyrightText: 2017 Christian Mollekopf <mollekopf@kolabsys.com>
0002 // SPDX-License-Identifier: LGPL-2.0-or-later
0003 
0004 #include "htmlutils.h"
0005 
0006 #include <QMap>
0007 #include <QUrl>
0008 
0009 static QString resolveEntities(const QString &in)
0010 {
0011     QString out;
0012 
0013     for (int i = 0; i < (int)in.length(); ++i) {
0014         if (in[i] == QLatin1Char('&')) {
0015             // find a semicolon
0016             ++i;
0017             int n = in.indexOf(QLatin1Char(';'), i);
0018             if (n == -1) {
0019                 break;
0020             }
0021             QString type = in.mid(i, (n - i));
0022             i = n; // should be n+1, but we'll let the loop increment do it
0023 
0024             if (type == QLatin1StringView("amp")) {
0025                 out += QLatin1Char('&');
0026             } else if (type == QLatin1StringView("lt"))
0027                 out += QLatin1Char('<');
0028             else if (type == QLatin1StringView("gt"))
0029                 out += QLatin1Char('>');
0030             else if (type == QLatin1StringView("quot"))
0031                 out += QLatin1Char('\"');
0032             else if (type == QLatin1StringView("apos"))
0033                 out += QLatin1Char('\'');
0034             else if (type == QLatin1StringView("nbsp"))
0035                 out += QChar(0xa0);
0036         } else {
0037             out += in[i];
0038         }
0039     }
0040 
0041     return out;
0042 }
0043 
0044 static bool linkify_pmatch(const QString &str1, int at, const QString &str2)
0045 {
0046     if (str2.length() > (str1.length() - at))
0047         return false;
0048 
0049     for (int n = 0; n < (int)str2.length(); ++n) {
0050         if (str1.at(n + at).toLower() != str2.at(n).toLower())
0051             return false;
0052     }
0053 
0054     return true;
0055 }
0056 
0057 static bool linkify_isOneOf(const QChar &c, const QString &charlist)
0058 {
0059     for (int i = 0; i < (int)charlist.length(); ++i) {
0060         if (c == charlist.at(i))
0061             return true;
0062     }
0063 
0064     return false;
0065 }
0066 
0067 // encodes a few dangerous html characters
0068 static QString linkify_htmlsafe(const QString &in)
0069 {
0070     QString out;
0071 
0072     for (int n = 0; n < in.length(); ++n) {
0073         if (linkify_isOneOf(in.at(n), QStringLiteral("\"\'`<>"))) {
0074             // hex encode
0075             QString hex;
0076             hex.asprintf("%%%02X", in.at(n).toLatin1());
0077             out.append(hex);
0078         } else {
0079             out.append(in.at(n));
0080         }
0081     }
0082 
0083     return out;
0084 }
0085 
0086 static bool linkify_okUrl(const QString &url)
0087 {
0088     if (url.at(url.length() - 1) == QLatin1Char('.'))
0089         return false;
0090 
0091     return true;
0092 }
0093 
0094 static bool linkify_okEmail(const QString &addy)
0095 {
0096     // this makes sure that there is an '@' and a '.' after it, and that there is
0097     // at least one char for each of the three sections
0098     int n = addy.indexOf(QLatin1Char('@'));
0099     if (n == -1 || n == 0)
0100         return false;
0101     int d = addy.indexOf(QLatin1Char('.'), n + 1);
0102     if (d == -1 || d == 0)
0103         return false;
0104     if ((addy.length() - 1) - d <= 0)
0105         return false;
0106     if (addy.indexOf(QStringLiteral("..")) != -1)
0107         return false;
0108 
0109     return true;
0110 }
0111 
0112 /**
0113  * takes a richtext string and heuristically adds links for uris of common protocols
0114  * @return a richtext string with link markup added
0115  */
0116 QString MimeTreeParser::linkify(const QString &in)
0117 {
0118     QString out = in;
0119     int x1, x2;
0120     QString linked, link, href;
0121 
0122     for (int n = 0; n < (int)out.length(); ++n) {
0123         bool isUrl = false;
0124         bool isAtStyle = false;
0125         x1 = n;
0126 
0127         if (linkify_pmatch(out, n, QStringLiteral("xmpp:"))) {
0128             n += 5;
0129             isUrl = true;
0130             href = QString();
0131         } else if (linkify_pmatch(out, n, QStringLiteral("mailto:"))) {
0132             n += 7;
0133             isUrl = true;
0134             href = QString();
0135         } else if (linkify_pmatch(out, n, QStringLiteral("http://"))) {
0136             n += 7;
0137             isUrl = true;
0138             href = QString();
0139         } else if (linkify_pmatch(out, n, QStringLiteral("https://"))) {
0140             n += 8;
0141             isUrl = true;
0142             href = QString();
0143         } else if (linkify_pmatch(out, n, QStringLiteral("ftp://"))) {
0144             n += 6;
0145             isUrl = true;
0146             href = QString();
0147         } else if (linkify_pmatch(out, n, QStringLiteral("news://"))) {
0148             n += 7;
0149             isUrl = true;
0150             href = QString();
0151         } else if (linkify_pmatch(out, n, QStringLiteral("ed2k://"))) {
0152             n += 7;
0153             isUrl = true;
0154             href = QString();
0155         } else if (linkify_pmatch(out, n, QStringLiteral("magnet:"))) {
0156             n += 7;
0157             isUrl = true;
0158             href = QString();
0159         } else if (linkify_pmatch(out, n, QStringLiteral("www."))) {
0160             isUrl = true;
0161             href = QStringLiteral("http://");
0162         } else if (linkify_pmatch(out, n, QStringLiteral("ftp."))) {
0163             isUrl = true;
0164             href = QStringLiteral("ftp://");
0165         } else if (linkify_pmatch(out, n, QStringLiteral("@"))) {
0166             isAtStyle = true;
0167             href = QStringLiteral("x-psi-atstyle:");
0168         }
0169 
0170         if (isUrl) {
0171             // make sure the previous char is not alphanumeric
0172             if (x1 > 0 && out.at(x1 - 1).isLetterOrNumber())
0173                 continue;
0174 
0175             // find whitespace (or end)
0176             QMap<QChar, int> brackets;
0177             brackets[QLatin1Char('(')] = brackets[QLatin1Char(')')] = brackets[QLatin1Char('[')] = brackets[QLatin1Char(']')] = brackets[QLatin1Char('{')] =
0178                 brackets[QLatin1Char('}')] = 0;
0179             QMap<QChar, QChar> openingBracket;
0180             openingBracket[QLatin1Char(')')] = QLatin1Char('(');
0181             openingBracket[QLatin1Char(']')] = QLatin1Char('[');
0182             openingBracket[QLatin1Char('}')] = QLatin1Char('{');
0183             for (x2 = n; x2 < (int)out.length(); ++x2) {
0184                 if (out.at(x2).isSpace() || linkify_isOneOf(out.at(x2), QStringLiteral("\"\'`<>")) || linkify_pmatch(out, x2, QStringLiteral("&quot;"))
0185                     || linkify_pmatch(out, x2, QStringLiteral("&apos;")) || linkify_pmatch(out, x2, QStringLiteral("&gt;"))
0186                     || linkify_pmatch(out, x2, QStringLiteral("&lt;"))) {
0187                     break;
0188                 }
0189                 if (brackets.contains(out.at(x2))) {
0190                     ++brackets[out.at(x2)];
0191                 }
0192             }
0193             int len = x2 - x1;
0194             QString pre = resolveEntities(out.mid(x1, x2 - x1));
0195 
0196             // go backward hacking off unwanted punctuation
0197             int cutoff;
0198             for (cutoff = pre.length() - 1; cutoff >= 0; --cutoff) {
0199                 if (!linkify_isOneOf(pre.at(cutoff), QStringLiteral("!?,.()[]{}<>\"")))
0200                     break;
0201                 if (linkify_isOneOf(pre.at(cutoff), QStringLiteral(")]}")) && brackets[pre.at(cutoff)] - brackets[openingBracket[pre.at(cutoff)]] <= 0) {
0202                     break; // in theory, there could be == above, but these are urls, not math ;)
0203                 }
0204                 if (brackets.contains(pre.at(cutoff))) {
0205                     --brackets[pre.at(cutoff)];
0206                 }
0207             }
0208             ++cutoff;
0209             //++x2;
0210 
0211             link = pre.mid(0, cutoff);
0212             if (!linkify_okUrl(link)) {
0213                 n = x1 + link.length();
0214                 continue;
0215             }
0216             href += link;
0217             // attributes need to be encoded too.
0218             href = href.toHtmlEscaped();
0219             href = linkify_htmlsafe(href);
0220             // printf("link: [%s], href=[%s]\n", link.latin1(), href.latin1());
0221             linked = QStringLiteral("<a href=\"%1\">").arg(href) + QUrl{link}.toDisplayString(QUrl::RemoveQuery) + QStringLiteral("</a>")
0222                 + pre.mid(cutoff).toHtmlEscaped();
0223             out.replace(x1, len, linked);
0224             n = x1 + linked.length() - 1;
0225         } else if (isAtStyle) {
0226             // go backward till we find the beginning
0227             if (x1 == 0)
0228                 continue;
0229             --x1;
0230             for (; x1 >= 0; --x1) {
0231                 if (!linkify_isOneOf(out.at(x1), QStringLiteral("_.-+")) && !out.at(x1).isLetterOrNumber())
0232                     break;
0233             }
0234             ++x1;
0235 
0236             // go forward till we find the end
0237             x2 = n + 1;
0238             for (; x2 < (int)out.length(); ++x2) {
0239                 if (!linkify_isOneOf(out.at(x2), QStringLiteral("_.-+")) && !out.at(x2).isLetterOrNumber())
0240                     break;
0241             }
0242 
0243             int len = x2 - x1;
0244             link = out.mid(x1, len);
0245             // link = resolveEntities(link);
0246 
0247             if (!linkify_okEmail(link)) {
0248                 n = x1 + link.length();
0249                 continue;
0250             }
0251 
0252             href += link;
0253             // printf("link: [%s], href=[%s]\n", link.latin1(), href.latin1());
0254             linked = QStringLiteral("<a href=\"%1\">").arg(href) + link + QStringLiteral("</a>");
0255             out.replace(x1, len, linked);
0256             n = x1 + linked.length() - 1;
0257         }
0258     }
0259 
0260     return out;
0261 }