File indexing completed on 2024-05-12 05:28:16

0001 // SPDX-FileCopyrightText: 2017 Christian Mollekopf <mollekopf@kolabsys.com>
0002 // SPDX-License-Identifier: LGPL-2.0-or-later
0003 
0004 #include "htmlutils.h"
0005 
0006 #include <QMap>
0007 #include <QString>
0008 #include <QUrl>
0009 
0010 static QString resolveEntities(const QString &in)
0011 {
0012     QString out;
0013 
0014     for (int i = 0; i < (int)in.length(); ++i) {
0015         if (in[i] == QLatin1Char('&')) {
0016             // find a semicolon
0017             ++i;
0018             int n = in.indexOf(QLatin1Char(';'), i);
0019             if (n == -1) {
0020                 break;
0021             }
0022             QString type = in.mid(i, (n - i));
0023             i = n; // should be n+1, but we'll let the loop increment do it
0024 
0025             if (type == QLatin1String("amp")) {
0026                 out += QLatin1Char('&');
0027             } else if (type == QLatin1String("lt"))
0028                 out += QLatin1Char('<');
0029             else if (type == QLatin1String("gt"))
0030                 out += QLatin1Char('>');
0031             else if (type == QLatin1String("quot"))
0032                 out += QLatin1Char('\"');
0033             else if (type == QLatin1String("apos"))
0034                 out += QLatin1Char('\'');
0035             else if (type == QLatin1String("nbsp"))
0036                 out += 0xa0;
0037         } else {
0038             out += in[i];
0039         }
0040     }
0041 
0042     return out;
0043 }
0044 
0045 static bool linkify_pmatch(const QString &str1, int at, const QString &str2)
0046 {
0047     if (str2.length() > (str1.length() - at))
0048         return false;
0049 
0050     for (int n = 0; n < (int)str2.length(); ++n) {
0051         if (str1.at(n + at).toLower() != str2.at(n).toLower())
0052             return false;
0053     }
0054 
0055     return true;
0056 }
0057 
0058 static bool linkify_isOneOf(const QChar &c, const QString &charlist)
0059 {
0060     for (int i = 0; i < (int)charlist.length(); ++i) {
0061         if (c == charlist.at(i))
0062             return true;
0063     }
0064 
0065     return false;
0066 }
0067 
0068 // encodes a few dangerous html characters
0069 static QString linkify_htmlsafe(const QString &in)
0070 {
0071     QString out;
0072 
0073     for (int n = 0; n < in.length(); ++n) {
0074         if (linkify_isOneOf(in.at(n), QStringLiteral("\"\'`<>"))) {
0075             // hex encode
0076             QString hex;
0077             hex.asprintf("%%%02X", in.at(n).toLatin1());
0078             out.append(hex);
0079         } else {
0080             out.append(in.at(n));
0081         }
0082     }
0083 
0084     return out;
0085 }
0086 
0087 static bool linkify_okUrl(const QString &url)
0088 {
0089     if (url.at(url.length() - 1) == QLatin1Char('.'))
0090         return false;
0091 
0092     return true;
0093 }
0094 
0095 static bool linkify_okEmail(const QString &addy)
0096 {
0097     // this makes sure that there is an '@' and a '.' after it, and that there is
0098     // at least one char for each of the three sections
0099     int n = addy.indexOf(QLatin1Char('@'));
0100     if (n == -1 || n == 0)
0101         return false;
0102     int d = addy.indexOf(QLatin1Char('.'), n + 1);
0103     if (d == -1 || d == 0)
0104         return false;
0105     if ((addy.length() - 1) - d <= 0)
0106         return false;
0107     if (addy.indexOf(QStringLiteral("..")) != -1)
0108         return false;
0109 
0110     return true;
0111 }
0112 
0113 /**
0114  * takes a richtext string and heuristically adds links for uris of common protocols
0115  * @return a richtext string with link markup added
0116  */
0117 QString HtmlUtils::linkify(const QString &in)
0118 {
0119     QString out = in;
0120     int x1, x2;
0121     bool isUrl, isAtStyle;
0122     QString linked, link, href;
0123 
0124     for (int n = 0; n < (int)out.length(); ++n) {
0125         isUrl = false;
0126         isAtStyle = false;
0127         x1 = n;
0128 
0129         if (linkify_pmatch(out, n, QStringLiteral("xmpp:"))) {
0130             n += 5;
0131             isUrl = true;
0132             href = QString();
0133         } else if (linkify_pmatch(out, n, QStringLiteral("mailto:"))) {
0134             n += 7;
0135             isUrl = true;
0136             href = QString();
0137         } else if (linkify_pmatch(out, n, QStringLiteral("http://"))) {
0138             n += 7;
0139             isUrl = true;
0140             href = QString();
0141         } else if (linkify_pmatch(out, n, QStringLiteral("https://"))) {
0142             n += 8;
0143             isUrl = true;
0144             href = QString();
0145         } else if (linkify_pmatch(out, n, QStringLiteral("ftp://"))) {
0146             n += 6;
0147             isUrl = true;
0148             href = QString();
0149         } else if (linkify_pmatch(out, n, QStringLiteral("news://"))) {
0150             n += 7;
0151             isUrl = true;
0152             href = QString();
0153         } else if (linkify_pmatch(out, n, QStringLiteral("ed2k://"))) {
0154             n += 7;
0155             isUrl = true;
0156             href = QString();
0157         } else if (linkify_pmatch(out, n, QStringLiteral("magnet:"))) {
0158             n += 7;
0159             isUrl = true;
0160             href = QString();
0161         } else if (linkify_pmatch(out, n, QStringLiteral("www."))) {
0162             isUrl = true;
0163             href = QStringLiteral("http://");
0164         } else if (linkify_pmatch(out, n, QStringLiteral("ftp."))) {
0165             isUrl = true;
0166             href = QStringLiteral("ftp://");
0167         } else if (linkify_pmatch(out, n, QStringLiteral("@"))) {
0168             isAtStyle = true;
0169             href = QStringLiteral("x-psi-atstyle:");
0170         }
0171 
0172         if (isUrl) {
0173             // make sure the previous char is not alphanumeric
0174             if (x1 > 0 && out.at(x1 - 1).isLetterOrNumber())
0175                 continue;
0176 
0177             // find whitespace (or end)
0178             QMap<QChar, int> brackets;
0179             brackets[QLatin1Char('(')] = brackets[QLatin1Char(')')] = brackets[QLatin1Char('[')] = brackets[QLatin1Char(']')] = brackets[QLatin1Char('{')] =
0180                 brackets[QLatin1Char('}')] = 0;
0181             QMap<QChar, QChar> openingBracket;
0182             openingBracket[QLatin1Char(')')] = QLatin1Char('(');
0183             openingBracket[QLatin1Char(']')] = QLatin1Char('[');
0184             openingBracket[QLatin1Char('}')] = QLatin1Char('{');
0185             for (x2 = n; x2 < (int)out.length(); ++x2) {
0186                 if (out.at(x2).isSpace() || linkify_isOneOf(out.at(x2), QStringLiteral("\"\'`<>")) || linkify_pmatch(out, x2, QStringLiteral("&quot;"))
0187                     || linkify_pmatch(out, x2, QStringLiteral("&apos;")) || linkify_pmatch(out, x2, QStringLiteral("&gt;"))
0188                     || linkify_pmatch(out, x2, QStringLiteral("&lt;"))) {
0189                     break;
0190                 }
0191                 if (brackets.keys().contains(out.at(x2))) {
0192                     ++brackets[out.at(x2)];
0193                 }
0194             }
0195             int len = x2 - x1;
0196             QString pre = resolveEntities(out.mid(x1, x2 - x1));
0197 
0198             // go backward hacking off unwanted punctuation
0199             int cutoff;
0200             for (cutoff = pre.length() - 1; cutoff >= 0; --cutoff) {
0201                 if (!linkify_isOneOf(pre.at(cutoff), QStringLiteral("!?,.()[]{}<>\"")))
0202                     break;
0203                 if (linkify_isOneOf(pre.at(cutoff), QStringLiteral(")]}")) && brackets[pre.at(cutoff)] - brackets[openingBracket[pre.at(cutoff)]] <= 0) {
0204                     break; // in theory, there could be == above, but these are urls, not math ;)
0205                 }
0206                 if (brackets.keys().contains(pre.at(cutoff))) {
0207                     --brackets[pre.at(cutoff)];
0208                 }
0209             }
0210             ++cutoff;
0211             //++x2;
0212 
0213             link = pre.mid(0, cutoff);
0214             if (!linkify_okUrl(link)) {
0215                 n = x1 + link.length();
0216                 continue;
0217             }
0218             href += link;
0219             // attributes need to be encoded too.
0220             href = href.toHtmlEscaped();
0221             href = linkify_htmlsafe(href);
0222             // printf("link: [%s], href=[%s]\n", link.latin1(), href.latin1());
0223             linked = QStringLiteral("<a href=\"%1\">").arg(href) + QUrl{link}.toDisplayString(QUrl::RemoveQuery) + QStringLiteral("</a>")
0224                 + pre.mid(cutoff).toHtmlEscaped();
0225             out.replace(x1, len, linked);
0226             n = x1 + linked.length() - 1;
0227         } else if (isAtStyle) {
0228             // go backward till we find the beginning
0229             if (x1 == 0)
0230                 continue;
0231             --x1;
0232             for (; x1 >= 0; --x1) {
0233                 if (!linkify_isOneOf(out.at(x1), QStringLiteral("_.-+")) && !out.at(x1).isLetterOrNumber())
0234                     break;
0235             }
0236             ++x1;
0237 
0238             // go forward till we find the end
0239             x2 = n + 1;
0240             for (; x2 < (int)out.length(); ++x2) {
0241                 if (!linkify_isOneOf(out.at(x2), QStringLiteral("_.-+")) && !out.at(x2).isLetterOrNumber())
0242                     break;
0243             }
0244 
0245             int len = x2 - x1;
0246             link = out.mid(x1, len);
0247             // link = resolveEntities(link);
0248 
0249             if (!linkify_okEmail(link)) {
0250                 n = x1 + link.length();
0251                 continue;
0252             }
0253 
0254             href += link;
0255             // printf("link: [%s], href=[%s]\n", link.latin1(), href.latin1());
0256             linked = QStringLiteral("<a href=\"%1\">").arg(href) + link + QStringLiteral("</a>");
0257             out.replace(x1, len, linked);
0258             n = x1 + linked.length() - 1;
0259         }
0260     }
0261 
0262     return out;
0263 }