File indexing completed on 2024-12-22 05:05:19
0001 // SPDX-FileCopyrightText: 2017 Christian Mollekopf <mollekopf@kolabsys.com> 0002 // SPDX-License-Identifier: LGPL-2.0-or-later 0003 0004 #include "htmlutils.h" 0005 0006 #include <QMap> 0007 #include <QUrl> 0008 0009 static QString resolveEntities(const QString &in) 0010 { 0011 QString out; 0012 0013 for (int i = 0; i < (int)in.length(); ++i) { 0014 if (in[i] == QLatin1Char('&')) { 0015 // find a semicolon 0016 ++i; 0017 int n = in.indexOf(QLatin1Char(';'), i); 0018 if (n == -1) { 0019 break; 0020 } 0021 QString type = in.mid(i, (n - i)); 0022 i = n; // should be n+1, but we'll let the loop increment do it 0023 0024 if (type == QLatin1StringView("amp")) { 0025 out += QLatin1Char('&'); 0026 } else if (type == QLatin1StringView("lt")) 0027 out += QLatin1Char('<'); 0028 else if (type == QLatin1StringView("gt")) 0029 out += QLatin1Char('>'); 0030 else if (type == QLatin1StringView("quot")) 0031 out += QLatin1Char('\"'); 0032 else if (type == QLatin1StringView("apos")) 0033 out += QLatin1Char('\''); 0034 else if (type == QLatin1StringView("nbsp")) 0035 out += QChar(0xa0); 0036 } else { 0037 out += in[i]; 0038 } 0039 } 0040 0041 return out; 0042 } 0043 0044 static bool linkify_pmatch(const QString &str1, int at, const QString &str2) 0045 { 0046 if (str2.length() > (str1.length() - at)) 0047 return false; 0048 0049 for (int n = 0; n < (int)str2.length(); ++n) { 0050 if (str1.at(n + at).toLower() != str2.at(n).toLower()) 0051 return false; 0052 } 0053 0054 return true; 0055 } 0056 0057 static bool linkify_isOneOf(const QChar &c, const QString &charlist) 0058 { 0059 for (int i = 0; i < (int)charlist.length(); ++i) { 0060 if (c == charlist.at(i)) 0061 return true; 0062 } 0063 0064 return false; 0065 } 0066 0067 // encodes a few dangerous html characters 0068 static QString linkify_htmlsafe(const QString &in) 0069 { 0070 QString out; 0071 0072 for (int n = 0; n < in.length(); ++n) { 0073 if (linkify_isOneOf(in.at(n), QStringLiteral("\"\'`<>"))) { 0074 // hex encode 0075 QString hex; 0076 hex.asprintf("%%%02X", in.at(n).toLatin1()); 0077 out.append(hex); 0078 } else { 0079 out.append(in.at(n)); 0080 } 0081 } 0082 0083 return out; 0084 } 0085 0086 static bool linkify_okUrl(const QString &url) 0087 { 0088 if (url.at(url.length() - 1) == QLatin1Char('.')) 0089 return false; 0090 0091 return true; 0092 } 0093 0094 static bool linkify_okEmail(const QString &addy) 0095 { 0096 // this makes sure that there is an '@' and a '.' after it, and that there is 0097 // at least one char for each of the three sections 0098 int n = addy.indexOf(QLatin1Char('@')); 0099 if (n == -1 || n == 0) 0100 return false; 0101 int d = addy.indexOf(QLatin1Char('.'), n + 1); 0102 if (d == -1 || d == 0) 0103 return false; 0104 if ((addy.length() - 1) - d <= 0) 0105 return false; 0106 if (addy.indexOf(QStringLiteral("..")) != -1) 0107 return false; 0108 0109 return true; 0110 } 0111 0112 /** 0113 * takes a richtext string and heuristically adds links for uris of common protocols 0114 * @return a richtext string with link markup added 0115 */ 0116 QString MimeTreeParser::linkify(const QString &in) 0117 { 0118 QString out = in; 0119 int x1, x2; 0120 QString linked, link, href; 0121 0122 for (int n = 0; n < (int)out.length(); ++n) { 0123 bool isUrl = false; 0124 bool isAtStyle = false; 0125 x1 = n; 0126 0127 if (linkify_pmatch(out, n, QStringLiteral("xmpp:"))) { 0128 n += 5; 0129 isUrl = true; 0130 href = QString(); 0131 } else if (linkify_pmatch(out, n, QStringLiteral("mailto:"))) { 0132 n += 7; 0133 isUrl = true; 0134 href = QString(); 0135 } else if (linkify_pmatch(out, n, QStringLiteral("http://"))) { 0136 n += 7; 0137 isUrl = true; 0138 href = QString(); 0139 } else if (linkify_pmatch(out, n, QStringLiteral("https://"))) { 0140 n += 8; 0141 isUrl = true; 0142 href = QString(); 0143 } else if (linkify_pmatch(out, n, QStringLiteral("ftp://"))) { 0144 n += 6; 0145 isUrl = true; 0146 href = QString(); 0147 } else if (linkify_pmatch(out, n, QStringLiteral("news://"))) { 0148 n += 7; 0149 isUrl = true; 0150 href = QString(); 0151 } else if (linkify_pmatch(out, n, QStringLiteral("ed2k://"))) { 0152 n += 7; 0153 isUrl = true; 0154 href = QString(); 0155 } else if (linkify_pmatch(out, n, QStringLiteral("magnet:"))) { 0156 n += 7; 0157 isUrl = true; 0158 href = QString(); 0159 } else if (linkify_pmatch(out, n, QStringLiteral("www."))) { 0160 isUrl = true; 0161 href = QStringLiteral("http://"); 0162 } else if (linkify_pmatch(out, n, QStringLiteral("ftp."))) { 0163 isUrl = true; 0164 href = QStringLiteral("ftp://"); 0165 } else if (linkify_pmatch(out, n, QStringLiteral("@"))) { 0166 isAtStyle = true; 0167 href = QStringLiteral("x-psi-atstyle:"); 0168 } 0169 0170 if (isUrl) { 0171 // make sure the previous char is not alphanumeric 0172 if (x1 > 0 && out.at(x1 - 1).isLetterOrNumber()) 0173 continue; 0174 0175 // find whitespace (or end) 0176 QMap<QChar, int> brackets; 0177 brackets[QLatin1Char('(')] = brackets[QLatin1Char(')')] = brackets[QLatin1Char('[')] = brackets[QLatin1Char(']')] = brackets[QLatin1Char('{')] = 0178 brackets[QLatin1Char('}')] = 0; 0179 QMap<QChar, QChar> openingBracket; 0180 openingBracket[QLatin1Char(')')] = QLatin1Char('('); 0181 openingBracket[QLatin1Char(']')] = QLatin1Char('['); 0182 openingBracket[QLatin1Char('}')] = QLatin1Char('{'); 0183 for (x2 = n; x2 < (int)out.length(); ++x2) { 0184 if (out.at(x2).isSpace() || linkify_isOneOf(out.at(x2), QStringLiteral("\"\'`<>")) || linkify_pmatch(out, x2, QStringLiteral(""")) 0185 || linkify_pmatch(out, x2, QStringLiteral("'")) || linkify_pmatch(out, x2, QStringLiteral(">")) 0186 || linkify_pmatch(out, x2, QStringLiteral("<"))) { 0187 break; 0188 } 0189 if (brackets.contains(out.at(x2))) { 0190 ++brackets[out.at(x2)]; 0191 } 0192 } 0193 int len = x2 - x1; 0194 QString pre = resolveEntities(out.mid(x1, x2 - x1)); 0195 0196 // go backward hacking off unwanted punctuation 0197 int cutoff; 0198 for (cutoff = pre.length() - 1; cutoff >= 0; --cutoff) { 0199 if (!linkify_isOneOf(pre.at(cutoff), QStringLiteral("!?,.()[]{}<>\""))) 0200 break; 0201 if (linkify_isOneOf(pre.at(cutoff), QStringLiteral(")]}")) && brackets[pre.at(cutoff)] - brackets[openingBracket[pre.at(cutoff)]] <= 0) { 0202 break; // in theory, there could be == above, but these are urls, not math ;) 0203 } 0204 if (brackets.contains(pre.at(cutoff))) { 0205 --brackets[pre.at(cutoff)]; 0206 } 0207 } 0208 ++cutoff; 0209 //++x2; 0210 0211 link = pre.mid(0, cutoff); 0212 if (!linkify_okUrl(link)) { 0213 n = x1 + link.length(); 0214 continue; 0215 } 0216 href += link; 0217 // attributes need to be encoded too. 0218 href = href.toHtmlEscaped(); 0219 href = linkify_htmlsafe(href); 0220 // printf("link: [%s], href=[%s]\n", link.latin1(), href.latin1()); 0221 linked = QStringLiteral("<a href=\"%1\">").arg(href) + QUrl{link}.toDisplayString(QUrl::RemoveQuery) + QStringLiteral("</a>") 0222 + pre.mid(cutoff).toHtmlEscaped(); 0223 out.replace(x1, len, linked); 0224 n = x1 + linked.length() - 1; 0225 } else if (isAtStyle) { 0226 // go backward till we find the beginning 0227 if (x1 == 0) 0228 continue; 0229 --x1; 0230 for (; x1 >= 0; --x1) { 0231 if (!linkify_isOneOf(out.at(x1), QStringLiteral("_.-+")) && !out.at(x1).isLetterOrNumber()) 0232 break; 0233 } 0234 ++x1; 0235 0236 // go forward till we find the end 0237 x2 = n + 1; 0238 for (; x2 < (int)out.length(); ++x2) { 0239 if (!linkify_isOneOf(out.at(x2), QStringLiteral("_.-+")) && !out.at(x2).isLetterOrNumber()) 0240 break; 0241 } 0242 0243 int len = x2 - x1; 0244 link = out.mid(x1, len); 0245 // link = resolveEntities(link); 0246 0247 if (!linkify_okEmail(link)) { 0248 n = x1 + link.length(); 0249 continue; 0250 } 0251 0252 href += link; 0253 // printf("link: [%s], href=[%s]\n", link.latin1(), href.latin1()); 0254 linked = QStringLiteral("<a href=\"%1\">").arg(href) + link + QStringLiteral("</a>"); 0255 out.replace(x1, len, linked); 0256 n = x1 + linked.length() - 1; 0257 } 0258 } 0259 0260 return out; 0261 }