File indexing completed on 2024-05-12 05:28:16
0001 // SPDX-FileCopyrightText: 2017 Christian Mollekopf <mollekopf@kolabsys.com> 0002 // SPDX-License-Identifier: LGPL-2.0-or-later 0003 0004 #include "htmlutils.h" 0005 0006 #include <QMap> 0007 #include <QString> 0008 #include <QUrl> 0009 0010 static QString resolveEntities(const QString &in) 0011 { 0012 QString out; 0013 0014 for (int i = 0; i < (int)in.length(); ++i) { 0015 if (in[i] == QLatin1Char('&')) { 0016 // find a semicolon 0017 ++i; 0018 int n = in.indexOf(QLatin1Char(';'), i); 0019 if (n == -1) { 0020 break; 0021 } 0022 QString type = in.mid(i, (n - i)); 0023 i = n; // should be n+1, but we'll let the loop increment do it 0024 0025 if (type == QLatin1String("amp")) { 0026 out += QLatin1Char('&'); 0027 } else if (type == QLatin1String("lt")) 0028 out += QLatin1Char('<'); 0029 else if (type == QLatin1String("gt")) 0030 out += QLatin1Char('>'); 0031 else if (type == QLatin1String("quot")) 0032 out += QLatin1Char('\"'); 0033 else if (type == QLatin1String("apos")) 0034 out += QLatin1Char('\''); 0035 else if (type == QLatin1String("nbsp")) 0036 out += 0xa0; 0037 } else { 0038 out += in[i]; 0039 } 0040 } 0041 0042 return out; 0043 } 0044 0045 static bool linkify_pmatch(const QString &str1, int at, const QString &str2) 0046 { 0047 if (str2.length() > (str1.length() - at)) 0048 return false; 0049 0050 for (int n = 0; n < (int)str2.length(); ++n) { 0051 if (str1.at(n + at).toLower() != str2.at(n).toLower()) 0052 return false; 0053 } 0054 0055 return true; 0056 } 0057 0058 static bool linkify_isOneOf(const QChar &c, const QString &charlist) 0059 { 0060 for (int i = 0; i < (int)charlist.length(); ++i) { 0061 if (c == charlist.at(i)) 0062 return true; 0063 } 0064 0065 return false; 0066 } 0067 0068 // encodes a few dangerous html characters 0069 static QString linkify_htmlsafe(const QString &in) 0070 { 0071 QString out; 0072 0073 for (int n = 0; n < in.length(); ++n) { 0074 if (linkify_isOneOf(in.at(n), QStringLiteral("\"\'`<>"))) { 0075 // hex encode 0076 QString hex; 0077 hex.asprintf("%%%02X", in.at(n).toLatin1()); 0078 out.append(hex); 0079 } else { 0080 out.append(in.at(n)); 0081 } 0082 } 0083 0084 return out; 0085 } 0086 0087 static bool linkify_okUrl(const QString &url) 0088 { 0089 if (url.at(url.length() - 1) == QLatin1Char('.')) 0090 return false; 0091 0092 return true; 0093 } 0094 0095 static bool linkify_okEmail(const QString &addy) 0096 { 0097 // this makes sure that there is an '@' and a '.' after it, and that there is 0098 // at least one char for each of the three sections 0099 int n = addy.indexOf(QLatin1Char('@')); 0100 if (n == -1 || n == 0) 0101 return false; 0102 int d = addy.indexOf(QLatin1Char('.'), n + 1); 0103 if (d == -1 || d == 0) 0104 return false; 0105 if ((addy.length() - 1) - d <= 0) 0106 return false; 0107 if (addy.indexOf(QStringLiteral("..")) != -1) 0108 return false; 0109 0110 return true; 0111 } 0112 0113 /** 0114 * takes a richtext string and heuristically adds links for uris of common protocols 0115 * @return a richtext string with link markup added 0116 */ 0117 QString HtmlUtils::linkify(const QString &in) 0118 { 0119 QString out = in; 0120 int x1, x2; 0121 bool isUrl, isAtStyle; 0122 QString linked, link, href; 0123 0124 for (int n = 0; n < (int)out.length(); ++n) { 0125 isUrl = false; 0126 isAtStyle = false; 0127 x1 = n; 0128 0129 if (linkify_pmatch(out, n, QStringLiteral("xmpp:"))) { 0130 n += 5; 0131 isUrl = true; 0132 href = QString(); 0133 } else if (linkify_pmatch(out, n, QStringLiteral("mailto:"))) { 0134 n += 7; 0135 isUrl = true; 0136 href = QString(); 0137 } else if (linkify_pmatch(out, n, QStringLiteral("http://"))) { 0138 n += 7; 0139 isUrl = true; 0140 href = QString(); 0141 } else if (linkify_pmatch(out, n, QStringLiteral("https://"))) { 0142 n += 8; 0143 isUrl = true; 0144 href = QString(); 0145 } else if (linkify_pmatch(out, n, QStringLiteral("ftp://"))) { 0146 n += 6; 0147 isUrl = true; 0148 href = QString(); 0149 } else if (linkify_pmatch(out, n, QStringLiteral("news://"))) { 0150 n += 7; 0151 isUrl = true; 0152 href = QString(); 0153 } else if (linkify_pmatch(out, n, QStringLiteral("ed2k://"))) { 0154 n += 7; 0155 isUrl = true; 0156 href = QString(); 0157 } else if (linkify_pmatch(out, n, QStringLiteral("magnet:"))) { 0158 n += 7; 0159 isUrl = true; 0160 href = QString(); 0161 } else if (linkify_pmatch(out, n, QStringLiteral("www."))) { 0162 isUrl = true; 0163 href = QStringLiteral("http://"); 0164 } else if (linkify_pmatch(out, n, QStringLiteral("ftp."))) { 0165 isUrl = true; 0166 href = QStringLiteral("ftp://"); 0167 } else if (linkify_pmatch(out, n, QStringLiteral("@"))) { 0168 isAtStyle = true; 0169 href = QStringLiteral("x-psi-atstyle:"); 0170 } 0171 0172 if (isUrl) { 0173 // make sure the previous char is not alphanumeric 0174 if (x1 > 0 && out.at(x1 - 1).isLetterOrNumber()) 0175 continue; 0176 0177 // find whitespace (or end) 0178 QMap<QChar, int> brackets; 0179 brackets[QLatin1Char('(')] = brackets[QLatin1Char(')')] = brackets[QLatin1Char('[')] = brackets[QLatin1Char(']')] = brackets[QLatin1Char('{')] = 0180 brackets[QLatin1Char('}')] = 0; 0181 QMap<QChar, QChar> openingBracket; 0182 openingBracket[QLatin1Char(')')] = QLatin1Char('('); 0183 openingBracket[QLatin1Char(']')] = QLatin1Char('['); 0184 openingBracket[QLatin1Char('}')] = QLatin1Char('{'); 0185 for (x2 = n; x2 < (int)out.length(); ++x2) { 0186 if (out.at(x2).isSpace() || linkify_isOneOf(out.at(x2), QStringLiteral("\"\'`<>")) || linkify_pmatch(out, x2, QStringLiteral(""")) 0187 || linkify_pmatch(out, x2, QStringLiteral("'")) || linkify_pmatch(out, x2, QStringLiteral(">")) 0188 || linkify_pmatch(out, x2, QStringLiteral("<"))) { 0189 break; 0190 } 0191 if (brackets.keys().contains(out.at(x2))) { 0192 ++brackets[out.at(x2)]; 0193 } 0194 } 0195 int len = x2 - x1; 0196 QString pre = resolveEntities(out.mid(x1, x2 - x1)); 0197 0198 // go backward hacking off unwanted punctuation 0199 int cutoff; 0200 for (cutoff = pre.length() - 1; cutoff >= 0; --cutoff) { 0201 if (!linkify_isOneOf(pre.at(cutoff), QStringLiteral("!?,.()[]{}<>\""))) 0202 break; 0203 if (linkify_isOneOf(pre.at(cutoff), QStringLiteral(")]}")) && brackets[pre.at(cutoff)] - brackets[openingBracket[pre.at(cutoff)]] <= 0) { 0204 break; // in theory, there could be == above, but these are urls, not math ;) 0205 } 0206 if (brackets.keys().contains(pre.at(cutoff))) { 0207 --brackets[pre.at(cutoff)]; 0208 } 0209 } 0210 ++cutoff; 0211 //++x2; 0212 0213 link = pre.mid(0, cutoff); 0214 if (!linkify_okUrl(link)) { 0215 n = x1 + link.length(); 0216 continue; 0217 } 0218 href += link; 0219 // attributes need to be encoded too. 0220 href = href.toHtmlEscaped(); 0221 href = linkify_htmlsafe(href); 0222 // printf("link: [%s], href=[%s]\n", link.latin1(), href.latin1()); 0223 linked = QStringLiteral("<a href=\"%1\">").arg(href) + QUrl{link}.toDisplayString(QUrl::RemoveQuery) + QStringLiteral("</a>") 0224 + pre.mid(cutoff).toHtmlEscaped(); 0225 out.replace(x1, len, linked); 0226 n = x1 + linked.length() - 1; 0227 } else if (isAtStyle) { 0228 // go backward till we find the beginning 0229 if (x1 == 0) 0230 continue; 0231 --x1; 0232 for (; x1 >= 0; --x1) { 0233 if (!linkify_isOneOf(out.at(x1), QStringLiteral("_.-+")) && !out.at(x1).isLetterOrNumber()) 0234 break; 0235 } 0236 ++x1; 0237 0238 // go forward till we find the end 0239 x2 = n + 1; 0240 for (; x2 < (int)out.length(); ++x2) { 0241 if (!linkify_isOneOf(out.at(x2), QStringLiteral("_.-+")) && !out.at(x2).isLetterOrNumber()) 0242 break; 0243 } 0244 0245 int len = x2 - x1; 0246 link = out.mid(x1, len); 0247 // link = resolveEntities(link); 0248 0249 if (!linkify_okEmail(link)) { 0250 n = x1 + link.length(); 0251 continue; 0252 } 0253 0254 href += link; 0255 // printf("link: [%s], href=[%s]\n", link.latin1(), href.latin1()); 0256 linked = QStringLiteral("<a href=\"%1\">").arg(href) + link + QStringLiteral("</a>"); 0257 out.replace(x1, len, linked); 0258 n = x1 + linked.length() - 1; 0259 } 0260 } 0261 0262 return out; 0263 }