File indexing completed on 2025-01-19 04:51:56
0001 /* 0002 Copyright (c) 2017 Christian Mollekopf <mollekopf@kolabsys.com> 0003 0004 This library is free software; you can redistribute it and/or modify it 0005 under the terms of the GNU Library General Public License as published by 0006 the Free Software Foundation; either version 2 of the License, or (at your 0007 option) any later version. 0008 0009 This library is distributed in the hope that it will be useful, but WITHOUT 0010 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 0011 FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public 0012 License for more details. 0013 0014 You should have received a copy of the GNU Library General Public License 0015 along with this library; see the file COPYING.LIB. If not, write to the 0016 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 0017 02110-1301, USA. 0018 */ 0019 #include "htmlutils.h" 0020 0021 #include <QUrl> 0022 #include <QMap> 0023 0024 static QString resolveEntities(const QString &in) 0025 { 0026 QString out; 0027 0028 for(int i = 0; i < (int)in.length(); ++i) { 0029 if(in[i] == '&') { 0030 // find a semicolon 0031 ++i; 0032 int n = in.indexOf(';', i); 0033 if(n == -1) 0034 break; 0035 QString type = in.mid(i, (n-i)); 0036 i = n; // should be n+1, but we'll let the loop increment do it 0037 0038 if(type == "amp") 0039 out += '&'; 0040 else if(type == "lt") 0041 out += '<'; 0042 else if(type == "gt") 0043 out += '>'; 0044 else if(type == "quot") 0045 out += '\"'; 0046 else if(type == "apos") 0047 out += '\''; 0048 else if(type == "nbsp") 0049 out += 0xa0; 0050 } else { 0051 out += in[i]; 0052 } 0053 } 0054 0055 return out; 0056 } 0057 0058 0059 static bool linkify_pmatch(const QString &str1, int at, const QString &str2) 0060 { 0061 if(str2.length() > (str1.length()-at)) 0062 return false; 0063 0064 for(int n = 0; n < (int)str2.length(); ++n) { 0065 if(str1.at(n+at).toLower() != str2.at(n).toLower()) 0066 return false; 0067 } 0068 0069 return true; 0070 } 0071 0072 static bool linkify_isOneOf(const QChar &c, const QString &charlist) 0073 { 0074 for(int i = 0; i < (int)charlist.length(); ++i) { 0075 if(c == charlist.at(i)) 0076 return true; 0077 } 0078 0079 return false; 0080 } 0081 0082 // encodes a few dangerous html characters 0083 static QString linkify_htmlsafe(const QString &in) 0084 { 0085 QString out; 0086 0087 for(int n = 0; n < in.length(); ++n) { 0088 if(linkify_isOneOf(in.at(n), "\"\'`<>")) { 0089 // hex encode 0090 QString hex; 0091 hex.asprintf("%%%02X", in.at(n).toLatin1()); 0092 out.append(hex); 0093 } else { 0094 out.append(in.at(n)); 0095 } 0096 } 0097 0098 return out; 0099 } 0100 0101 static bool linkify_okUrl(const QString &url) 0102 { 0103 if(url.at(url.length()-1) == '.') 0104 return false; 0105 0106 return true; 0107 } 0108 0109 static bool linkify_okEmail(const QString &addy) 0110 { 0111 // this makes sure that there is an '@' and a '.' after it, and that there is 0112 // at least one char for each of the three sections 0113 int n = addy.indexOf('@'); 0114 if(n == -1 || n == 0) 0115 return false; 0116 int d = addy.indexOf('.', n+1); 0117 if(d == -1 || d == 0) 0118 return false; 0119 if((addy.length()-1) - d <= 0) 0120 return false; 0121 if(addy.indexOf("..") != -1) 0122 return false; 0123 0124 return true; 0125 } 0126 0127 /** 0128 * takes a richtext string and heuristically adds links for uris of common protocols 0129 * @return a richtext string with link markup added 0130 */ 0131 QString HtmlUtils::linkify(const QString &in) 0132 { 0133 QString out = in; 0134 int x1, x2; 0135 bool isUrl, isAtStyle; 0136 QString linked, link, href; 0137 0138 for(int n = 0; n < (int)out.length(); ++n) { 0139 isUrl = false; 0140 isAtStyle = false; 0141 x1 = n; 0142 0143 if(linkify_pmatch(out, n, "xmpp:")) { 0144 n += 5; 0145 isUrl = true; 0146 href = ""; 0147 } 0148 else if(linkify_pmatch(out, n, "mailto:")) { 0149 n += 7; 0150 isUrl = true; 0151 href = ""; 0152 } 0153 else if(linkify_pmatch(out, n, "http://")) { 0154 n += 7; 0155 isUrl = true; 0156 href = ""; 0157 } 0158 else if(linkify_pmatch(out, n, "https://")) { 0159 n += 8; 0160 isUrl = true; 0161 href = ""; 0162 } 0163 else if(linkify_pmatch(out, n, "ftp://")) { 0164 n += 6; 0165 isUrl = true; 0166 href = ""; 0167 } 0168 else if(linkify_pmatch(out, n, "news://")) { 0169 n += 7; 0170 isUrl = true; 0171 href = ""; 0172 } 0173 else if (linkify_pmatch(out, n, "ed2k://")) { 0174 n += 7; 0175 isUrl = true; 0176 href = ""; 0177 } 0178 else if (linkify_pmatch(out, n, "magnet:")) { 0179 n += 7; 0180 isUrl = true; 0181 href = ""; 0182 } 0183 else if(linkify_pmatch(out, n, "www.")) { 0184 isUrl = true; 0185 href = "http://"; 0186 } 0187 else if(linkify_pmatch(out, n, "ftp.")) { 0188 isUrl = true; 0189 href = "ftp://"; 0190 } 0191 else if(linkify_pmatch(out, n, "@")) { 0192 isAtStyle = true; 0193 href = "x-psi-atstyle:"; 0194 } 0195 0196 if(isUrl) { 0197 // make sure the previous char is not alphanumeric 0198 if(x1 > 0 && out.at(x1-1).isLetterOrNumber()) 0199 continue; 0200 0201 // find whitespace (or end) 0202 QMap<QChar, int> brackets; 0203 brackets['('] = brackets[')'] = brackets['['] = brackets[']'] = brackets['{'] = brackets['}'] = 0; 0204 QMap<QChar, QChar> openingBracket; 0205 openingBracket[')'] = '('; 0206 openingBracket[']'] = '['; 0207 openingBracket['}'] = '{'; 0208 for(x2 = n; x2 < (int)out.length(); ++x2) { 0209 if(out.at(x2).isSpace() || linkify_isOneOf(out.at(x2), "\"\'`<>") 0210 || linkify_pmatch(out, x2, """) || linkify_pmatch(out, x2, "'") 0211 || linkify_pmatch(out, x2, ">") || linkify_pmatch(out, x2, "<") ) { 0212 break; 0213 } 0214 if(brackets.keys().contains(out.at(x2))) { 0215 ++brackets[out.at(x2)]; 0216 } 0217 } 0218 int len = x2-x1; 0219 QString pre = resolveEntities(out.mid(x1, x2-x1)); 0220 0221 // go backward hacking off unwanted punctuation 0222 int cutoff; 0223 for(cutoff = pre.length()-1; cutoff >= 0; --cutoff) { 0224 if(!linkify_isOneOf(pre.at(cutoff), "!?,.()[]{}<>\"")) 0225 break; 0226 if(linkify_isOneOf(pre.at(cutoff), ")]}") 0227 && brackets[pre.at(cutoff)] - brackets[openingBracket[pre.at(cutoff)]] <= 0 ) { 0228 break; // in theory, there could be == above, but these are urls, not math ;) 0229 } 0230 if(brackets.keys().contains(pre.at(cutoff))) { 0231 --brackets[pre.at(cutoff)]; 0232 } 0233 0234 } 0235 ++cutoff; 0236 //++x2; 0237 0238 link = pre.mid(0, cutoff); 0239 if(!linkify_okUrl(link)) { 0240 n = x1 + link.length(); 0241 continue; 0242 } 0243 href += link; 0244 // attributes need to be encoded too. 0245 href = href.toHtmlEscaped(); 0246 href = linkify_htmlsafe(href); 0247 //printf("link: [%s], href=[%s]\n", link.latin1(), href.latin1()); 0248 linked = QString("<a href=\"%1\">").arg(href) + QUrl{link}.toDisplayString(QUrl::RemoveQuery) + "</a>" + pre.mid(cutoff).toHtmlEscaped(); 0249 out.replace(x1, len, linked); 0250 n = x1 + linked.length() - 1; 0251 } else if(isAtStyle) { 0252 // go backward till we find the beginning 0253 if(x1 == 0) 0254 continue; 0255 --x1; 0256 for(; x1 >= 0; --x1) { 0257 if(!linkify_isOneOf(out.at(x1), "_.-+") && !out.at(x1).isLetterOrNumber()) 0258 break; 0259 } 0260 ++x1; 0261 0262 // go forward till we find the end 0263 x2 = n + 1; 0264 for(; x2 < (int)out.length(); ++x2) { 0265 if(!linkify_isOneOf(out.at(x2), "_.-+") && !out.at(x2).isLetterOrNumber()) 0266 break; 0267 } 0268 0269 int len = x2-x1; 0270 link = out.mid(x1, len); 0271 //link = resolveEntities(link); 0272 0273 if(!linkify_okEmail(link)) { 0274 n = x1 + link.length(); 0275 continue; 0276 } 0277 0278 href += link; 0279 //printf("link: [%s], href=[%s]\n", link.latin1(), href.latin1()); 0280 linked = QString("<a href=\"%1\">").arg(href) + link + "</a>"; 0281 out.replace(x1, len, linked); 0282 n = x1 + linked.length() - 1; 0283 } 0284 } 0285 0286 return out; 0287 }