File indexing completed on 2025-01-19 04:51:56

0001 /*
0002     Copyright (c) 2017 Christian Mollekopf <mollekopf@kolabsys.com>
0003 
0004     This library is free software; you can redistribute it and/or modify it
0005     under the terms of the GNU Library General Public License as published by
0006     the Free Software Foundation; either version 2 of the License, or (at your
0007     option) any later version.
0008 
0009     This library is distributed in the hope that it will be useful, but WITHOUT
0010     ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
0011     FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
0012     License for more details.
0013 
0014     You should have received a copy of the GNU Library General Public License
0015     along with this library; see the file COPYING.LIB.  If not, write to the
0016     Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
0017     02110-1301, USA.
0018 */
0019 #include "htmlutils.h"
0020 
0021 #include <QUrl>
0022 #include <QMap>
0023 
0024 static QString resolveEntities(const QString &in)
0025 {
0026     QString out;
0027 
0028     for(int i = 0; i < (int)in.length(); ++i) {
0029         if(in[i] == '&') {
0030             // find a semicolon
0031             ++i;
0032             int n = in.indexOf(';', i);
0033             if(n == -1)
0034                 break;
0035             QString type = in.mid(i, (n-i));
0036             i = n; // should be n+1, but we'll let the loop increment do it
0037 
0038             if(type == "amp")
0039                 out += '&';
0040             else if(type == "lt")
0041                 out += '<';
0042             else if(type == "gt")
0043                 out += '>';
0044             else if(type == "quot")
0045                 out += '\"';
0046             else if(type == "apos")
0047                 out += '\'';
0048             else if(type == "nbsp")
0049                 out += 0xa0;
0050         } else {
0051             out += in[i];
0052         }
0053     }
0054 
0055     return out;
0056 }
0057 
0058 
0059 static bool linkify_pmatch(const QString &str1, int at, const QString &str2)
0060 {
0061     if(str2.length() > (str1.length()-at))
0062         return false;
0063 
0064     for(int n = 0; n < (int)str2.length(); ++n) {
0065         if(str1.at(n+at).toLower() != str2.at(n).toLower())
0066             return false;
0067     }
0068 
0069     return true;
0070 }
0071 
0072 static bool linkify_isOneOf(const QChar &c, const QString &charlist)
0073 {
0074     for(int i = 0; i < (int)charlist.length(); ++i) {
0075         if(c == charlist.at(i))
0076             return true;
0077     }
0078 
0079     return false;
0080 }
0081 
0082 // encodes a few dangerous html characters
0083 static QString linkify_htmlsafe(const QString &in)
0084 {
0085     QString out;
0086 
0087     for(int n = 0; n < in.length(); ++n) {
0088         if(linkify_isOneOf(in.at(n), "\"\'`<>")) {
0089             // hex encode
0090             QString hex;
0091             hex.asprintf("%%%02X", in.at(n).toLatin1());
0092             out.append(hex);
0093         } else {
0094             out.append(in.at(n));
0095         }
0096     }
0097 
0098     return out;
0099 }
0100 
0101 static bool linkify_okUrl(const QString &url)
0102 {
0103     if(url.at(url.length()-1) == '.')
0104         return false;
0105 
0106     return true;
0107 }
0108 
0109 static bool linkify_okEmail(const QString &addy)
0110 {
0111     // this makes sure that there is an '@' and a '.' after it, and that there is
0112     // at least one char for each of the three sections
0113     int n = addy.indexOf('@');
0114     if(n == -1 || n == 0)
0115         return false;
0116     int d = addy.indexOf('.', n+1);
0117     if(d == -1 || d == 0)
0118         return false;
0119     if((addy.length()-1) - d <= 0)
0120         return false;
0121     if(addy.indexOf("..") != -1)
0122         return false;
0123 
0124     return true;
0125 }
0126 
0127 /**
0128  * takes a richtext string and heuristically adds links for uris of common protocols
0129  * @return a richtext string with link markup added
0130  */
0131 QString HtmlUtils::linkify(const QString &in)
0132 {
0133     QString out = in;
0134     int x1, x2;
0135     bool isUrl, isAtStyle;
0136     QString linked, link, href;
0137 
0138     for(int n = 0; n < (int)out.length(); ++n) {
0139         isUrl = false;
0140         isAtStyle = false;
0141         x1 = n;
0142 
0143         if(linkify_pmatch(out, n, "xmpp:")) {
0144             n += 5;
0145             isUrl = true;
0146             href = "";
0147         }
0148         else if(linkify_pmatch(out, n, "mailto:")) {
0149             n += 7;
0150             isUrl = true;
0151             href = "";
0152         }
0153         else if(linkify_pmatch(out, n, "http://")) {
0154             n += 7;
0155             isUrl = true;
0156             href = "";
0157         }
0158         else if(linkify_pmatch(out, n, "https://")) {
0159             n += 8;
0160             isUrl = true;
0161             href = "";
0162         }
0163         else if(linkify_pmatch(out, n, "ftp://")) {
0164             n += 6;
0165             isUrl = true;
0166             href = "";
0167         }
0168         else if(linkify_pmatch(out, n, "news://")) {
0169             n += 7;
0170             isUrl = true;
0171             href = "";
0172         }
0173         else if (linkify_pmatch(out, n, "ed2k://")) {
0174             n += 7;
0175             isUrl = true;
0176             href = "";
0177         }
0178         else if (linkify_pmatch(out, n, "magnet:")) {
0179             n += 7;
0180             isUrl = true;
0181             href = "";
0182         }
0183         else if(linkify_pmatch(out, n, "www.")) {
0184             isUrl = true;
0185             href = "http://";
0186         }
0187         else if(linkify_pmatch(out, n, "ftp.")) {
0188             isUrl = true;
0189             href = "ftp://";
0190         }
0191         else if(linkify_pmatch(out, n, "@")) {
0192             isAtStyle = true;
0193             href = "x-psi-atstyle:";
0194         }
0195 
0196         if(isUrl) {
0197             // make sure the previous char is not alphanumeric
0198             if(x1 > 0 && out.at(x1-1).isLetterOrNumber())
0199                 continue;
0200 
0201             // find whitespace (or end)
0202             QMap<QChar, int> brackets;
0203             brackets['('] = brackets[')'] = brackets['['] = brackets[']'] = brackets['{'] = brackets['}'] = 0;
0204             QMap<QChar, QChar> openingBracket;
0205             openingBracket[')'] = '(';
0206             openingBracket[']'] = '[';
0207             openingBracket['}'] = '{';
0208             for(x2 = n; x2 < (int)out.length(); ++x2) {
0209                 if(out.at(x2).isSpace() || linkify_isOneOf(out.at(x2), "\"\'`<>")
0210                     || linkify_pmatch(out, x2, "&quot;")  || linkify_pmatch(out, x2, "&apos;")
0211                     || linkify_pmatch(out, x2, "&gt;") || linkify_pmatch(out, x2, "&lt;") ) {
0212                     break;
0213                 }
0214                 if(brackets.keys().contains(out.at(x2))) {
0215                     ++brackets[out.at(x2)];
0216                 }
0217             }
0218             int len = x2-x1;
0219             QString pre = resolveEntities(out.mid(x1, x2-x1));
0220 
0221             // go backward hacking off unwanted punctuation
0222             int cutoff;
0223             for(cutoff = pre.length()-1; cutoff >= 0; --cutoff) {
0224                 if(!linkify_isOneOf(pre.at(cutoff), "!?,.()[]{}<>\""))
0225                     break;
0226                 if(linkify_isOneOf(pre.at(cutoff), ")]}")
0227                     && brackets[pre.at(cutoff)] - brackets[openingBracket[pre.at(cutoff)]] <= 0 ) {
0228                     break;  // in theory, there could be == above, but these are urls, not math ;)
0229                 }
0230                 if(brackets.keys().contains(pre.at(cutoff))) {
0231                     --brackets[pre.at(cutoff)];
0232                 }
0233 
0234             }
0235             ++cutoff;
0236             //++x2;
0237 
0238             link = pre.mid(0, cutoff);
0239             if(!linkify_okUrl(link)) {
0240                 n = x1 + link.length();
0241                 continue;
0242             }
0243             href += link;
0244             // attributes need to be encoded too.
0245             href = href.toHtmlEscaped();
0246             href = linkify_htmlsafe(href);
0247             //printf("link: [%s], href=[%s]\n", link.latin1(), href.latin1());
0248             linked = QString("<a href=\"%1\">").arg(href) + QUrl{link}.toDisplayString(QUrl::RemoveQuery) + "</a>" + pre.mid(cutoff).toHtmlEscaped();
0249             out.replace(x1, len, linked);
0250             n = x1 + linked.length() - 1;
0251         } else if(isAtStyle) {
0252             // go backward till we find the beginning
0253             if(x1 == 0)
0254                 continue;
0255             --x1;
0256             for(; x1 >= 0; --x1) {
0257                 if(!linkify_isOneOf(out.at(x1), "_.-+") && !out.at(x1).isLetterOrNumber())
0258                     break;
0259             }
0260             ++x1;
0261 
0262             // go forward till we find the end
0263             x2 = n + 1;
0264             for(; x2 < (int)out.length(); ++x2) {
0265                 if(!linkify_isOneOf(out.at(x2), "_.-+") && !out.at(x2).isLetterOrNumber())
0266                     break;
0267             }
0268 
0269             int len = x2-x1;
0270             link = out.mid(x1, len);
0271             //link = resolveEntities(link);
0272 
0273             if(!linkify_okEmail(link)) {
0274                 n = x1 + link.length();
0275                 continue;
0276             }
0277 
0278             href += link;
0279             //printf("link: [%s], href=[%s]\n", link.latin1(), href.latin1());
0280             linked = QString("<a href=\"%1\">").arg(href) + link + "</a>";
0281             out.replace(x1, len, linked);
0282             n = x1 + linked.length() - 1;
0283         }
0284     }
0285 
0286     return out;
0287 }