File indexing completed on 2024-10-06 07:36:10

0001 // SPDX-FileCopyrightText: 2023 James Graham <james.h.graham@protonmail.com>
0002 // SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
0003 
0004 #include "texthandler.h"
0005 
0006 #include <QDebug>
0007 #include <QGuiApplication>
0008 #include <QStringLiteral>
0009 #include <QUrl>
0010 
0011 #include <Quotient/events/roommessageevent.h>
0012 #include <Quotient/util.h>
0013 #include <qstringliteral.h>
0014 
0015 #include <cmark.h>
0016 
0017 #include <Kirigami/Platform/PlatformTheme>
0018 
0019 #include "models/customemojimodel.h"
0020 #include "utils.h"
0021 
// Tag names that isAllowedTag() accepts; any other tag is dropped by the
// handle*() functions when they sanitise HTML.
static const QStringList allowedTags = {
    QStringLiteral("font"),    QStringLiteral("del"),    QStringLiteral("h1"),         QStringLiteral("h2"),     QStringLiteral("h3"),    QStringLiteral("h4"),
    QStringLiteral("h5"),      QStringLiteral("h6"),     QStringLiteral("blockquote"), QStringLiteral("p"),      QStringLiteral("a"),     QStringLiteral("ul"),
    QStringLiteral("ol"),      QStringLiteral("sup"),    QStringLiteral("sub"),        QStringLiteral("li"),     QStringLiteral("b"),     QStringLiteral("i"),
    QStringLiteral("u"),       QStringLiteral("strong"), QStringLiteral("em"),         QStringLiteral("strike"), QStringLiteral("code"),  QStringLiteral("hr"),
    QStringLiteral("br"),      QStringLiteral("div"),    QStringLiteral("table"),      QStringLiteral("thead"),  QStringLiteral("tbody"), QStringLiteral("tr"),
    QStringLiteral("th"),      QStringLiteral("td"),     QStringLiteral("caption"),    QStringLiteral("pre"),    QStringLiteral("span"),  QStringLiteral("img"),
    QStringLiteral("details"), QStringLiteral("summary")};
// Per-tag attribute names that isAllowedAttribute() accepts. A tag with no
// entry here has no allowed attributes.
static const QHash<QString, QStringList> allowedAttributes = {
    {QStringLiteral("font"), {QStringLiteral("data-mx-bg-color"), QStringLiteral("data-mx-color"), QStringLiteral("color")}},
    {QStringLiteral("span"), {QStringLiteral("data-mx-bg-color"), QStringLiteral("data-mx-color"), QStringLiteral("data-mx-spoiler")}},
    {QStringLiteral("a"), {QStringLiteral("name"), QStringLiteral("target"), QStringLiteral("href")}},
    {QStringLiteral("img"), {QStringLiteral("width"), QStringLiteral("height"), QStringLiteral("alt"), QStringLiteral("title"), QStringLiteral("src")}},
    {QStringLiteral("ol"), {QStringLiteral("start")}},
    {QStringLiteral("code"), {QStringLiteral("class")}}};
// URL schemes that isAllowedLink() accepts for non-image links (image sources
// are checked against "mxc" separately).
static const QStringList allowedLinkSchemes = {QStringLiteral("https"),
                                               QStringLiteral("http"),
                                               QStringLiteral("ftp"),
                                               QStringLiteral("mailto"),
                                               QStringLiteral("magnet")};
0042 
0043 QString TextHandler::data() const
0044 {
0045     return m_data;
0046 }
0047 
0048 void TextHandler::setData(const QString &string)
0049 {
0050     m_data = string;
0051     m_pos = 0;
0052 }
0053 
0054 QString TextHandler::handleSendText()
0055 {
0056     m_pos = 0;
0057     m_dataBuffer = markdownToHTML(m_data);
0058 
0059     nextTokenType();
0060 
0061     // Strip any disallowed tags/attributes.
0062     QString outputString;
0063     while (m_pos < m_dataBuffer.length()) {
0064         next();
0065 
0066         QString nextTokenBuffer = m_nextToken;
0067         switch (m_nextTokenType) {
0068         case Text:
0069             nextTokenBuffer = escapeHtml(nextTokenBuffer);
0070             nextTokenBuffer = CustomEmojiModel::instance().preprocessText(nextTokenBuffer);
0071             break;
0072         case TextCode:
0073             nextTokenBuffer = escapeHtml(nextTokenBuffer);
0074             break;
0075         case Tag:
0076             if (!isAllowedTag(getTagType())) {
0077                 nextTokenBuffer = QString();
0078             }
0079             nextTokenBuffer = cleanAttributes(getTagType(), nextTokenBuffer);
0080         default:
0081             break;
0082         }
0083 
0084         outputString.append(nextTokenBuffer);
0085 
0086         nextTokenType();
0087     }
0088     return outputString;
0089 }
0090 
0091 QString TextHandler::handleRecieveRichText(Qt::TextFormat inputFormat, const NeoChatRoom *room, const Quotient::RoomEvent *event, bool stripNewlines)
0092 {
0093     m_pos = 0;
0094     m_dataBuffer = m_data;
0095 
0096     // Strip mx-reply if present.
0097     m_dataBuffer.remove(TextRegex::removeRichReply);
0098 
0099     // For plain text, convert links, escape html and convert line brakes.
0100     if (inputFormat == Qt::PlainText) {
0101         m_dataBuffer = escapeHtml(m_dataBuffer);
0102         m_dataBuffer.replace(u'\n', QStringLiteral("<br>"));
0103     }
0104 
0105     // Linkify any plain text urls
0106     m_dataBuffer = linkifyUrls(m_dataBuffer);
0107 
0108     // Apply user style
0109     m_dataBuffer.replace(TextRegex::userPill, QStringLiteral(R"(<b>\1</b>)"));
0110 
0111     // Make all media URLs resolvable.
0112     if (room && event) {
0113         QRegularExpressionMatchIterator i = TextRegex::mxcImage.globalMatch(m_dataBuffer);
0114         while (i.hasNext()) {
0115             const QRegularExpressionMatch match = i.next();
0116             const QUrl mediaUrl = room->makeMediaUrl(event->id(), QUrl(QStringLiteral("mxc://") + match.captured(2) + u'/' + match.captured(3)));
0117             m_dataBuffer.replace(match.captured(0),
0118                                  QStringLiteral("<img ") + match.captured(1) + QStringLiteral("src=\"") + mediaUrl.toString() + u'"' + match.captured(4)
0119                                      + u'>');
0120         }
0121     }
0122 
0123     // Strip any disallowed tags/attributes.
0124     QString outputString;
0125     nextTokenType();
0126     while (m_pos < m_dataBuffer.length()) {
0127         next();
0128 
0129         QString nextTokenBuffer = m_nextToken;
0130         if (m_nextTokenType == Type::Text || m_nextTokenType == Type::TextCode) {
0131             nextTokenBuffer = escapeHtml(nextTokenBuffer);
0132         } else if (m_nextTokenType == Type::Tag) {
0133             if (!isAllowedTag(getTagType())) {
0134                 nextTokenBuffer = QString();
0135             } else if ((getTagType() == QStringLiteral("br") && stripNewlines)) {
0136                 nextTokenBuffer = u' ';
0137             }
0138             nextTokenBuffer = cleanAttributes(getTagType(), nextTokenBuffer);
0139         }
0140 
0141         outputString.append(nextTokenBuffer);
0142 
0143         nextTokenType();
0144     }
0145 
0146     // Apply user style to blockquotes
0147     // Unfortunately some attributes can be only be used on table cells, so we need to wrap the content in one.
0148     outputString.replace(TextRegex::blockQuote, QStringLiteral(R"(<blockquote><table><tr><td>“\1”</td></tr></table></blockquote>)"));
0149 
0150     // If the message is an emote add the user pill to the front of the message.
0151     if (event != nullptr) {
0152         auto e = eventCast<const Quotient::RoomMessageEvent>(event);
0153         if (e->msgtype() == Quotient::MessageEventType::Emote) {
0154             auto author = room->user(e->senderId());
0155             QString emoteString = QStringLiteral("* <a href=\"https://matrix.to/#/") + e->senderId() + QStringLiteral("\" style=\"color:")
0156                 + Utils::getUserColor(author->hueF()).name() + QStringLiteral("\">") + author->displayname(room) + QStringLiteral("</a> ");
0157             if (outputString.startsWith(QStringLiteral("<p>"))) {
0158                 outputString.insert(3, emoteString);
0159             } else {
0160                 outputString.prepend(emoteString);
0161             }
0162         }
0163     }
0164 
0165     if (auto e = eventCast<const Quotient::RoomMessageEvent>(event)) {
0166         bool isEdited = !e->unsignedJson().isEmpty() && e->unsignedJson().contains(QStringLiteral("m.relations"))
0167             && e->unsignedJson()[QStringLiteral("m.relations")].toObject().contains(QStringLiteral("m.replace"));
0168         if (isEdited) {
0169             Kirigami::Platform::PlatformTheme *theme =
0170                 static_cast<Kirigami::Platform::PlatformTheme *>(qmlAttachedPropertiesObject<Kirigami::Platform::PlatformTheme>(this, true));
0171 
0172             QString editTextColor;
0173             if (theme != nullptr) {
0174                 editTextColor = theme->disabledTextColor().name();
0175             } else {
0176                 editTextColor = QStringLiteral("#000000");
0177             }
0178             QString editedString = QStringLiteral(" <span style=\"color:") + editTextColor + QStringLiteral("\">(edited)</span>");
0179             if (outputString.endsWith(QStringLiteral("</p>"))) {
0180                 outputString.insert(outputString.length() - 4, editedString);
0181             } else if (outputString.endsWith(QStringLiteral("</pre>")) || outputString.endsWith(QStringLiteral("</blockquote>"))
0182                        || outputString.endsWith(QStringLiteral("</table>")) || outputString.endsWith(QStringLiteral("</ol>"))
0183                        || outputString.endsWith(QStringLiteral("</ul>"))) {
0184                 outputString.append(QStringLiteral("<p>%1</p>").arg(editedString));
0185             } else {
0186                 outputString.append(editedString);
0187             }
0188         }
0189     }
0190 
0191     /**
0192      * Replace <del> with <s>
0193      * Note: <s> is still not a valid tag for the message from the server. We
0194      * convert as that is what is needed for Qt::RichText.
0195      */
0196     outputString.replace(TextRegex::strikethrough, QStringLiteral("<s>\\1</s>"));
0197     return outputString;
0198 }
0199 
0200 QString TextHandler::handleRecievePlainText(Qt::TextFormat inputFormat, const bool &stripNewlines)
0201 {
0202     m_pos = 0;
0203     m_dataBuffer = m_data;
0204 
0205     // Strip mx-reply if present.
0206     m_dataBuffer.remove(TextRegex::removeRichReply);
0207 
0208     // Escaping then unescaping allows < and > to be maintained in a plain text string
0209     // otherwise markdownToHTML will strip what it thinks is a bad html tag entirely.
0210     if (inputFormat == Qt::PlainText) {
0211         m_dataBuffer = escapeHtml(m_dataBuffer);
0212     }
0213 
0214     /**
0215      * This seems counterproductive but by converting any markup which could
0216      * arrive (e.g. in a caption body) it can then be stripped by the same code.
0217      */
0218     m_dataBuffer = markdownToHTML(m_dataBuffer);
0219     // This is how \n is converted and for plain text we need it to just be <br>
0220     // to prevent extra newlines being inserted.
0221     m_dataBuffer.replace(QStringLiteral("<br />\n"), QStringLiteral("<br>"));
0222 
0223     if (stripNewlines) {
0224         m_dataBuffer.replace(QStringLiteral("<br>\n"), QStringLiteral(" "));
0225         m_dataBuffer.replace(QStringLiteral("<br>"), QStringLiteral(" "));
0226         m_dataBuffer.replace(QStringLiteral("<br />\n"), QStringLiteral(" "));
0227         m_dataBuffer.replace(QStringLiteral("<br />"), QStringLiteral(" "));
0228         m_dataBuffer.replace(u'\n', QStringLiteral(" "));
0229         m_dataBuffer.replace(u'\u2028', QStringLiteral(" "));
0230     }
0231 
0232     // Strip all tags/attributes except code blocks which will be escaped.
0233     QString outputString;
0234     nextTokenType();
0235     while (m_pos < m_dataBuffer.length()) {
0236         next();
0237 
0238         QString nextTokenBuffer = m_nextToken;
0239         if (m_nextTokenType == Type::TextCode) {
0240             nextTokenBuffer = unescapeHtml(nextTokenBuffer);
0241         } else if (m_nextTokenType == Type::Tag) {
0242             if (getTagType() == QStringLiteral("br") && !stripNewlines) {
0243                 nextTokenBuffer = u'\n';
0244             } else {
0245                 nextTokenBuffer = QString();
0246             }
0247         }
0248 
0249         outputString.append(nextTokenBuffer);
0250 
0251         nextTokenType();
0252     }
0253 
0254     // Escaping then unescaping allows < and > to be maintained in a plain text string
0255     // otherwise markdownToHTML will strip what it thinks is a bad html tag entirely.
0256     outputString = unescapeHtml(outputString);
0257 
0258     outputString = outputString.trimmed();
0259     return outputString;
0260 }
0261 
0262 void TextHandler::next()
0263 {
0264     QString searchStr;
0265     if (m_nextTokenType == Type::Tag) {
0266         searchStr = u'>';
0267     } else if (m_nextTokenType == Type::TextCode) {
0268         // Anything between code tags is assumed to be plain text
0269         searchStr = QStringLiteral("</code>");
0270     } else {
0271         searchStr = u'<';
0272     }
0273 
0274     int tokenEnd = m_dataBuffer.indexOf(searchStr, m_pos + 1);
0275     if (tokenEnd == -1) {
0276         tokenEnd = m_dataBuffer.length();
0277     }
0278 
0279     m_nextToken = m_dataBuffer.mid(m_pos, tokenEnd - m_pos + (m_nextTokenType == Type::Tag ? 1 : 0));
0280     m_pos = tokenEnd + (m_nextTokenType == Type::Tag ? 1 : 0);
0281 }
0282 
0283 void TextHandler::nextTokenType()
0284 {
0285     if (m_pos >= m_dataBuffer.length()) {
0286         // This is to stop the function accessing an index outside the length of
0287         // m_dataBuffer during the final loop.
0288         m_nextTokenType = Type::End;
0289     } else if (m_nextTokenType == Type::Tag && getTagType() == QStringLiteral("code") && !isCloseTag()
0290                && m_dataBuffer.indexOf(QStringLiteral("</code>"), m_pos) != m_pos) {
0291         m_nextTokenType = Type::TextCode;
0292     } else if (m_dataBuffer[m_pos] == u'<' && m_dataBuffer[m_pos + 1] != u' ') {
0293         m_nextTokenType = Type::Tag;
0294     } else {
0295         m_nextTokenType = Type::Text;
0296     }
0297 }
0298 
0299 QString TextHandler::getTagType() const
0300 {
0301     if (m_nextToken.isEmpty()) {
0302         return QString();
0303     }
0304     const int tagTypeStart = m_nextToken[1] == u'/' ? 2 : 1;
0305     const int tagTypeEnd = m_nextToken.indexOf(TextRegex::endTagType, tagTypeStart);
0306     return m_nextToken.mid(tagTypeStart, tagTypeEnd - tagTypeStart);
0307 }
0308 
0309 bool TextHandler::isCloseTag() const
0310 {
0311     if (m_nextToken.isEmpty()) {
0312         return false;
0313     }
0314     return m_nextToken[1] == u'/';
0315 }
0316 
0317 QString TextHandler::getAttributeType(const QString &string)
0318 {
0319     if (!string.contains(u'=')) {
0320         return string;
0321     }
0322     const int equalsPos = string.indexOf(u'=');
0323     return string.left(equalsPos);
0324 }
0325 
0326 QString TextHandler::getAttributeData(const QString &string)
0327 {
0328     if (!string.contains(u'=')) {
0329         return QStringLiteral();
0330     }
0331     const int equalsPos = string.indexOf(u'=');
0332     return string.right(string.length() - equalsPos - 1);
0333 }
0334 
0335 bool TextHandler::isAllowedTag(const QString &type)
0336 {
0337     return allowedTags.contains(type);
0338 }
0339 
0340 bool TextHandler::isAllowedAttribute(const QString &tag, const QString &attribute)
0341 {
0342     return allowedAttributes[tag].contains(attribute);
0343 }
0344 
0345 bool TextHandler::isAllowedLink(const QString &link, bool isImg)
0346 {
0347     const QUrl linkUrl = QUrl(link);
0348 
0349     if (isImg) {
0350         return !linkUrl.isRelative() && linkUrl.scheme() == QStringLiteral("mxc");
0351     } else {
0352         return !linkUrl.isRelative() && allowedLinkSchemes.contains(linkUrl.scheme());
0353     }
0354 }
0355 
0356 QString TextHandler::cleanAttributes(const QString &tag, const QString &tagString)
0357 {
0358     int nextAttributeIndex = tagString.indexOf(u' ', 1);
0359 
0360     if (nextAttributeIndex != -1) {
0361         QString outputString = tagString.left(nextAttributeIndex);
0362         QString nextAttribute;
0363         int nextSpaceIndex;
0364         nextAttributeIndex += 1;
0365 
0366         while (nextAttributeIndex < tagString.length()) {
0367             nextSpaceIndex = tagString.indexOf(TextRegex::endTagType, nextAttributeIndex);
0368             if (nextSpaceIndex == -1) {
0369                 nextSpaceIndex = tagString.length();
0370             }
0371             nextAttribute = tagString.mid(nextAttributeIndex, nextSpaceIndex - nextAttributeIndex);
0372 
0373             if (isAllowedAttribute(tag, getAttributeType(nextAttribute))) {
0374                 if (tag == QStringLiteral("img") && getAttributeType(nextAttribute) == QStringLiteral("src")) {
0375                     QString attributeData = TextRegex::attributeData.match(getAttributeData(nextAttribute)).captured(1);
0376                     if (isAllowedLink(attributeData, true)) {
0377                         outputString.append(u' ' + nextAttribute);
0378                     }
0379                 } else if (tag == u'a' && getAttributeType(nextAttribute) == QStringLiteral("href")) {
0380                     QString attributeData = TextRegex::attributeData.match(getAttributeData(nextAttribute)).captured(1);
0381                     if (isAllowedLink(attributeData)) {
0382                         outputString.append(u' ' + nextAttribute);
0383                     }
0384                 } else if (tag == QStringLiteral("code") && getAttributeType(nextAttribute) == QStringLiteral("class")) {
0385                     if (getAttributeData(nextAttribute).remove(u'"').startsWith(QStringLiteral("language-"))) {
0386                         outputString.append(u' ' + nextAttribute);
0387                     }
0388                 } else {
0389                     outputString.append(u' ' + nextAttribute);
0390                 }
0391             }
0392             nextAttributeIndex = nextSpaceIndex + 1;
0393         }
0394 
0395         outputString += u'>';
0396         return outputString;
0397     }
0398 
0399     return tagString;
0400 }
0401 
0402 QString TextHandler::markdownToHTML(const QString &markdown)
0403 {
0404     const auto str = markdown.toUtf8();
0405     char *tmp_buf = cmark_markdown_to_html(str.constData(), str.size(), CMARK_OPT_HARDBREAKS | CMARK_OPT_UNSAFE);
0406 
0407     const std::string html(tmp_buf);
0408 
0409     free(tmp_buf);
0410 
0411     auto result = QString::fromStdString(html).trimmed();
0412 
0413     result.replace(QStringLiteral("<!-- raw HTML omitted -->"), QString());
0414 
0415     return result;
0416 }
0417 
0418 /**
0419  * TODO: make this more intelligent currently other characters are not escaped
0420  * especially & as this can conflict with the cmark markdown to html conversion
0421  * which already escapes characters in code blocks. The < > still need to be handled
0422  * when the user manually types in the html.
0423  */
0424 QString TextHandler::escapeHtml(QString stringIn)
0425 {
0426     stringIn.replace(u'<', QStringLiteral("&lt;"));
0427     stringIn.replace(u'>', QStringLiteral("&gt;"));
0428     return stringIn;
0429 }
0430 
0431 QString TextHandler::unescapeHtml(QString stringIn)
0432 {
0433     // For those situations where brackets in code block get double escaped
0434     stringIn.replace(QStringLiteral("&amp;lt;"), QStringLiteral("<"));
0435     stringIn.replace(QStringLiteral("&amp;gt;"), QStringLiteral(">"));
0436     stringIn.replace(QStringLiteral("&lt;"), QStringLiteral("<"));
0437     stringIn.replace(QStringLiteral("&gt;"), QStringLiteral(">"));
0438     stringIn.replace(QStringLiteral("&amp;"), QStringLiteral("&"));
0439     stringIn.replace(QStringLiteral("&quot;"), QStringLiteral("\""));
0440     return stringIn;
0441 }
0442 
0443 QString TextHandler::linkifyUrls(QString stringIn)
0444 {
0445     QRegularExpressionMatch match;
0446     int start = 0;
0447     for (int index = 0; index != -1; index = stringIn.indexOf(TextRegex::mxId, start, &match)) {
0448         int skip = 0;
0449         if (match.captured(0).size() > 0) {
0450             if (stringIn.left(index).count(QStringLiteral("<code>")) == stringIn.left(index).count(QStringLiteral("</code>"))) {
0451                 auto replacement = QStringLiteral("<a href=\"https://matrix.to/#/%1\">%1</a>").arg(match.captured(2));
0452                 stringIn = stringIn.replace(index, match.captured(0).size(), replacement);
0453             } else {
0454                 skip = match.captured().length();
0455             }
0456         }
0457         start = index + skip;
0458         match = {};
0459     }
0460     start = 0;
0461     match = {};
0462     for (int index = 0; index != -1; index = stringIn.indexOf(TextRegex::plainUrl, start, &match)) {
0463         int skip = 0;
0464         if (match.captured(0).size() > 0) {
0465             if (stringIn.left(index).count(QStringLiteral("<code>")) == stringIn.left(index).count(QStringLiteral("</code>"))) {
0466                 auto replacement = QStringLiteral("<a href=\"%1\">%1</a>").arg(match.captured(1));
0467                 stringIn = stringIn.replace(index, match.captured(0).size(), replacement);
0468                 skip = replacement.length();
0469             } else {
0470                 skip = match.captured().length();
0471             }
0472         }
0473         start = index + skip;
0474         match = {};
0475     }
0476     start = 0;
0477     match = {};
0478     for (int index = 0; index != -1; index = stringIn.indexOf(TextRegex::emailAddress, start, &match)) {
0479         int skip = 0;
0480         if (match.captured(0).size() > 0) {
0481             if (stringIn.left(index).count(QStringLiteral("<code>")) == stringIn.left(index).count(QStringLiteral("</code>"))) {
0482                 auto replacement = QStringLiteral("<a href=\"mailto:%1\">%1</a>").arg(match.captured(2));
0483                 stringIn = stringIn.replace(index, match.captured(0).size(), replacement);
0484                 skip = replacement.length();
0485             } else {
0486                 skip = match.captured().length();
0487             }
0488         }
0489         start = index + skip;
0490         match = {};
0491     }
0492 
0493     return stringIn;
0494 }
0495 
0496 #include "moc_texthandler.cpp"