File indexing completed on 2024-10-06 10:23:57

0001 // SPDX-FileCopyrightText: 2023 James Graham <james.h.graham@protonmail.com>
0002 // SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
0003 
0004 #include "texthandler.h"
0005 
0006 #include <QDebug>
0007 #include <QUrl>
0008 #include <QStringLiteral>
0009 
0010 #include <Quotient/events/roommessageevent.h>
0011 #include <Quotient/util.h>
0012 #include <qstringliteral.h>
0013 
0014 #include <cmark.h>
0015 
0016 #include <Kirigami/PlatformTheme>
0017 
0018 static const QStringList allowedTags = {
0019     QStringLiteral("font"),    QStringLiteral("del"),    QStringLiteral("h1"),         QStringLiteral("h2"),     QStringLiteral("h3"),    QStringLiteral("h4"),
0020     QStringLiteral("h5"),      QStringLiteral("h6"),     QStringLiteral("blockquote"), QStringLiteral("p"),      QStringLiteral("a"),     QStringLiteral("ul"),
0021     QStringLiteral("ol"),      QStringLiteral("sup"),    QStringLiteral("sub"),        QStringLiteral("li"),     QStringLiteral("b"),     QStringLiteral("i"),
0022     QStringLiteral("u"),       QStringLiteral("strong"), QStringLiteral("em"),         QStringLiteral("strike"), QStringLiteral("code"),  QStringLiteral("hr"),
0023     QStringLiteral("br"),      QStringLiteral("div"),    QStringLiteral("table"),      QStringLiteral("thead"),  QStringLiteral("tbody"), QStringLiteral("tr"),
0024     QStringLiteral("th"),      QStringLiteral("td"),     QStringLiteral("caption"),    QStringLiteral("pre"),    QStringLiteral("span"),  QStringLiteral("img"),
0025     QStringLiteral("details"), QStringLiteral("summary")};
0026 static const QHash<QString, QStringList> allowedAttributes = {
0027     {QStringLiteral("font"), {QStringLiteral("data-mx-bg-color"), QStringLiteral("data-mx-color"), QStringLiteral("color")}},
0028     {QStringLiteral("span"), {QStringLiteral("data-mx-bg-color"), QStringLiteral("data-mx-color"), QStringLiteral("data-mx-spoiler")}},
0029     {QStringLiteral("a"), {QStringLiteral("name"), QStringLiteral("target"), QStringLiteral("href")}},
0030     {QStringLiteral("img"), {QStringLiteral("width"), QStringLiteral("height"), QStringLiteral("alt"), QStringLiteral("title"), QStringLiteral("src")}},
0031     {QStringLiteral("ol"), {QStringLiteral("start")}},
0032     {QStringLiteral("code"), {QStringLiteral("class")}}};
0033 static const QStringList allowedLinkSchemes = {QStringLiteral("https"),
0034                                                QStringLiteral("http"),
0035                                                QStringLiteral("ftp"),
0036                                                QStringLiteral("mailto"),
0037                                                QStringLiteral("magnet")};
0038 
0039 QString TextHandler::data() const
0040 {
0041     return m_data;
0042 }
0043 
0044 void TextHandler::setData(const QString &string)
0045 {
0046     m_data = string;
0047     m_pos = 0;
0048 }
0049 
0050 QString TextHandler::handleSendText()
0051 {
0052     m_pos = 0;
0053     m_dataBuffer = markdownToHTML(m_data);
0054 
0055     nextTokenType();
0056 
0057     // Strip any disallowed tags/attributes.
0058     QString outputString;
0059     while (m_pos < m_dataBuffer.length()) {
0060         next();
0061 
0062         QString nextTokenBuffer = m_nextToken;
0063         if (m_nextTokenType == Type::Text || m_nextTokenType == Type::TextCode) {
0064             nextTokenBuffer = escapeHtml(nextTokenBuffer);
0065         } else if (m_nextTokenType == Type::Tag) {
0066             if (!isAllowedTag(getTagType())) {
0067                 nextTokenBuffer = QString();
0068             }
0069             nextTokenBuffer = cleanAttributes(getTagType(), nextTokenBuffer);
0070         }
0071 
0072         outputString.append(nextTokenBuffer);
0073 
0074         nextTokenType();
0075     }
0076     return outputString;
0077 }
0078 
0079 QString TextHandler::handleRecieveRichText(Qt::TextFormat inputFormat, const NeoChatRoom *room, const Quotient::RoomEvent *event, bool stripNewlines)
0080 {
0081     m_pos = 0;
0082     m_dataBuffer = m_data;
0083 
0084     // Strip mx-reply if present.
0085     m_dataBuffer.remove(TextRegex::removeRichReply);
0086 
0087     // For plain text, convert links, escape html and convert line brakes.
0088     if (inputFormat == Qt::PlainText) {
0089         m_dataBuffer = escapeHtml(m_dataBuffer);
0090         m_dataBuffer.replace(u'\n', QStringLiteral("<br>"));
0091     }
0092 
0093     // Linkify any plain text urls
0094     m_dataBuffer = linkifyUrls(m_dataBuffer);
0095 
0096     // Apply user style
0097     m_dataBuffer.replace(TextRegex::userPill, QStringLiteral(R"(<b>\1</b>)"));
0098 
0099     // Make all media URLs resolvable.
0100     if (room && event) {
0101         QRegularExpressionMatchIterator i = TextRegex::mxcImage.globalMatch(m_dataBuffer);
0102         while (i.hasNext()) {
0103             const QRegularExpressionMatch match = i.next();
0104             const QUrl mediaUrl = room->makeMediaUrl(event->id(), QUrl(QStringLiteral("mxc://") + match.captured(2) + u'/' + match.captured(3)));
0105             m_dataBuffer.replace(match.captured(0),
0106                                  QStringLiteral("<img ") + match.captured(1) + QStringLiteral("src=\"") + mediaUrl.toString() + u'"' + match.captured(4)
0107                                      + u'>');
0108         }
0109     }
0110 
0111     // Strip any disallowed tags/attributes.
0112     QString outputString;
0113     nextTokenType();
0114     while (m_pos < m_dataBuffer.length()) {
0115         next();
0116 
0117         QString nextTokenBuffer = m_nextToken;
0118         if (m_nextTokenType == Type::Text || m_nextTokenType == Type::TextCode) {
0119             nextTokenBuffer = escapeHtml(nextTokenBuffer);
0120         } else if (m_nextTokenType == Type::Tag) {
0121             if (!isAllowedTag(getTagType())) {
0122                 nextTokenBuffer = QString();
0123             } else if ((getTagType() == QStringLiteral("br") && stripNewlines)) {
0124                 nextTokenBuffer = u' ';
0125             }
0126             nextTokenBuffer = cleanAttributes(getTagType(), nextTokenBuffer);
0127         }
0128 
0129         outputString.append(nextTokenBuffer);
0130 
0131         nextTokenType();
0132     }
0133 
0134     // If the message is an emote add the user pill to the front of the message.
0135     if (event != nullptr) {
0136         auto e = eventCast<const Quotient::RoomMessageEvent>(event);
0137         if (e->msgtype() == Quotient::MessageEventType::Emote) {
0138             auto author = static_cast<NeoChatUser *>(room->user(e->senderId()));
0139             QString emoteString = QStringLiteral("* <a href=\"https://matrix.to/#/") + e->senderId() + QStringLiteral("\" style=\"color:")
0140                 + author->color().name() + QStringLiteral("\">") + author->displayname(room) + QStringLiteral("</a> ");
0141             if (outputString.startsWith(QStringLiteral("<p>"))) {
0142                 outputString.insert(3, emoteString);
0143             } else {
0144                 outputString.prepend(emoteString);
0145             }
0146         }
0147     }
0148 
0149     if (auto e = eventCast<const Quotient::RoomMessageEvent>(event)) {
0150         bool isEdited =
0151             !e->unsignedJson().isEmpty() && e->unsignedJson().contains("m.relations") && e->unsignedJson()["m.relations"].toObject().contains("m.replace");
0152         if (isEdited) {
0153             Kirigami::PlatformTheme *theme = static_cast<Kirigami::PlatformTheme *>(qmlAttachedPropertiesObject<Kirigami::PlatformTheme>(this, true));
0154 
0155             QString editTextColor;
0156             if (theme != nullptr) {
0157                 editTextColor = theme->disabledTextColor().name();
0158             } else {
0159                 editTextColor = QStringLiteral("#000000");
0160             }
0161             QString editedString = QStringLiteral(" <span style=\"color:") + editTextColor + QStringLiteral("\">(edited)</span>");
0162             if (outputString.endsWith(QStringLiteral("</p>"))) {
0163                 outputString.insert(outputString.length() - 4, editedString);
0164             } else if (outputString.endsWith(QStringLiteral("</pre>")) || outputString.endsWith(QStringLiteral("</blockquote>"))
0165                        || outputString.endsWith(QStringLiteral("</table>")) || outputString.endsWith(QStringLiteral("</ol>"))
0166                        || outputString.endsWith(QStringLiteral("</ul>"))) {
0167                 outputString.append("<p>" + editedString + "</p>");
0168             } else {
0169                 outputString.append(editedString);
0170             }
0171         }
0172     }
0173 
0174     /**
0175      * Replace <del> with <s>
0176      * Note: <s> is still not a valid tag for the message from the server. We
0177      * convert as that is what is needed for Qt::RichText.
0178      */
0179     outputString.replace(TextRegex::strikethrough, QStringLiteral("<s>\\1</s>"));
0180     return outputString;
0181 }
0182 
0183 QString TextHandler::handleRecievePlainText(Qt::TextFormat inputFormat, const bool &stripNewlines)
0184 {
0185     m_pos = 0;
0186     m_dataBuffer = m_data;
0187 
0188     // Strip mx-reply if present.
0189     m_dataBuffer.remove(TextRegex::removeRichReply);
0190 
0191     // Escaping then unescaping allows < and > to be maintained in a plain text string
0192     // otherwise markdownToHTML will strip what it thinks is a bad html tag entirely.
0193     if (inputFormat == Qt::PlainText) {
0194         m_dataBuffer = escapeHtml(m_dataBuffer);
0195     }
0196 
0197     /**
0198      * This seems counterproductive but by converting any markup which could
0199      * arrive (e.g. in a caption body) it can then be stripped by the same code.
0200      */
0201     m_dataBuffer = markdownToHTML(m_dataBuffer);
0202 
0203     if (stripNewlines) {
0204         m_dataBuffer.replace(QStringLiteral("<br>\n"), QStringLiteral(" "));
0205         m_dataBuffer.replace(QStringLiteral("<br>"), QStringLiteral(" "));
0206         m_dataBuffer.replace(QStringLiteral("<br />\n"), QStringLiteral(" "));
0207         m_dataBuffer.replace(QStringLiteral("<br />"), QStringLiteral(" "));
0208         m_dataBuffer.replace(u'\n', QStringLiteral(" "));
0209         m_dataBuffer.replace(u'\u2028', " ");
0210     }
0211 
0212     // Strip all tags/attributes except code blocks which will be escaped.
0213     QString outputString;
0214     nextTokenType();
0215     while (m_pos < m_dataBuffer.length()) {
0216         next();
0217 
0218         QString nextTokenBuffer = m_nextToken;
0219         if (m_nextTokenType == Type::TextCode) {
0220             nextTokenBuffer = unescapeHtml(nextTokenBuffer);
0221         } else if (m_nextTokenType == Type::Tag) {
0222             nextTokenBuffer = QString();
0223         }
0224 
0225         outputString.append(nextTokenBuffer);
0226 
0227         nextTokenType();
0228     }
0229 
0230     // Escaping then unescaping allows < and > to be maintained in a plain text string
0231     // otherwise markdownToHTML will strip what it thinks is a bad html tag entirely.
0232     outputString = unescapeHtml(outputString);
0233 
0234     outputString = outputString.trimmed();
0235     return outputString;
0236 }
0237 
0238 void TextHandler::next()
0239 {
0240     QString searchStr;
0241     if (m_nextTokenType == Type::Tag) {
0242         searchStr = u'>';
0243     } else if (m_nextTokenType == Type::TextCode) {
0244         // Anything between code tags is assumed to be plain text
0245         searchStr = QStringLiteral("</code>");
0246     } else {
0247         searchStr = u'<';
0248     }
0249 
0250     int tokenEnd = m_dataBuffer.indexOf(searchStr, m_pos + 1);
0251     if (tokenEnd == -1) {
0252         tokenEnd = m_dataBuffer.length();
0253     }
0254 
0255     m_nextToken = m_dataBuffer.mid(m_pos, tokenEnd - m_pos + (m_nextTokenType == Type::Tag ? 1 : 0));
0256     m_pos = tokenEnd + (m_nextTokenType == Type::Tag ? 1 : 0);
0257 }
0258 
0259 void TextHandler::nextTokenType()
0260 {
0261     if (m_pos >= m_dataBuffer.length()) {
0262         // This is to stop the function accessing an index outside the length of
0263         // m_dataBuffer during the final loop.
0264         m_nextTokenType = Type::End;
0265     } else if (m_nextTokenType == Type::Tag && getTagType() == QStringLiteral("code") && !isCloseTag()
0266                && m_dataBuffer.indexOf(QStringLiteral("</code>"), m_pos) != m_pos) {
0267         m_nextTokenType = Type::TextCode;
0268     } else if (m_dataBuffer[m_pos] == u'<' && m_dataBuffer[m_pos + 1] != u' ') {
0269         m_nextTokenType = Type::Tag;
0270     } else {
0271         m_nextTokenType = Type::Text;
0272     }
0273 }
0274 
0275 QString TextHandler::getTagType() const
0276 {
0277     if (m_nextToken.isEmpty()) {
0278         return QString();
0279     }
0280     const int tagTypeStart = m_nextToken[1] == u'/' ? 2 : 1;
0281     const int tagTypeEnd = m_nextToken.indexOf(TextRegex::endTagType, tagTypeStart);
0282     return m_nextToken.mid(tagTypeStart, tagTypeEnd - tagTypeStart);
0283 }
0284 
0285 bool TextHandler::isCloseTag() const
0286 {
0287     if (m_nextToken.isEmpty()) {
0288         return false;
0289     }
0290     return m_nextToken[1] == u'/';
0291 }
0292 
0293 QString TextHandler::getAttributeType(const QString &string)
0294 {
0295     if (!string.contains(u'=')) {
0296         return string;
0297     }
0298     const int equalsPos = string.indexOf(u'=');
0299     return string.left(equalsPos);
0300 }
0301 
0302 QString TextHandler::getAttributeData(const QString &string)
0303 {
0304     if (!string.contains(u'=')) {
0305         return QStringLiteral();
0306     }
0307     const int equalsPos = string.indexOf(u'=');
0308     return string.right(string.length() - equalsPos - 1);
0309 }
0310 
0311 bool TextHandler::isAllowedTag(const QString &type)
0312 {
0313     return allowedTags.contains(type);
0314 }
0315 
0316 bool TextHandler::isAllowedAttribute(const QString &tag, const QString &attribute)
0317 {
0318     return allowedAttributes[tag].contains(attribute);
0319 }
0320 
0321 bool TextHandler::isAllowedLink(const QString &link, bool isImg)
0322 {
0323     const QUrl linkUrl = QUrl(link);
0324 
0325     if (isImg) {
0326         return !linkUrl.isRelative() && linkUrl.scheme() == "mxc";
0327     } else {
0328         return !linkUrl.isRelative() && allowedLinkSchemes.contains(linkUrl.scheme());
0329     }
0330 }
0331 
0332 QString TextHandler::cleanAttributes(const QString &tag, const QString &tagString)
0333 {
0334     int nextAttributeIndex = tagString.indexOf(u' ', 1);
0335 
0336     if (nextAttributeIndex != -1) {
0337         QString outputString = tagString.left(nextAttributeIndex);
0338         QString nextAttribute;
0339         int nextSpaceIndex;
0340         nextAttributeIndex += 1;
0341 
0342         while (nextAttributeIndex < tagString.length()) {
0343             nextSpaceIndex = tagString.indexOf(TextRegex::endTagType, nextAttributeIndex);
0344             if (nextSpaceIndex == -1) {
0345                 nextSpaceIndex = tagString.length();
0346             }
0347             nextAttribute = tagString.mid(nextAttributeIndex, nextSpaceIndex - nextAttributeIndex);
0348 
0349             if (isAllowedAttribute(tag, getAttributeType(nextAttribute))) {
0350                 if (tag == QStringLiteral("img") && getAttributeType(nextAttribute) == QStringLiteral("src")) {
0351                     QString attributeData = TextRegex::attributeData.match(getAttributeData(nextAttribute)).captured(1);
0352                     if (isAllowedLink(attributeData, true)) {
0353                         outputString.append(u' ' + nextAttribute);
0354                     }
0355                 } else if (tag == u'a' && getAttributeType(nextAttribute) == QStringLiteral("href")) {
0356                     QString attributeData = TextRegex::attributeData.match(getAttributeData(nextAttribute)).captured(1);
0357                     if (isAllowedLink(attributeData)) {
0358                         outputString.append(u' ' + nextAttribute);
0359                     }
0360                 } else if (tag == QStringLiteral("code") && getAttributeType(nextAttribute) == QStringLiteral("class")) {
0361                     if (getAttributeData(nextAttribute).remove(u'"').startsWith(QStringLiteral("language-"))) {
0362                         outputString.append(u' ' + nextAttribute);
0363                     }
0364                 } else {
0365                     outputString.append(u' ' + nextAttribute);
0366                 }
0367             }
0368             nextAttributeIndex = nextSpaceIndex + 1;
0369         }
0370 
0371         outputString += u'>';
0372         return outputString;
0373     }
0374 
0375     return tagString;
0376 }
0377 
0378 QString TextHandler::markdownToHTML(const QString &markdown)
0379 {
0380     const auto str = markdown.toUtf8();
0381     char *tmp_buf = cmark_markdown_to_html(str.constData(), str.size(), CMARK_OPT_HARDBREAKS | CMARK_OPT_UNSAFE);
0382 
0383     const std::string html(tmp_buf);
0384 
0385     free(tmp_buf);
0386 
0387     auto result = QString::fromStdString(html).trimmed();
0388 
0389     result.replace(QStringLiteral("<!-- raw HTML omitted -->"), QString());
0390 
0391     return result;
0392 }
0393 
0394 /**
0395  * TODO: make this more intelligent currently other characters are not escaped
0396  * especially & as this can conflict with the cmark markdown to html conversion
0397  * which already escapes characters in code blocks. The < > still need to be handled
0398  * when the user manually types in the html.
0399  */
0400 QString TextHandler::escapeHtml(QString stringIn)
0401 {
0402     stringIn.replace(u'<', QStringLiteral("&lt;"));
0403     stringIn.replace(u'>', QStringLiteral("&gt;"));
0404     return stringIn;
0405 }
0406 
0407 QString TextHandler::unescapeHtml(QString stringIn)
0408 {
0409     // For those situations where brackets in code block get double escaped
0410     stringIn.replace(QStringLiteral("&amp;lt;"), QStringLiteral("<"));
0411     stringIn.replace(QStringLiteral("&amp;gt;"), QStringLiteral(">"));
0412     stringIn.replace(QStringLiteral("&lt;"), QStringLiteral("<"));
0413     stringIn.replace(QStringLiteral("&gt;"), QStringLiteral(">"));
0414     stringIn.replace(QStringLiteral("&amp;"), QStringLiteral("&"));
0415     stringIn.replace(QStringLiteral("&quot;"), QStringLiteral("\""));
0416     return stringIn;
0417 }
0418 
0419 QString TextHandler::linkifyUrls(QString stringIn)
0420 {
0421     stringIn = stringIn.replace(TextRegex::mxId, QStringLiteral(R"(\1<a href="https://matrix.to/#/\2">\2</a>)"));
0422     stringIn.replace(TextRegex::plainUrl, QStringLiteral(R"(<a href="\1">\1</a>)"));
0423     stringIn = stringIn.replace(TextRegex::emailAddress, QStringLiteral(R"(<a href="mailto:\2">\1\2</a>)"));
0424     return stringIn;
0425 }
0426 
0427 QList<QUrl> TextHandler::getLinkPreviews()
0428 {
0429     auto data = m_data.remove(TextRegex::removeRichReply);
0430     auto linksMatch = TextRegex::url.globalMatch(data);
0431     QList<QUrl> links;
0432     while (linksMatch.hasNext()) {
0433         auto link = linksMatch.next().captured();
0434         if (!link.contains(QStringLiteral("matrix.to"))) {
0435             links += QUrl(link);
0436         }
0437     }
0438     return links;
0439 }
0440 
0441 #include "moc_texthandler.cpp"