File indexing completed on 2024-10-06 10:23:57
0001 // SPDX-FileCopyrightText: 2023 James Graham <james.h.graham@protonmail.com> 0002 // SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL 0003 0004 #include "texthandler.h" 0005 0006 #include <QDebug> 0007 #include <QUrl> 0008 #include <QStringLiteral> 0009 0010 #include <Quotient/events/roommessageevent.h> 0011 #include <Quotient/util.h> 0012 #include <qstringliteral.h> 0013 0014 #include <cmark.h> 0015 0016 #include <Kirigami/PlatformTheme> 0017 0018 static const QStringList allowedTags = { 0019 QStringLiteral("font"), QStringLiteral("del"), QStringLiteral("h1"), QStringLiteral("h2"), QStringLiteral("h3"), QStringLiteral("h4"), 0020 QStringLiteral("h5"), QStringLiteral("h6"), QStringLiteral("blockquote"), QStringLiteral("p"), QStringLiteral("a"), QStringLiteral("ul"), 0021 QStringLiteral("ol"), QStringLiteral("sup"), QStringLiteral("sub"), QStringLiteral("li"), QStringLiteral("b"), QStringLiteral("i"), 0022 QStringLiteral("u"), QStringLiteral("strong"), QStringLiteral("em"), QStringLiteral("strike"), QStringLiteral("code"), QStringLiteral("hr"), 0023 QStringLiteral("br"), QStringLiteral("div"), QStringLiteral("table"), QStringLiteral("thead"), QStringLiteral("tbody"), QStringLiteral("tr"), 0024 QStringLiteral("th"), QStringLiteral("td"), QStringLiteral("caption"), QStringLiteral("pre"), QStringLiteral("span"), QStringLiteral("img"), 0025 QStringLiteral("details"), QStringLiteral("summary")}; 0026 static const QHash<QString, QStringList> allowedAttributes = { 0027 {QStringLiteral("font"), {QStringLiteral("data-mx-bg-color"), QStringLiteral("data-mx-color"), QStringLiteral("color")}}, 0028 {QStringLiteral("span"), {QStringLiteral("data-mx-bg-color"), QStringLiteral("data-mx-color"), QStringLiteral("data-mx-spoiler")}}, 0029 {QStringLiteral("a"), {QStringLiteral("name"), QStringLiteral("target"), QStringLiteral("href")}}, 0030 {QStringLiteral("img"), {QStringLiteral("width"), QStringLiteral("height"), QStringLiteral("alt"), QStringLiteral("title"), QStringLiteral("src")}}, 0031 {QStringLiteral("ol"), {QStringLiteral("start")}}, 0032 {QStringLiteral("code"), {QStringLiteral("class")}}}; 0033 static const QStringList allowedLinkSchemes = {QStringLiteral("https"), 0034 QStringLiteral("http"), 0035 QStringLiteral("ftp"), 0036 QStringLiteral("mailto"), 0037 QStringLiteral("magnet")}; 0038 0039 QString TextHandler::data() const 0040 { 0041 return m_data; 0042 } 0043 0044 void TextHandler::setData(const QString &string) 0045 { 0046 m_data = string; 0047 m_pos = 0; 0048 } 0049 0050 QString TextHandler::handleSendText() 0051 { 0052 m_pos = 0; 0053 m_dataBuffer = markdownToHTML(m_data); 0054 0055 nextTokenType(); 0056 0057 // Strip any disallowed tags/attributes. 0058 QString outputString; 0059 while (m_pos < m_dataBuffer.length()) { 0060 next(); 0061 0062 QString nextTokenBuffer = m_nextToken; 0063 if (m_nextTokenType == Type::Text || m_nextTokenType == Type::TextCode) { 0064 nextTokenBuffer = escapeHtml(nextTokenBuffer); 0065 } else if (m_nextTokenType == Type::Tag) { 0066 if (!isAllowedTag(getTagType())) { 0067 nextTokenBuffer = QString(); 0068 } 0069 nextTokenBuffer = cleanAttributes(getTagType(), nextTokenBuffer); 0070 } 0071 0072 outputString.append(nextTokenBuffer); 0073 0074 nextTokenType(); 0075 } 0076 return outputString; 0077 } 0078 0079 QString TextHandler::handleRecieveRichText(Qt::TextFormat inputFormat, const NeoChatRoom *room, const Quotient::RoomEvent *event, bool stripNewlines) 0080 { 0081 m_pos = 0; 0082 m_dataBuffer = m_data; 0083 0084 // Strip mx-reply if present. 0085 m_dataBuffer.remove(TextRegex::removeRichReply); 0086 0087 // For plain text, convert links, escape html and convert line brakes. 0088 if (inputFormat == Qt::PlainText) { 0089 m_dataBuffer = escapeHtml(m_dataBuffer); 0090 m_dataBuffer.replace(u'\n', QStringLiteral("<br>")); 0091 } 0092 0093 // Linkify any plain text urls 0094 m_dataBuffer = linkifyUrls(m_dataBuffer); 0095 0096 // Apply user style 0097 m_dataBuffer.replace(TextRegex::userPill, QStringLiteral(R"(<b>\1</b>)")); 0098 0099 // Make all media URLs resolvable. 0100 if (room && event) { 0101 QRegularExpressionMatchIterator i = TextRegex::mxcImage.globalMatch(m_dataBuffer); 0102 while (i.hasNext()) { 0103 const QRegularExpressionMatch match = i.next(); 0104 const QUrl mediaUrl = room->makeMediaUrl(event->id(), QUrl(QStringLiteral("mxc://") + match.captured(2) + u'/' + match.captured(3))); 0105 m_dataBuffer.replace(match.captured(0), 0106 QStringLiteral("<img ") + match.captured(1) + QStringLiteral("src=\"") + mediaUrl.toString() + u'"' + match.captured(4) 0107 + u'>'); 0108 } 0109 } 0110 0111 // Strip any disallowed tags/attributes. 0112 QString outputString; 0113 nextTokenType(); 0114 while (m_pos < m_dataBuffer.length()) { 0115 next(); 0116 0117 QString nextTokenBuffer = m_nextToken; 0118 if (m_nextTokenType == Type::Text || m_nextTokenType == Type::TextCode) { 0119 nextTokenBuffer = escapeHtml(nextTokenBuffer); 0120 } else if (m_nextTokenType == Type::Tag) { 0121 if (!isAllowedTag(getTagType())) { 0122 nextTokenBuffer = QString(); 0123 } else if ((getTagType() == QStringLiteral("br") && stripNewlines)) { 0124 nextTokenBuffer = u' '; 0125 } 0126 nextTokenBuffer = cleanAttributes(getTagType(), nextTokenBuffer); 0127 } 0128 0129 outputString.append(nextTokenBuffer); 0130 0131 nextTokenType(); 0132 } 0133 0134 // If the message is an emote add the user pill to the front of the message. 0135 if (event != nullptr) { 0136 auto e = eventCast<const Quotient::RoomMessageEvent>(event); 0137 if (e->msgtype() == Quotient::MessageEventType::Emote) { 0138 auto author = static_cast<NeoChatUser *>(room->user(e->senderId())); 0139 QString emoteString = QStringLiteral("* <a href=\"https://matrix.to/#/") + e->senderId() + QStringLiteral("\" style=\"color:") 0140 + author->color().name() + QStringLiteral("\">") + author->displayname(room) + QStringLiteral("</a> "); 0141 if (outputString.startsWith(QStringLiteral("<p>"))) { 0142 outputString.insert(3, emoteString); 0143 } else { 0144 outputString.prepend(emoteString); 0145 } 0146 } 0147 } 0148 0149 if (auto e = eventCast<const Quotient::RoomMessageEvent>(event)) { 0150 bool isEdited = 0151 !e->unsignedJson().isEmpty() && e->unsignedJson().contains("m.relations") && e->unsignedJson()["m.relations"].toObject().contains("m.replace"); 0152 if (isEdited) { 0153 Kirigami::PlatformTheme *theme = static_cast<Kirigami::PlatformTheme *>(qmlAttachedPropertiesObject<Kirigami::PlatformTheme>(this, true)); 0154 0155 QString editTextColor; 0156 if (theme != nullptr) { 0157 editTextColor = theme->disabledTextColor().name(); 0158 } else { 0159 editTextColor = QStringLiteral("#000000"); 0160 } 0161 QString editedString = QStringLiteral(" <span style=\"color:") + editTextColor + QStringLiteral("\">(edited)</span>"); 0162 if (outputString.endsWith(QStringLiteral("</p>"))) { 0163 outputString.insert(outputString.length() - 4, editedString); 0164 } else if (outputString.endsWith(QStringLiteral("</pre>")) || outputString.endsWith(QStringLiteral("</blockquote>")) 0165 || outputString.endsWith(QStringLiteral("</table>")) || outputString.endsWith(QStringLiteral("</ol>")) 0166 || outputString.endsWith(QStringLiteral("</ul>"))) { 0167 outputString.append("<p>" + editedString + "</p>"); 0168 } else { 0169 outputString.append(editedString); 0170 } 0171 } 0172 } 0173 0174 /** 0175 * Replace <del> with <s> 0176 * Note: <s> is still not a valid tag for the message from the server. We 0177 * convert as that is what is needed for Qt::RichText. 0178 */ 0179 outputString.replace(TextRegex::strikethrough, QStringLiteral("<s>\\1</s>")); 0180 return outputString; 0181 } 0182 0183 QString TextHandler::handleRecievePlainText(Qt::TextFormat inputFormat, const bool &stripNewlines) 0184 { 0185 m_pos = 0; 0186 m_dataBuffer = m_data; 0187 0188 // Strip mx-reply if present. 0189 m_dataBuffer.remove(TextRegex::removeRichReply); 0190 0191 // Escaping then unescaping allows < and > to be maintained in a plain text string 0192 // otherwise markdownToHTML will strip what it thinks is a bad html tag entirely. 0193 if (inputFormat == Qt::PlainText) { 0194 m_dataBuffer = escapeHtml(m_dataBuffer); 0195 } 0196 0197 /** 0198 * This seems counterproductive but by converting any markup which could 0199 * arrive (e.g. in a caption body) it can then be stripped by the same code. 0200 */ 0201 m_dataBuffer = markdownToHTML(m_dataBuffer); 0202 0203 if (stripNewlines) { 0204 m_dataBuffer.replace(QStringLiteral("<br>\n"), QStringLiteral(" ")); 0205 m_dataBuffer.replace(QStringLiteral("<br>"), QStringLiteral(" ")); 0206 m_dataBuffer.replace(QStringLiteral("<br />\n"), QStringLiteral(" ")); 0207 m_dataBuffer.replace(QStringLiteral("<br />"), QStringLiteral(" ")); 0208 m_dataBuffer.replace(u'\n', QStringLiteral(" ")); 0209 m_dataBuffer.replace(u'\u2028', " "); 0210 } 0211 0212 // Strip all tags/attributes except code blocks which will be escaped. 0213 QString outputString; 0214 nextTokenType(); 0215 while (m_pos < m_dataBuffer.length()) { 0216 next(); 0217 0218 QString nextTokenBuffer = m_nextToken; 0219 if (m_nextTokenType == Type::TextCode) { 0220 nextTokenBuffer = unescapeHtml(nextTokenBuffer); 0221 } else if (m_nextTokenType == Type::Tag) { 0222 nextTokenBuffer = QString(); 0223 } 0224 0225 outputString.append(nextTokenBuffer); 0226 0227 nextTokenType(); 0228 } 0229 0230 // Escaping then unescaping allows < and > to be maintained in a plain text string 0231 // otherwise markdownToHTML will strip what it thinks is a bad html tag entirely. 0232 outputString = unescapeHtml(outputString); 0233 0234 outputString = outputString.trimmed(); 0235 return outputString; 0236 } 0237 0238 void TextHandler::next() 0239 { 0240 QString searchStr; 0241 if (m_nextTokenType == Type::Tag) { 0242 searchStr = u'>'; 0243 } else if (m_nextTokenType == Type::TextCode) { 0244 // Anything between code tags is assumed to be plain text 0245 searchStr = QStringLiteral("</code>"); 0246 } else { 0247 searchStr = u'<'; 0248 } 0249 0250 int tokenEnd = m_dataBuffer.indexOf(searchStr, m_pos + 1); 0251 if (tokenEnd == -1) { 0252 tokenEnd = m_dataBuffer.length(); 0253 } 0254 0255 m_nextToken = m_dataBuffer.mid(m_pos, tokenEnd - m_pos + (m_nextTokenType == Type::Tag ? 1 : 0)); 0256 m_pos = tokenEnd + (m_nextTokenType == Type::Tag ? 1 : 0); 0257 } 0258 0259 void TextHandler::nextTokenType() 0260 { 0261 if (m_pos >= m_dataBuffer.length()) { 0262 // This is to stop the function accessing an index outside the length of 0263 // m_dataBuffer during the final loop. 0264 m_nextTokenType = Type::End; 0265 } else if (m_nextTokenType == Type::Tag && getTagType() == QStringLiteral("code") && !isCloseTag() 0266 && m_dataBuffer.indexOf(QStringLiteral("</code>"), m_pos) != m_pos) { 0267 m_nextTokenType = Type::TextCode; 0268 } else if (m_dataBuffer[m_pos] == u'<' && m_dataBuffer[m_pos + 1] != u' ') { 0269 m_nextTokenType = Type::Tag; 0270 } else { 0271 m_nextTokenType = Type::Text; 0272 } 0273 } 0274 0275 QString TextHandler::getTagType() const 0276 { 0277 if (m_nextToken.isEmpty()) { 0278 return QString(); 0279 } 0280 const int tagTypeStart = m_nextToken[1] == u'/' ? 2 : 1; 0281 const int tagTypeEnd = m_nextToken.indexOf(TextRegex::endTagType, tagTypeStart); 0282 return m_nextToken.mid(tagTypeStart, tagTypeEnd - tagTypeStart); 0283 } 0284 0285 bool TextHandler::isCloseTag() const 0286 { 0287 if (m_nextToken.isEmpty()) { 0288 return false; 0289 } 0290 return m_nextToken[1] == u'/'; 0291 } 0292 0293 QString TextHandler::getAttributeType(const QString &string) 0294 { 0295 if (!string.contains(u'=')) { 0296 return string; 0297 } 0298 const int equalsPos = string.indexOf(u'='); 0299 return string.left(equalsPos); 0300 } 0301 0302 QString TextHandler::getAttributeData(const QString &string) 0303 { 0304 if (!string.contains(u'=')) { 0305 return QStringLiteral(); 0306 } 0307 const int equalsPos = string.indexOf(u'='); 0308 return string.right(string.length() - equalsPos - 1); 0309 } 0310 0311 bool TextHandler::isAllowedTag(const QString &type) 0312 { 0313 return allowedTags.contains(type); 0314 } 0315 0316 bool TextHandler::isAllowedAttribute(const QString &tag, const QString &attribute) 0317 { 0318 return allowedAttributes[tag].contains(attribute); 0319 } 0320 0321 bool TextHandler::isAllowedLink(const QString &link, bool isImg) 0322 { 0323 const QUrl linkUrl = QUrl(link); 0324 0325 if (isImg) { 0326 return !linkUrl.isRelative() && linkUrl.scheme() == "mxc"; 0327 } else { 0328 return !linkUrl.isRelative() && allowedLinkSchemes.contains(linkUrl.scheme()); 0329 } 0330 } 0331 0332 QString TextHandler::cleanAttributes(const QString &tag, const QString &tagString) 0333 { 0334 int nextAttributeIndex = tagString.indexOf(u' ', 1); 0335 0336 if (nextAttributeIndex != -1) { 0337 QString outputString = tagString.left(nextAttributeIndex); 0338 QString nextAttribute; 0339 int nextSpaceIndex; 0340 nextAttributeIndex += 1; 0341 0342 while (nextAttributeIndex < tagString.length()) { 0343 nextSpaceIndex = tagString.indexOf(TextRegex::endTagType, nextAttributeIndex); 0344 if (nextSpaceIndex == -1) { 0345 nextSpaceIndex = tagString.length(); 0346 } 0347 nextAttribute = tagString.mid(nextAttributeIndex, nextSpaceIndex - nextAttributeIndex); 0348 0349 if (isAllowedAttribute(tag, getAttributeType(nextAttribute))) { 0350 if (tag == QStringLiteral("img") && getAttributeType(nextAttribute) == QStringLiteral("src")) { 0351 QString attributeData = TextRegex::attributeData.match(getAttributeData(nextAttribute)).captured(1); 0352 if (isAllowedLink(attributeData, true)) { 0353 outputString.append(u' ' + nextAttribute); 0354 } 0355 } else if (tag == u'a' && getAttributeType(nextAttribute) == QStringLiteral("href")) { 0356 QString attributeData = TextRegex::attributeData.match(getAttributeData(nextAttribute)).captured(1); 0357 if (isAllowedLink(attributeData)) { 0358 outputString.append(u' ' + nextAttribute); 0359 } 0360 } else if (tag == QStringLiteral("code") && getAttributeType(nextAttribute) == QStringLiteral("class")) { 0361 if (getAttributeData(nextAttribute).remove(u'"').startsWith(QStringLiteral("language-"))) { 0362 outputString.append(u' ' + nextAttribute); 0363 } 0364 } else { 0365 outputString.append(u' ' + nextAttribute); 0366 } 0367 } 0368 nextAttributeIndex = nextSpaceIndex + 1; 0369 } 0370 0371 outputString += u'>'; 0372 return outputString; 0373 } 0374 0375 return tagString; 0376 } 0377 0378 QString TextHandler::markdownToHTML(const QString &markdown) 0379 { 0380 const auto str = markdown.toUtf8(); 0381 char *tmp_buf = cmark_markdown_to_html(str.constData(), str.size(), CMARK_OPT_HARDBREAKS | CMARK_OPT_UNSAFE); 0382 0383 const std::string html(tmp_buf); 0384 0385 free(tmp_buf); 0386 0387 auto result = QString::fromStdString(html).trimmed(); 0388 0389 result.replace(QStringLiteral("<!-- raw HTML omitted -->"), QString()); 0390 0391 return result; 0392 } 0393 0394 /** 0395 * TODO: make this more intelligent currently other characters are not escaped 0396 * especially & as this can conflict with the cmark markdown to html conversion 0397 * which already escapes characters in code blocks. The < > still need to be handled 0398 * when the user manually types in the html. 0399 */ 0400 QString TextHandler::escapeHtml(QString stringIn) 0401 { 0402 stringIn.replace(u'<', QStringLiteral("<")); 0403 stringIn.replace(u'>', QStringLiteral(">")); 0404 return stringIn; 0405 } 0406 0407 QString TextHandler::unescapeHtml(QString stringIn) 0408 { 0409 // For those situations where brackets in code block get double escaped 0410 stringIn.replace(QStringLiteral("&lt;"), QStringLiteral("<")); 0411 stringIn.replace(QStringLiteral("&gt;"), QStringLiteral(">")); 0412 stringIn.replace(QStringLiteral("<"), QStringLiteral("<")); 0413 stringIn.replace(QStringLiteral(">"), QStringLiteral(">")); 0414 stringIn.replace(QStringLiteral("&"), QStringLiteral("&")); 0415 stringIn.replace(QStringLiteral("""), QStringLiteral("\"")); 0416 return stringIn; 0417 } 0418 0419 QString TextHandler::linkifyUrls(QString stringIn) 0420 { 0421 stringIn = stringIn.replace(TextRegex::mxId, QStringLiteral(R"(\1<a href="https://matrix.to/#/\2">\2</a>)")); 0422 stringIn.replace(TextRegex::plainUrl, QStringLiteral(R"(<a href="\1">\1</a>)")); 0423 stringIn = stringIn.replace(TextRegex::emailAddress, QStringLiteral(R"(<a href="mailto:\2">\1\2</a>)")); 0424 return stringIn; 0425 } 0426 0427 QList<QUrl> TextHandler::getLinkPreviews() 0428 { 0429 auto data = m_data.remove(TextRegex::removeRichReply); 0430 auto linksMatch = TextRegex::url.globalMatch(data); 0431 QList<QUrl> links; 0432 while (linksMatch.hasNext()) { 0433 auto link = linksMatch.next().captured(); 0434 if (!link.contains(QStringLiteral("matrix.to"))) { 0435 links += QUrl(link); 0436 } 0437 } 0438 return links; 0439 } 0440 0441 #include "moc_texthandler.cpp"