File indexing completed on 2024-10-06 07:36:10
0001 // SPDX-FileCopyrightText: 2023 James Graham <james.h.graham@protonmail.com> 0002 // SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL 0003 0004 #include "texthandler.h" 0005 0006 #include <QDebug> 0007 #include <QGuiApplication> 0008 #include <QStringLiteral> 0009 #include <QUrl> 0010 0011 #include <Quotient/events/roommessageevent.h> 0012 #include <Quotient/util.h> 0013 #include <qstringliteral.h> 0014 0015 #include <cmark.h> 0016 0017 #include <Kirigami/Platform/PlatformTheme> 0018 0019 #include "models/customemojimodel.h" 0020 #include "utils.h" 0021 0022 static const QStringList allowedTags = { 0023 QStringLiteral("font"), QStringLiteral("del"), QStringLiteral("h1"), QStringLiteral("h2"), QStringLiteral("h3"), QStringLiteral("h4"), 0024 QStringLiteral("h5"), QStringLiteral("h6"), QStringLiteral("blockquote"), QStringLiteral("p"), QStringLiteral("a"), QStringLiteral("ul"), 0025 QStringLiteral("ol"), QStringLiteral("sup"), QStringLiteral("sub"), QStringLiteral("li"), QStringLiteral("b"), QStringLiteral("i"), 0026 QStringLiteral("u"), QStringLiteral("strong"), QStringLiteral("em"), QStringLiteral("strike"), QStringLiteral("code"), QStringLiteral("hr"), 0027 QStringLiteral("br"), QStringLiteral("div"), QStringLiteral("table"), QStringLiteral("thead"), QStringLiteral("tbody"), QStringLiteral("tr"), 0028 QStringLiteral("th"), QStringLiteral("td"), QStringLiteral("caption"), QStringLiteral("pre"), QStringLiteral("span"), QStringLiteral("img"), 0029 QStringLiteral("details"), QStringLiteral("summary")}; 0030 static const QHash<QString, QStringList> allowedAttributes = { 0031 {QStringLiteral("font"), {QStringLiteral("data-mx-bg-color"), QStringLiteral("data-mx-color"), QStringLiteral("color")}}, 0032 {QStringLiteral("span"), {QStringLiteral("data-mx-bg-color"), QStringLiteral("data-mx-color"), QStringLiteral("data-mx-spoiler")}}, 0033 {QStringLiteral("a"), {QStringLiteral("name"), QStringLiteral("target"), QStringLiteral("href")}}, 0034 {QStringLiteral("img"), {QStringLiteral("width"), QStringLiteral("height"), QStringLiteral("alt"), QStringLiteral("title"), QStringLiteral("src")}}, 0035 {QStringLiteral("ol"), {QStringLiteral("start")}}, 0036 {QStringLiteral("code"), {QStringLiteral("class")}}}; 0037 static const QStringList allowedLinkSchemes = {QStringLiteral("https"), 0038 QStringLiteral("http"), 0039 QStringLiteral("ftp"), 0040 QStringLiteral("mailto"), 0041 QStringLiteral("magnet")}; 0042 0043 QString TextHandler::data() const 0044 { 0045 return m_data; 0046 } 0047 0048 void TextHandler::setData(const QString &string) 0049 { 0050 m_data = string; 0051 m_pos = 0; 0052 } 0053 0054 QString TextHandler::handleSendText() 0055 { 0056 m_pos = 0; 0057 m_dataBuffer = markdownToHTML(m_data); 0058 0059 nextTokenType(); 0060 0061 // Strip any disallowed tags/attributes. 0062 QString outputString; 0063 while (m_pos < m_dataBuffer.length()) { 0064 next(); 0065 0066 QString nextTokenBuffer = m_nextToken; 0067 switch (m_nextTokenType) { 0068 case Text: 0069 nextTokenBuffer = escapeHtml(nextTokenBuffer); 0070 nextTokenBuffer = CustomEmojiModel::instance().preprocessText(nextTokenBuffer); 0071 break; 0072 case TextCode: 0073 nextTokenBuffer = escapeHtml(nextTokenBuffer); 0074 break; 0075 case Tag: 0076 if (!isAllowedTag(getTagType())) { 0077 nextTokenBuffer = QString(); 0078 } 0079 nextTokenBuffer = cleanAttributes(getTagType(), nextTokenBuffer); 0080 default: 0081 break; 0082 } 0083 0084 outputString.append(nextTokenBuffer); 0085 0086 nextTokenType(); 0087 } 0088 return outputString; 0089 } 0090 0091 QString TextHandler::handleRecieveRichText(Qt::TextFormat inputFormat, const NeoChatRoom *room, const Quotient::RoomEvent *event, bool stripNewlines) 0092 { 0093 m_pos = 0; 0094 m_dataBuffer = m_data; 0095 0096 // Strip mx-reply if present. 0097 m_dataBuffer.remove(TextRegex::removeRichReply); 0098 0099 // For plain text, convert links, escape html and convert line brakes. 0100 if (inputFormat == Qt::PlainText) { 0101 m_dataBuffer = escapeHtml(m_dataBuffer); 0102 m_dataBuffer.replace(u'\n', QStringLiteral("<br>")); 0103 } 0104 0105 // Linkify any plain text urls 0106 m_dataBuffer = linkifyUrls(m_dataBuffer); 0107 0108 // Apply user style 0109 m_dataBuffer.replace(TextRegex::userPill, QStringLiteral(R"(<b>\1</b>)")); 0110 0111 // Make all media URLs resolvable. 0112 if (room && event) { 0113 QRegularExpressionMatchIterator i = TextRegex::mxcImage.globalMatch(m_dataBuffer); 0114 while (i.hasNext()) { 0115 const QRegularExpressionMatch match = i.next(); 0116 const QUrl mediaUrl = room->makeMediaUrl(event->id(), QUrl(QStringLiteral("mxc://") + match.captured(2) + u'/' + match.captured(3))); 0117 m_dataBuffer.replace(match.captured(0), 0118 QStringLiteral("<img ") + match.captured(1) + QStringLiteral("src=\"") + mediaUrl.toString() + u'"' + match.captured(4) 0119 + u'>'); 0120 } 0121 } 0122 0123 // Strip any disallowed tags/attributes. 0124 QString outputString; 0125 nextTokenType(); 0126 while (m_pos < m_dataBuffer.length()) { 0127 next(); 0128 0129 QString nextTokenBuffer = m_nextToken; 0130 if (m_nextTokenType == Type::Text || m_nextTokenType == Type::TextCode) { 0131 nextTokenBuffer = escapeHtml(nextTokenBuffer); 0132 } else if (m_nextTokenType == Type::Tag) { 0133 if (!isAllowedTag(getTagType())) { 0134 nextTokenBuffer = QString(); 0135 } else if ((getTagType() == QStringLiteral("br") && stripNewlines)) { 0136 nextTokenBuffer = u' '; 0137 } 0138 nextTokenBuffer = cleanAttributes(getTagType(), nextTokenBuffer); 0139 } 0140 0141 outputString.append(nextTokenBuffer); 0142 0143 nextTokenType(); 0144 } 0145 0146 // Apply user style to blockquotes 0147 // Unfortunately some attributes can be only be used on table cells, so we need to wrap the content in one. 0148 outputString.replace(TextRegex::blockQuote, QStringLiteral(R"(<blockquote><table><tr><td>“\1”</td></tr></table></blockquote>)")); 0149 0150 // If the message is an emote add the user pill to the front of the message. 0151 if (event != nullptr) { 0152 auto e = eventCast<const Quotient::RoomMessageEvent>(event); 0153 if (e->msgtype() == Quotient::MessageEventType::Emote) { 0154 auto author = room->user(e->senderId()); 0155 QString emoteString = QStringLiteral("* <a href=\"https://matrix.to/#/") + e->senderId() + QStringLiteral("\" style=\"color:") 0156 + Utils::getUserColor(author->hueF()).name() + QStringLiteral("\">") + author->displayname(room) + QStringLiteral("</a> "); 0157 if (outputString.startsWith(QStringLiteral("<p>"))) { 0158 outputString.insert(3, emoteString); 0159 } else { 0160 outputString.prepend(emoteString); 0161 } 0162 } 0163 } 0164 0165 if (auto e = eventCast<const Quotient::RoomMessageEvent>(event)) { 0166 bool isEdited = !e->unsignedJson().isEmpty() && e->unsignedJson().contains(QStringLiteral("m.relations")) 0167 && e->unsignedJson()[QStringLiteral("m.relations")].toObject().contains(QStringLiteral("m.replace")); 0168 if (isEdited) { 0169 Kirigami::Platform::PlatformTheme *theme = 0170 static_cast<Kirigami::Platform::PlatformTheme *>(qmlAttachedPropertiesObject<Kirigami::Platform::PlatformTheme>(this, true)); 0171 0172 QString editTextColor; 0173 if (theme != nullptr) { 0174 editTextColor = theme->disabledTextColor().name(); 0175 } else { 0176 editTextColor = QStringLiteral("#000000"); 0177 } 0178 QString editedString = QStringLiteral(" <span style=\"color:") + editTextColor + QStringLiteral("\">(edited)</span>"); 0179 if (outputString.endsWith(QStringLiteral("</p>"))) { 0180 outputString.insert(outputString.length() - 4, editedString); 0181 } else if (outputString.endsWith(QStringLiteral("</pre>")) || outputString.endsWith(QStringLiteral("</blockquote>")) 0182 || outputString.endsWith(QStringLiteral("</table>")) || outputString.endsWith(QStringLiteral("</ol>")) 0183 || outputString.endsWith(QStringLiteral("</ul>"))) { 0184 outputString.append(QStringLiteral("<p>%1</p>").arg(editedString)); 0185 } else { 0186 outputString.append(editedString); 0187 } 0188 } 0189 } 0190 0191 /** 0192 * Replace <del> with <s> 0193 * Note: <s> is still not a valid tag for the message from the server. We 0194 * convert as that is what is needed for Qt::RichText. 0195 */ 0196 outputString.replace(TextRegex::strikethrough, QStringLiteral("<s>\\1</s>")); 0197 return outputString; 0198 } 0199 0200 QString TextHandler::handleRecievePlainText(Qt::TextFormat inputFormat, const bool &stripNewlines) 0201 { 0202 m_pos = 0; 0203 m_dataBuffer = m_data; 0204 0205 // Strip mx-reply if present. 0206 m_dataBuffer.remove(TextRegex::removeRichReply); 0207 0208 // Escaping then unescaping allows < and > to be maintained in a plain text string 0209 // otherwise markdownToHTML will strip what it thinks is a bad html tag entirely. 0210 if (inputFormat == Qt::PlainText) { 0211 m_dataBuffer = escapeHtml(m_dataBuffer); 0212 } 0213 0214 /** 0215 * This seems counterproductive but by converting any markup which could 0216 * arrive (e.g. in a caption body) it can then be stripped by the same code. 0217 */ 0218 m_dataBuffer = markdownToHTML(m_dataBuffer); 0219 // This is how \n is converted and for plain text we need it to just be <br> 0220 // to prevent extra newlines being inserted. 0221 m_dataBuffer.replace(QStringLiteral("<br />\n"), QStringLiteral("<br>")); 0222 0223 if (stripNewlines) { 0224 m_dataBuffer.replace(QStringLiteral("<br>\n"), QStringLiteral(" ")); 0225 m_dataBuffer.replace(QStringLiteral("<br>"), QStringLiteral(" ")); 0226 m_dataBuffer.replace(QStringLiteral("<br />\n"), QStringLiteral(" ")); 0227 m_dataBuffer.replace(QStringLiteral("<br />"), QStringLiteral(" ")); 0228 m_dataBuffer.replace(u'\n', QStringLiteral(" ")); 0229 m_dataBuffer.replace(u'\u2028', QStringLiteral(" ")); 0230 } 0231 0232 // Strip all tags/attributes except code blocks which will be escaped. 0233 QString outputString; 0234 nextTokenType(); 0235 while (m_pos < m_dataBuffer.length()) { 0236 next(); 0237 0238 QString nextTokenBuffer = m_nextToken; 0239 if (m_nextTokenType == Type::TextCode) { 0240 nextTokenBuffer = unescapeHtml(nextTokenBuffer); 0241 } else if (m_nextTokenType == Type::Tag) { 0242 if (getTagType() == QStringLiteral("br") && !stripNewlines) { 0243 nextTokenBuffer = u'\n'; 0244 } else { 0245 nextTokenBuffer = QString(); 0246 } 0247 } 0248 0249 outputString.append(nextTokenBuffer); 0250 0251 nextTokenType(); 0252 } 0253 0254 // Escaping then unescaping allows < and > to be maintained in a plain text string 0255 // otherwise markdownToHTML will strip what it thinks is a bad html tag entirely. 0256 outputString = unescapeHtml(outputString); 0257 0258 outputString = outputString.trimmed(); 0259 return outputString; 0260 } 0261 0262 void TextHandler::next() 0263 { 0264 QString searchStr; 0265 if (m_nextTokenType == Type::Tag) { 0266 searchStr = u'>'; 0267 } else if (m_nextTokenType == Type::TextCode) { 0268 // Anything between code tags is assumed to be plain text 0269 searchStr = QStringLiteral("</code>"); 0270 } else { 0271 searchStr = u'<'; 0272 } 0273 0274 int tokenEnd = m_dataBuffer.indexOf(searchStr, m_pos + 1); 0275 if (tokenEnd == -1) { 0276 tokenEnd = m_dataBuffer.length(); 0277 } 0278 0279 m_nextToken = m_dataBuffer.mid(m_pos, tokenEnd - m_pos + (m_nextTokenType == Type::Tag ? 1 : 0)); 0280 m_pos = tokenEnd + (m_nextTokenType == Type::Tag ? 1 : 0); 0281 } 0282 0283 void TextHandler::nextTokenType() 0284 { 0285 if (m_pos >= m_dataBuffer.length()) { 0286 // This is to stop the function accessing an index outside the length of 0287 // m_dataBuffer during the final loop. 0288 m_nextTokenType = Type::End; 0289 } else if (m_nextTokenType == Type::Tag && getTagType() == QStringLiteral("code") && !isCloseTag() 0290 && m_dataBuffer.indexOf(QStringLiteral("</code>"), m_pos) != m_pos) { 0291 m_nextTokenType = Type::TextCode; 0292 } else if (m_dataBuffer[m_pos] == u'<' && m_dataBuffer[m_pos + 1] != u' ') { 0293 m_nextTokenType = Type::Tag; 0294 } else { 0295 m_nextTokenType = Type::Text; 0296 } 0297 } 0298 0299 QString TextHandler::getTagType() const 0300 { 0301 if (m_nextToken.isEmpty()) { 0302 return QString(); 0303 } 0304 const int tagTypeStart = m_nextToken[1] == u'/' ? 2 : 1; 0305 const int tagTypeEnd = m_nextToken.indexOf(TextRegex::endTagType, tagTypeStart); 0306 return m_nextToken.mid(tagTypeStart, tagTypeEnd - tagTypeStart); 0307 } 0308 0309 bool TextHandler::isCloseTag() const 0310 { 0311 if (m_nextToken.isEmpty()) { 0312 return false; 0313 } 0314 return m_nextToken[1] == u'/'; 0315 } 0316 0317 QString TextHandler::getAttributeType(const QString &string) 0318 { 0319 if (!string.contains(u'=')) { 0320 return string; 0321 } 0322 const int equalsPos = string.indexOf(u'='); 0323 return string.left(equalsPos); 0324 } 0325 0326 QString TextHandler::getAttributeData(const QString &string) 0327 { 0328 if (!string.contains(u'=')) { 0329 return QStringLiteral(); 0330 } 0331 const int equalsPos = string.indexOf(u'='); 0332 return string.right(string.length() - equalsPos - 1); 0333 } 0334 0335 bool TextHandler::isAllowedTag(const QString &type) 0336 { 0337 return allowedTags.contains(type); 0338 } 0339 0340 bool TextHandler::isAllowedAttribute(const QString &tag, const QString &attribute) 0341 { 0342 return allowedAttributes[tag].contains(attribute); 0343 } 0344 0345 bool TextHandler::isAllowedLink(const QString &link, bool isImg) 0346 { 0347 const QUrl linkUrl = QUrl(link); 0348 0349 if (isImg) { 0350 return !linkUrl.isRelative() && linkUrl.scheme() == QStringLiteral("mxc"); 0351 } else { 0352 return !linkUrl.isRelative() && allowedLinkSchemes.contains(linkUrl.scheme()); 0353 } 0354 } 0355 0356 QString TextHandler::cleanAttributes(const QString &tag, const QString &tagString) 0357 { 0358 int nextAttributeIndex = tagString.indexOf(u' ', 1); 0359 0360 if (nextAttributeIndex != -1) { 0361 QString outputString = tagString.left(nextAttributeIndex); 0362 QString nextAttribute; 0363 int nextSpaceIndex; 0364 nextAttributeIndex += 1; 0365 0366 while (nextAttributeIndex < tagString.length()) { 0367 nextSpaceIndex = tagString.indexOf(TextRegex::endTagType, nextAttributeIndex); 0368 if (nextSpaceIndex == -1) { 0369 nextSpaceIndex = tagString.length(); 0370 } 0371 nextAttribute = tagString.mid(nextAttributeIndex, nextSpaceIndex - nextAttributeIndex); 0372 0373 if (isAllowedAttribute(tag, getAttributeType(nextAttribute))) { 0374 if (tag == QStringLiteral("img") && getAttributeType(nextAttribute) == QStringLiteral("src")) { 0375 QString attributeData = TextRegex::attributeData.match(getAttributeData(nextAttribute)).captured(1); 0376 if (isAllowedLink(attributeData, true)) { 0377 outputString.append(u' ' + nextAttribute); 0378 } 0379 } else if (tag == u'a' && getAttributeType(nextAttribute) == QStringLiteral("href")) { 0380 QString attributeData = TextRegex::attributeData.match(getAttributeData(nextAttribute)).captured(1); 0381 if (isAllowedLink(attributeData)) { 0382 outputString.append(u' ' + nextAttribute); 0383 } 0384 } else if (tag == QStringLiteral("code") && getAttributeType(nextAttribute) == QStringLiteral("class")) { 0385 if (getAttributeData(nextAttribute).remove(u'"').startsWith(QStringLiteral("language-"))) { 0386 outputString.append(u' ' + nextAttribute); 0387 } 0388 } else { 0389 outputString.append(u' ' + nextAttribute); 0390 } 0391 } 0392 nextAttributeIndex = nextSpaceIndex + 1; 0393 } 0394 0395 outputString += u'>'; 0396 return outputString; 0397 } 0398 0399 return tagString; 0400 } 0401 0402 QString TextHandler::markdownToHTML(const QString &markdown) 0403 { 0404 const auto str = markdown.toUtf8(); 0405 char *tmp_buf = cmark_markdown_to_html(str.constData(), str.size(), CMARK_OPT_HARDBREAKS | CMARK_OPT_UNSAFE); 0406 0407 const std::string html(tmp_buf); 0408 0409 free(tmp_buf); 0410 0411 auto result = QString::fromStdString(html).trimmed(); 0412 0413 result.replace(QStringLiteral("<!-- raw HTML omitted -->"), QString()); 0414 0415 return result; 0416 } 0417 0418 /** 0419 * TODO: make this more intelligent currently other characters are not escaped 0420 * especially & as this can conflict with the cmark markdown to html conversion 0421 * which already escapes characters in code blocks. The < > still need to be handled 0422 * when the user manually types in the html. 0423 */ 0424 QString TextHandler::escapeHtml(QString stringIn) 0425 { 0426 stringIn.replace(u'<', QStringLiteral("<")); 0427 stringIn.replace(u'>', QStringLiteral(">")); 0428 return stringIn; 0429 } 0430 0431 QString TextHandler::unescapeHtml(QString stringIn) 0432 { 0433 // For those situations where brackets in code block get double escaped 0434 stringIn.replace(QStringLiteral("&lt;"), QStringLiteral("<")); 0435 stringIn.replace(QStringLiteral("&gt;"), QStringLiteral(">")); 0436 stringIn.replace(QStringLiteral("<"), QStringLiteral("<")); 0437 stringIn.replace(QStringLiteral(">"), QStringLiteral(">")); 0438 stringIn.replace(QStringLiteral("&"), QStringLiteral("&")); 0439 stringIn.replace(QStringLiteral("""), QStringLiteral("\"")); 0440 return stringIn; 0441 } 0442 0443 QString TextHandler::linkifyUrls(QString stringIn) 0444 { 0445 QRegularExpressionMatch match; 0446 int start = 0; 0447 for (int index = 0; index != -1; index = stringIn.indexOf(TextRegex::mxId, start, &match)) { 0448 int skip = 0; 0449 if (match.captured(0).size() > 0) { 0450 if (stringIn.left(index).count(QStringLiteral("<code>")) == stringIn.left(index).count(QStringLiteral("</code>"))) { 0451 auto replacement = QStringLiteral("<a href=\"https://matrix.to/#/%1\">%1</a>").arg(match.captured(2)); 0452 stringIn = stringIn.replace(index, match.captured(0).size(), replacement); 0453 } else { 0454 skip = match.captured().length(); 0455 } 0456 } 0457 start = index + skip; 0458 match = {}; 0459 } 0460 start = 0; 0461 match = {}; 0462 for (int index = 0; index != -1; index = stringIn.indexOf(TextRegex::plainUrl, start, &match)) { 0463 int skip = 0; 0464 if (match.captured(0).size() > 0) { 0465 if (stringIn.left(index).count(QStringLiteral("<code>")) == stringIn.left(index).count(QStringLiteral("</code>"))) { 0466 auto replacement = QStringLiteral("<a href=\"%1\">%1</a>").arg(match.captured(1)); 0467 stringIn = stringIn.replace(index, match.captured(0).size(), replacement); 0468 skip = replacement.length(); 0469 } else { 0470 skip = match.captured().length(); 0471 } 0472 } 0473 start = index + skip; 0474 match = {}; 0475 } 0476 start = 0; 0477 match = {}; 0478 for (int index = 0; index != -1; index = stringIn.indexOf(TextRegex::emailAddress, start, &match)) { 0479 int skip = 0; 0480 if (match.captured(0).size() > 0) { 0481 if (stringIn.left(index).count(QStringLiteral("<code>")) == stringIn.left(index).count(QStringLiteral("</code>"))) { 0482 auto replacement = QStringLiteral("<a href=\"mailto:%1\">%1</a>").arg(match.captured(2)); 0483 stringIn = stringIn.replace(index, match.captured(0).size(), replacement); 0484 skip = replacement.length(); 0485 } else { 0486 skip = match.captured().length(); 0487 } 0488 } 0489 start = index + skip; 0490 match = {}; 0491 } 0492 0493 return stringIn; 0494 } 0495 0496 #include "moc_texthandler.cpp"