File indexing completed on 2024-10-06 12:54:08

0001 // SPDX-FileCopyrightText: 2023 James Graham <james.h.graham@protonmail.com>
0002 // SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
0003 
0004 #pragma once
0005 
0006 #include <QHash>
0007 #include <QObject>
0008 #include <QRegularExpression>
0009 #include <QString>
0010 #include <QStringList>
0011 
0012 #include "neochatroom.h"
0013 
0014 namespace TextRegex
0015 {
0016 static const QRegularExpression endTagType{QStringLiteral("(>| )")};
0017 static const QRegularExpression attributeData{QStringLiteral("['\"](.*?)['\"]")};
0018 static const QRegularExpression removeReply{QStringLiteral("> <.*?>.*?\\n\\n"), QRegularExpression::DotMatchesEverythingOption};
0019 static const QRegularExpression removeRichReply{QStringLiteral("<mx-reply>.*?</mx-reply>"), QRegularExpression::DotMatchesEverythingOption};
0020 static const QRegularExpression codePill{QStringLiteral("<pre><code[^>]*>(.*?)</code></pre>"), QRegularExpression::DotMatchesEverythingOption};
0021 static const QRegularExpression userPill{QStringLiteral("(<a href=\"https://matrix.to/#/@.*?:.*?\">.*?</a>)"), QRegularExpression::DotMatchesEverythingOption};
0022 static const QRegularExpression strikethrough{QStringLiteral("<del>(.*?)</del>"), QRegularExpression::DotMatchesEverythingOption};
0023 static const QRegularExpression mxcImage{QStringLiteral(R"AAA(<img(.*?)src="mxc:\/\/(.*?)\/(.*?)"(.*?)>)AAA")};
0024 static const QRegularExpression plainUrl(
0025     QStringLiteral(
0026         R"(<a.*?<\/a>(*SKIP)(*F)|\b((www\.(?!\.)(?!(\w|\.|-)+@)|(https?|ftp):(//)?\w|(magnet|matrix):)(&(?![lg]t;)|[^&\s<>'"])+(&(?![lg]t;)|[^&!,.\s<>'"\]):])))"),
0027     QRegularExpression::CaseInsensitiveOption | QRegularExpression::UseUnicodePropertiesOption);
0028 static const QRegularExpression
0029     url(QStringLiteral(R"(\b((www\.(?!\.)(?!(\w|\.|-)+@)|https?:(//)?\w)(&(?![lg]t;)|[^&\s<>'"])+(&(?![lg]t;)|[^&!,.\s<>'"\]):])))"),
0030         QRegularExpression::CaseInsensitiveOption | QRegularExpression::UseUnicodePropertiesOption);
0031 static const QRegularExpression emailAddress(QStringLiteral(R"(<a.*?<\/a>(*SKIP)(*F)|\b(mailto:)?((\w|\.|-)+@(\w|\.|-)+\.\w+\b))"),
0032                                              QRegularExpression::CaseInsensitiveOption | QRegularExpression::UseUnicodePropertiesOption);
0033 static const QRegularExpression mxId(QStringLiteral(R"((^|[][[:space:](){}`'";])([!#@][-a-z0-9_=#/.]{1,252}:\w(?:\w|\.|-)*\.\w+(?::\d{1,5})?))"),
0034                                      QRegularExpression::CaseInsensitiveOption | QRegularExpression::UseUnicodePropertiesOption);
0035 }
0036 
0037 /**
0038  * @class TextHandler
0039  *
0040  * This class is designed to handle the text of both incoming and outgoing messages.
0041  *
0042  * This includes converting markdown to html and removing any html tags that shouldn't
0043  * be present as per the matrix spec
0044  * (https://spec.matrix.org/v1.5/client-server-api/#mroommessage-msgtypes).
0045  */
0046 class TextHandler : public QObject
0047 {
0048     Q_OBJECT
0049 
0050 public:
0051     /**
0052      * @brief List of token types
0053      */
0054     enum Type {
0055         Text, /*!< Anything not a tag that doesn't have special handling */
0056         Tag, /*!< For any generic tag that doesn't have special handling */
0057         TextCode, /*!< Text between code tags */
0058         End, /*!< End of the input string */
0059     };
0060 
0061     /**
0062      * @brief Get the string being handled.
0063      *
0064      * Setting new data resets the TextHandler.
0065      */
0066     QString data() const;
0067 
0068     /**
0069      * @brief Set the string being handled.
0070      *
0071      * @note The TextHandler doesn't modify the input data variable so the unhandled
0072      *       text can always be retrieved.
0073      */
0074     void setData(const QString &string);
0075 
0076     /**
0077      * @brief Handle the text for a message that is being sent.
0078      */
0079     QString handleSendText();
0080 
0081     /**
0082      * @brief Handle the text as a rich output for a message being received.
0083      *
0084      * The function does the following:
0085      *  - Removes invalid html tags and attributes
0086      *  - Strips any reply from the message
0087      *  - Formats user mentions
0088      *
0089      * @note In this case the rich text refers to the output format. The input
0090      *       can be in either and the parameter inputFormat just needs to be set
0091      *       appropriately.
0092      */
0093     QString handleRecieveRichText(Qt::TextFormat inputFormat = Qt::RichText,
0094                                   const NeoChatRoom *room = nullptr,
0095                                   const Quotient::RoomEvent *event = nullptr,
0096                                   bool stripNewlines = false);
0097 
0098     /**
0099      * @brief Handle the text as a plain output for a message being received.
0100      *
0101      * The function does the following:
0102      *  - Removes all html tags and attributes (except inside of code tags)
0103      *  - Strips any reply from the message
0104      *
0105      * @note In this case the plain text refers to the output format. The input
0106      *       can be in either and the parameter inputFormat just needs to be set
0107      *       appropriately.
0108      *
0109      * @warning The output of this function should NEVER be input into a rich text
0110      *          control. It will try to preserve < and > in the plain string which
0111      *          could be malicious tags if the control uses rich text format.
0112      */
0113     QString handleRecievePlainText(Qt::TextFormat inputFormat = Qt::PlainText, const bool &stripNewlines = false);
0114 
0115     /**
0116      * @brief Return a list of links that can be previewed.
0117      *
0118      * This function is designed to give only links that should be previewed so
0119      * http, https or something starting with www.
0120      */
0121     QList<QUrl> getLinkPreviews();
0122 
0123 private:
0124     QString m_data;
0125 
0126     QString m_dataBuffer;
0127     int m_pos;
0128     Type m_nextTokenType = Text;
0129     QString m_nextToken;
0130 
0131     void next();
0132     void nextTokenType();
0133 
0134     QString getTagType() const;
0135     bool isCloseTag() const;
0136     QString getAttributeType(const QString &string);
0137     QString getAttributeData(const QString &string);
0138     bool isAllowedTag(const QString &type);
0139     bool isAllowedAttribute(const QString &tag, const QString &attribute);
0140     bool isAllowedLink(const QString &link, bool isImg = false);
0141     QString cleanAttributes(const QString &tag, const QString &tagString);
0142 
0143     QString markdownToHTML(const QString &markdown);
0144     QString escapeHtml(QString stringIn);
0145     QString unescapeHtml(QString stringIn);
0146     QString linkifyUrls(QString stringIn);
0147 };