File indexing completed on 2024-10-06 12:27:19

0001 /*
0002     This file is part of the syndication library
0003     SPDX-FileCopyrightText: 2006 Frank Osterfeld <osterfeld@kde.org>
0004 
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0007 
0008 #ifndef SYNDICATION_TOOLS_H
0009 #define SYNDICATION_TOOLS_H
0010 
0011 #include "syndication_export.h"
0012 #include <syndication/person.h>
0013 
0014 #include <QString>
0015 
0016 class QByteArray;
0017 class QString;
0018 
0019 namespace Syndication
0020 {
0021 /** Date formats supported by the date parsers (used as the hint for parseDate()). */
0022 
0023 enum DateFormat {
0024     ISODate, /**< ISO 8601 extended format.
0025               * (date: "2003-12-13", datetime: "2003-12-13T18:30:02.25",
0026               * datetime with timezone: "2003-12-13T18:30:02.25+01:00")
0027               */
0028     RFCDate, /**< RFC 822 format (e.g. "Sat, 07 Sep 2002 00:00:01 GMT"). */
0029 };
0030 
0031 /**
0032  * Parses a date string in ISO 8601 extended format.
0033  * (date: "2003-12-13", datetime: "2003-12-13T18:30:02.25",
0034  * datetime with timezone: "2003-12-13T18:30:02.25+01:00")
0035  *
0036  * @param str a string in ISO 8601 extended format
0037  * @return the parsed date in seconds since the epoch, or 0 if no date
0038  * could be parsed from the string.
0039  */
0040 SYNDICATION_EXPORT
0041 uint parseISODate(const QString &str);
0042 
0043 /**
0044  * Parses a date string as defined in RFC 822
0045  * (e.g. "Sat, 07 Sep 2002 00:00:01 GMT").
0046  *
0047  * @param str a string in RFC 822 format
0048  * @return the parsed date in seconds since the epoch, or 0 if no date
0049  * could be parsed from the string.
0050  */
0051 SYNDICATION_EXPORT
0052 uint parseRFCDate(const QString &str);
0053 
0054 /**
0055  * Parses a date string in ISO (see parseISODate()) or RFC 822 (see
0056  * parseRFCDate()) format.
0057  * It tries both parsers and returns the first valid parsing result found (or 0
0058  * otherwise).
0059  * To speed up parsing, you can give a hint as to which format you expect;
0060  * the corresponding parser is then tried first.
0061  *
0062  * @param str a date string
0063  * @param hint the expected format (defaults to RFCDate)
0064  * @return the parsed date in seconds since the epoch, or 0 if no date
0065  * could be parsed from the string.
0066  */
0067 SYNDICATION_EXPORT
0068 uint parseDate(const QString &str, DateFormat hint = RFCDate);
0069 
0070 /**
0071  * @internal
0072  * Returns a string representation of a datetime.
0073  * This is used internally to create debugging output.
0074  *
0075  * @param date the date to convert (presumably seconds since the epoch, as returned by the parse functions -- TODO confirm)
0076  * @return a string representation of the date, or a null string if
0077  * @c date is 0
0078  */
0079 SYNDICATION_EXPORT
0080 QString dateTimeToString(uint date);
0081 
0082 /**
0083  * Resolves entities to their respective Unicode characters.
0084  * @param str a string that may contain entities
0085  * @return the string with its entities resolved
0086  */
0087 SYNDICATION_EXPORT
0088 QString resolveEntities(const QString &str);
0089 /**
0090  * Replaces the characters &lt;, >, &, " and '
0091  * with &amp;lt;, &amp;gt;, &amp;amp;, &amp;quot; and &amp;apos; respectively.
0092  * @param str the string to escape
0093  */
0094 SYNDICATION_EXPORT
0095 QString escapeSpecialCharacters(const QString &str);
0096 
0097 /**
0098  * Replaces newlines ("\n") with &lt;br/>.
0099  * @param str the string to convert
0100  */
0101 SYNDICATION_EXPORT
0102 QString convertNewlines(const QString &str);
0103 
0104 /**
0105  * Converts a plain text string to HTML.
0106  * @param plainText a string in plain text
0107  * @return the HTML produced from @c plainText
0108  */
0109 SYNDICATION_EXPORT
0110 QString plainTextToHtml(const QString &plainText);
0111 
0112 /**
0113  * Converts an HTML string to plain text.
0114  *
0115  * @param html a string in HTML format
0116  * @return the stripped text
0117  */
0118 SYNDICATION_EXPORT
0119 QString htmlToPlainText(const QString &html);
0120 
0121 /**
0122  * Guesses whether a string contains plain text or HTML.
0123  *
0124  * @param str the string in unknown format
0125  * @return @c true if the heuristic thinks it is HTML, @c false
0126  * if it thinks it is plain text
0127  */
0128 SYNDICATION_EXPORT
0129 bool isHtml(const QString &str);
0130 
0131 /**
0132  * Guesses whether a string contains (HTML) markup or not. This
0133  * is not an exact check for valid HTML markup, but a
0134  * simple (and relatively fast) heuristic.
0135  *
0136  * @param str the string that might or might not contain markup
0137  * @return @c true if the heuristic thinks it contains markup, @c false
0138  * if it thinks it is markup-free plain text
0139  */
0140 SYNDICATION_EXPORT
0141 bool stringContainsMarkup(const QString &str);
0142 
0143 /**
0144  * Ensures HTML formatting for a string.
0145  * Guesses via isHtml() whether @c str contains HTML or plain text, and returns
0146  * plainTextToHtml(str) if it thinks it is plain text, or the unmodified @c str
0147  * otherwise.
0148  *
0149  * @param str a string with unknown content
0150  * @return the string as HTML (as long as the heuristics work)
0151  */
0152 SYNDICATION_EXPORT
0153 QString normalize(const QString &str);
0154 
0155 /**
0156  * Normalizes a string based on feed-wide properties of tag content.
0157  * It is based on the assumption that all items in a feed encode their
0158  * title/description content in the same way (CDATA or not, plain text
0159  * vs. HTML). @c isCDATA and @c containsMarkup are determined once by the feed,
0160  * and then passed to this function.
0161  *
0162  * The returned string contains HTML, with the special characters &lt;, >,
0163  * &, ", and ' escaped, and all other entities resolved.
0164  * Whitespace is collapsed; relevant whitespace is replaced by the respective
0165  * HTML tags (&lt;br/>).
0166  *
0167  * @param str a string
0168  * @param isCDATA whether the feed uses CDATA for the tag @c str was read from
0169  * @param containsMarkup whether the feed uses HTML markup in the
0170  *        tag @c str was read from.
0171  * @return the string as HTML (as long as the heuristics work)
0172  */
0173 SYNDICATION_EXPORT
0174 QString normalize(const QString &str, bool isCDATA, bool containsMarkup);
0175 
0176 /**
0177  * Parses a person object from a string by identifying the name and email
0178  * address in the string. Currently detected variants are:
0179  * "foo@bar.com", "Foo", "Foo &lt;foo@bar.com>", "foo@bar.com (Foo)".
0180  *
0181  * @param str the string to parse the person from
0182  * @return a Person object containing the parsed information
0183  */
0184 SYNDICATION_EXPORT
0185 PersonPtr personFromString(const QString &str);
0186 
0187 /**
0188  * @internal
0189  * Calculates a hash value for the string @c str.
0190  */
0191 unsigned int calcHash(const QString &str);
0192 
0193 /**
0194  * @internal
0195  * Calculates a hash value for the byte array @c array.
0196  */
0197 unsigned int calcHash(const QByteArray &array);
0198 
0199 /**
0200  * @internal
0201  * Calculates an MD5 checksum for the string @c str.
0202  */
0203 QString calcMD5Sum(const QString &str);
0204 
0205 //@cond PRIVATE
0206 /**
0207  * @internal
0208  * Used internally to represent element types.
0209  */
0210 struct ElementType {
0211     ElementType(const QString &localnamep,
0212                 const QString &nsp = QString()); // implicit: callable with a single QString, so a QString converts to ElementType
0213 
0214     bool operator==(const ElementType &other) const;
0215 
0216     QString ns; // presumably the namespace of the element (cf. nsp) -- confirm against the implementation
0217     QString localname; // presumably the local name of the element (cf. localnamep) -- confirm against the implementation
0218 };
0219 //@endcond
0220 
0221 } // namespace Syndication
0222 
0223 #endif // SYNDICATION_TOOLS_H