File indexing completed on 2023-09-24 11:47:44
0001 /* 0002 This file is part of the syndication library 0003 SPDX-FileCopyrightText: 2006 Frank Osterfeld <osterfeld@kde.org> 0004 0005 SPDX-License-Identifier: LGPL-2.0-or-later 0006 */ 0007 0008 #include "tools.h" 0009 #include "personimpl.h" 0010 0011 #include <KCharsets> 0012 0013 #include <QByteArray> 0014 #include <QCryptographicHash> 0015 #include <QDateTime> 0016 #include <QRegularExpression> 0017 0018 #include <ctime> 0019 0020 namespace Syndication 0021 { 0022 QCryptographicHash md5Machine(QCryptographicHash::Md5); 0023 0024 unsigned int calcHash(const QString &str) 0025 { 0026 return calcHash(str.toUtf8()); 0027 } 0028 0029 unsigned int calcHash(const QByteArray &array) 0030 { 0031 if (array.isEmpty()) { 0032 return 0; 0033 } else { 0034 const char *s = array.data(); 0035 unsigned int hash = 5381; 0036 int c; 0037 while ((c = *s++)) { 0038 hash = ((hash << 5) + hash) + c; // hash*33 + c 0039 } 0040 return hash; 0041 } 0042 } 0043 0044 static uint toTimeT(QDateTime &kdt) 0045 { 0046 if (kdt.isValid()) { 0047 // work around unspecified timezones/date-only timestamps by setting the time to 12:00 UTC 0048 if (kdt.time().isNull() // 0049 || (kdt.time() == QTime(0, 0) && kdt.timeSpec() == Qt::LocalTime)) { 0050 kdt.setTimeSpec(Qt::UTC); 0051 kdt.setTime(QTime(12, 0)); 0052 } 0053 return kdt.toMSecsSinceEpoch() / 1000; 0054 } else { 0055 return 0; 0056 } 0057 } 0058 0059 uint parseISODate(const QString &str) 0060 { 0061 QDateTime kdt = QDateTime::fromString(str, Qt::ISODate); 0062 return toTimeT(kdt); 0063 } 0064 0065 uint parseRFCDate(const QString &str) 0066 { 0067 QDateTime kdt = QDateTime::fromString(str, Qt::RFC2822Date); 0068 // Qt5 used to ignore invalid textual offsets but Qt6 rejects those, so handle that explictly 0069 if (!kdt.isValid() && str.endsWith(QLatin1String(" GMT"))) { 0070 kdt = QDateTime::fromString(QStringView(str).chopped(4), Qt::RFC2822Date); 0071 } 0072 return toTimeT(kdt); 0073 } 0074 0075 uint parseDate(const QString &str, DateFormat hint) 0076 { 0077 if (str.isEmpty()) { 0078 return 0; 0079 } 0080 0081 if (hint == RFCDate) { 0082 time_t t = parseRFCDate(str); 0083 return t != 0 ? t : parseISODate(str); 0084 } else { 0085 time_t t = parseISODate(str); 0086 return t != 0 ? t : parseRFCDate(str); 0087 } 0088 } 0089 0090 QString dateTimeToString(uint date) 0091 { 0092 if (date == 0) { 0093 return QString(); 0094 } 0095 0096 const QString format = QStringLiteral("ddd MMM d HH:mm:ss yyyy"); 0097 QDateTime dt; 0098 dt.setMSecsSinceEpoch(quint64(date) * 1000); 0099 return dt.toUTC().toString(format); 0100 } 0101 0102 QString calcMD5Sum(const QString &str) 0103 { 0104 md5Machine.reset(); 0105 md5Machine.addData(str.toUtf8()); 0106 return QLatin1String(md5Machine.result().toHex().constData()); 0107 } 0108 0109 QString resolveEntities(const QString &str) 0110 { 0111 return KCharsets::resolveEntities(str); 0112 } 0113 0114 QString escapeSpecialCharacters(const QString &strp) 0115 { 0116 QString str(strp); 0117 str.replace(QLatin1Char('&'), QLatin1String("&")); 0118 str.replace(QLatin1Char('\"'), QLatin1String(""")); 0119 str.replace(QLatin1Char('<'), QLatin1String("<")); 0120 str.replace(QLatin1Char('>'), QLatin1String(">")); 0121 str.replace(QLatin1Char('\''), QLatin1String("'")); 0122 return str.trimmed(); 0123 } 0124 0125 QString convertNewlines(const QString &strp) 0126 { 0127 QString str(strp); 0128 str.replace(QLatin1Char('\n'), QLatin1String("<br/>")); 0129 return str; 0130 } 0131 0132 QString plainTextToHtml(const QString &plainText) 0133 { 0134 QString str(plainText); 0135 str.replace(QLatin1Char('&'), QLatin1String("&")); 0136 str.replace(QLatin1Char('\"'), QLatin1String(""")); 0137 str.replace(QLatin1Char('<'), QLatin1String("<")); 0138 // str.replace(QLatin1Char('>'), QLatin1String(">")); 0139 str.replace(QLatin1Char('\n'), QLatin1String("<br/>")); 0140 return str.trimmed(); 0141 } 0142 0143 QString htmlToPlainText(const QString &html) 0144 { 0145 QString str(html); 0146 // TODO: preserve some formatting, such as line breaks 0147 str.remove(QRegularExpression(QStringLiteral("<[^>]*?>"))); // remove tags 0148 str = resolveEntities(str); 0149 return str.trimmed(); 0150 } 0151 0152 static QRegularExpression tagRegExp() 0153 { 0154 static QRegularExpression exp(QStringLiteral("<\\w+.*/?>")); 0155 return exp; 0156 } 0157 0158 bool stringContainsMarkup(const QString &str) 0159 { 0160 // check for entities 0161 if (str.contains(QRegularExpression(QStringLiteral("&[a-zA-Z0-9#]+;")))) { 0162 return true; 0163 } 0164 0165 const int ltc = str.count(QLatin1Char('<')); 0166 if (ltc == 0) { 0167 return false; 0168 } 0169 0170 return str.contains(tagRegExp()); 0171 } 0172 0173 bool isHtml(const QString &str) 0174 { 0175 // check for entities 0176 if (str.contains(QRegularExpression(QStringLiteral("&[a-zA-Z0-9#]+;")))) { 0177 return true; 0178 } 0179 0180 const int ltc = str.count(QLatin1Char('<')); 0181 if (ltc == 0) { 0182 return false; 0183 } 0184 0185 return str.contains(tagRegExp()); 0186 } 0187 0188 QString normalize(const QString &str) 0189 { 0190 return isHtml(str) ? str.trimmed() : plainTextToHtml(str); 0191 } 0192 0193 QString normalize(const QString &strp, bool isCDATA, bool containsMarkup) 0194 { 0195 if (containsMarkup) { 0196 return strp.trimmed(); 0197 } else { 0198 if (isCDATA) { 0199 QString str = resolveEntities(strp); 0200 str = escapeSpecialCharacters(str); 0201 str = convertNewlines(str); 0202 str = str.trimmed(); 0203 return str; 0204 } else { 0205 QString str = escapeSpecialCharacters(strp); 0206 str = str.trimmed(); 0207 return str; 0208 } 0209 } 0210 } 0211 0212 PersonPtr personFromString(const QString &strp) 0213 { 0214 QString str = strp.trimmed(); 0215 if (str.isEmpty()) { 0216 return PersonPtr(new PersonImpl()); 0217 } 0218 0219 str = resolveEntities(str); 0220 QString name; 0221 QString uri; 0222 QString email; 0223 0224 // look for something looking like a mail address ("foo@bar.com", 0225 // "<foo@bar.com>") and extract it 0226 0227 const QRegularExpression remail(QStringLiteral("<?([^@\\s<]+@[^>\\s]+)>?")); // FIXME: user "proper" regexp, 0228 // search kmail source for it 0229 0230 QRegularExpressionMatch match = remail.match(str); 0231 if (match.hasMatch()) { 0232 const QString all = match.captured(0); 0233 email = match.captured(1); 0234 str.remove(all); // remove mail address 0235 } 0236 0237 // replace "mailto", "(", ")" (to be extended) 0238 email.remove(QStringLiteral("mailto:")); 0239 email.remove(QRegularExpression(QStringLiteral("[()]"))); 0240 0241 // simplify the rest and use it as name 0242 0243 name = str.simplified(); 0244 0245 // after removing the email, str might have 0246 // the format "(Foo M. Bar)". We cut off 0247 // parentheses if there are any. However, if 0248 // str is of the format "Foo M. Bar (President)", 0249 // we should not cut anything. 0250 0251 QRegularExpression rename(QRegularExpression::anchoredPattern(QStringLiteral("^\\(([^)]*)\\)"))); 0252 match = rename.match(name); 0253 if (match.hasMatch()) { 0254 name = match.captured(1); 0255 } 0256 0257 name = name.isEmpty() ? QString() : name; 0258 email = email.isEmpty() ? QString() : email; 0259 uri = uri.isEmpty() ? QString() : uri; 0260 0261 if (name.isEmpty() && email.isEmpty() && uri.isEmpty()) { 0262 return PersonPtr(new PersonImpl()); 0263 } 0264 0265 return PersonPtr(new PersonImpl(name, uri, email)); 0266 } 0267 0268 ElementType::ElementType(const QString &localnamep, const QString &nsp) 0269 : ns(nsp) 0270 , localname(localnamep) 0271 { 0272 } 0273 0274 bool ElementType::operator==(const ElementType &other) const 0275 { 0276 return localname == other.localname && ns == other.ns; 0277 } 0278 0279 } // namespace Syndication