kmime/src/kmime_util_p.cpp

0001 /*
0002   KMime, the KDE Internet mail/usenet news message library.
0003   SPDX-FileCopyrightText: 2001 the KMime authors.
0004   See file AUTHORS for details
0005
0006   SPDX-License-Identifier: LGPL-2.0-or-later
0007 */
0008
#include <config-kmime.h>

#include "kmime_util_p.h"
#include "kmime_debug.h"

#include <QByteArray>
#include <QChar>
#include <QString>

#include <algorithm>
#include <cctype>
#include <cstring>
#include <iterator>
0019
0020 using namespace KMime;
0021
0022 int KMime::findHeaderLineEnd(QByteArrayView src, int &dataBegin, bool *folded)
0023 {
0024     int end = dataBegin;
0025     int len = src.length() - 1;
0026
0027     if (folded) {
0028         *folded = false;
0029     }
0030
0031     if (dataBegin < 0) {
0032         // Not found
0033         return -1;
0034     }
0035
0036     if (dataBegin > len) {
0037         // No data available
0038         return len + 1;
0039     }
0040
0041     // If the first line contains nothing, but the next line starts with a space
0042     // or a tab, that means a stupid mail client has made the first header field line
0043     // entirely empty, and has folded the rest to the next line(s).
0044     if (src.at(end) == '\n' && end + 1 < len &&
0045             (src[end + 1] == ' ' || src[end + 1] == '\t')) {
0046
0047         // Skip \n and first whitespace
0048         dataBegin += 2;
0049         end += 2;
0050     }
0051
0052     if (src.at(end) != '\n') {      // check if the header is not empty
0053         while (true) {
0054             end = src.indexOf('\n', end + 1);
0055             if (end == -1 || end == len) {
0056                 // end of string
0057                 break;
0058             } else if (src[end + 1] == ' ' || src[end + 1] == '\t' ||
0059                        (src[end + 1] == '=' && end + 3 <= len &&
0060                         ((src[end + 2] == '0' && src[end + 3] == '9') ||
0061                          (src[end + 2] == '2' && src[end + 3] == '0')))) {
0062                 // next line is header continuation or starts with =09/=20 (bug #86302)
0063                 if (folded) {
0064                     *folded = true;
0065                 }
0066             } else {
0067                 // end of header (no header continuation)
0068                 break;
0069             }
0070         }
0071     }
0072
0073     if (end < 0) {
0074         end = len + 1; //take the rest of the string
0075     }
0076     return end;
0077 }
0078
#if !HAVE_STRCASESTR
#ifdef WIN32
#define strncasecmp _strnicmp
#endif
/**
 * Fallback for platforms without strcasestr(): case-insensitive search for
 * the NUL-terminated string @p needle inside the NUL-terminated string
 * @p haystack.
 *
 * @return pointer to the first match inside @p haystack, @p haystack itself
 *         when @p needle is empty, or nullptr when there is no match.
 */
static const char *strcasestr(const char *haystack, const char *needle)
{
    /* Based on libreplace as shipped with qtwebengine 5.5.1 */
    const size_t nlen = strlen(needle);
    if (nlen == 0) {
        // an empty needle matches at the very beginning
        return haystack;
    }

    // toupper() is only defined for values representable as unsigned char
    // (or EOF), so plain char must be converted before it is passed in.
    const int first = toupper(static_cast<unsigned char>(*needle));
    for (const char *s = haystack; *s; ++s) {
        // cheap first-character test before the full comparison
        if (toupper(static_cast<unsigned char>(*s)) == first && strncasecmp(s, needle, nlen) == 0) {
            return s;
        }
    }
    return nullptr;
}
#endif
0096
0097 int KMime::indexOfHeader(const QByteArray &src, const QByteArray &name, int &end, int &dataBegin, bool *folded)
0098 {
0099     QByteArray n = name;
0100     n.append(':');
0101     int begin = -1;
0102
0103     if (qstrnicmp(n.constData(), src.constData(), n.length()) == 0) {
0104         begin = 0;
0105     } else {
0106         n.prepend('\n');
0107         const char *p = strcasestr(src.constData(), n.constData());
0108         if (!p) {
0109             begin = -1;
0110         } else {
0111             begin = p - src.constData();
0112             ++begin;
0113         }
0114     }
0115
0116     if (begin > -1) {       //there is a header with the given name
0117         dataBegin = begin + name.length() + 1; //skip the name
0118         // skip the usual space after the colon
0119         if (dataBegin < src.length() && src.at(dataBegin) == ' ') {
0120             ++dataBegin;
0121         }
0122         end = findHeaderLineEnd(src, dataBegin, folded);
0123         return begin;
0124
0125     } else {
0126         end = -1;
0127         dataBegin = -1;
0128         return -1; //header not found
0129     }
0130 }
0131
0132 QByteArray KMime::extractHeader(const QByteArray &src, const QByteArray &name)
0133 {
0134     int begin;
0135     int end;
0136     bool folded;
0137     QByteArray result;
0138
0139     if (src.isEmpty() || indexOfHeader(src, name, end, begin, &folded) < 0) {
0140         return result;
0141     }
0142
0143     if (begin >= 0) {
0144         if (!folded) {
0145             result = src.mid(begin, end - begin);
0146         } else {
0147             if (end > begin) {
0148                 result = unfoldHeader(src.constData() + begin, end - begin);
0149             }
0150         }
0151     }
0152     return result;
0153 }
0154
0155 QByteArray KMime::unfoldHeader(const char *header, size_t headerSize)
0156 {
0157     QByteArray result;
0158     if (headerSize == 0) {
0159         return result;
0160     }
0161
0162     // unfolding skips characters so result will be at worst headerSize long
0163     result.reserve(headerSize);
0164
0165     const char *end = header + headerSize;
0166     const char *pos = header;
0167     const char *foldBegin = nullptr;
0168     const char *foldMid = nullptr;
0169     const char *foldEnd = nullptr;
0170     while ((foldMid = strchr(pos, '\n')) && foldMid < end) {
0171         foldBegin = foldEnd = foldMid;
0172         // find the first space before the line-break
0173         while (foldBegin > header) {
0174             if (!QChar::isSpace(*(foldBegin - 1))) {
0175                 break;
0176             }
0177             --foldBegin;
0178         }
0179         // find the first non-space after the line-break
0180         while (foldEnd <= end - 1) {
0181             if (QChar::isSpace(*foldEnd)) {
0182                 ++foldEnd;
0183             } else if (foldEnd && *(foldEnd - 1) == '\n' &&
0184                        *foldEnd == '=' && foldEnd + 2 < (header + headerSize - 1) &&
0185                        ((*(foldEnd + 1) == '0' &&
0186                          *(foldEnd + 2) == '9') ||
0187                         (*(foldEnd + 1) == '2' &&
0188                          *(foldEnd + 2) == '0'))) {
0189                 // bug #86302: malformed header continuation starting with =09/=20
0190                 foldEnd += 3;
0191             } else {
0192                 break;
0193             }
0194         }
0195
0196         result.append(pos, foldBegin - pos);
0197         if (foldBegin != pos && foldEnd < end - 1) {
0198             result += ' ';
0199         }
0200         pos = foldEnd;
0201     }
0202     if (end > pos) {
0203         result.append(pos, end - pos);
0204     }
0205     return result;
0206 }
0207
0208 QByteArray KMime::unfoldHeader(const QByteArray &header)
0209 {
0210     return unfoldHeader(header.constData(), header.size());
0211 }
0212
namespace {
// Tiny state machine used by foldHeader(). It is fed one character at a time
// and remembers whether the most recent character opened a backslash escape
// pair and whether the current position is inside a double-quoted string.
struct HeaderContext {
    unsigned int isEscapePair : 1;
    unsigned int isQuotedStr : 1;

    HeaderContext()
        : isEscapePair(0)
        , isQuotedStr(0)
    {
    }

    void push(char c) {
        if (isEscapePair) {
            // whatever follows a backslash completes the escape pair
            isEscapePair = 0;
        } else if (c == '\"') {
            // an unescaped quote opens or closes a quoted string
            isQuotedStr = !isQuotedStr;
        } else if (c == '\\') {
            // a backslash opens an escape pair
            isEscapePair = 1;
        }
    }
};
}
0232
0233 QByteArray KMime::foldHeader(const QByteArray &header)
0234 {
0235     // RFC 5322 section 2.1.1. "Line Length Limits" says:
0236     //
0237     // "Each line of characters MUST be no more than 998 characters, and
0238     //  SHOULD be no more than 78 characters, excluding the CRLF."
0239     const int maxLen = 78;
0240
0241     if (header.length() <= maxLen) {
0242         return header;
0243     }
0244
0245     // fast forward to header body
0246     int pos = header.indexOf(':') + 1;
0247     if (pos <= 0 || pos >= header.length()) {
0248         return header;
0249     }
0250
0251     // prepare for mutating header
0252     QByteArray hdr = header;
0253
0254     // There are positions that are eligible for inserting FWS but discouraged
0255     // (e.g. existing white space within a quoted string), and there are
0256     // positions which are recommended for inserting FWS (e.g. after comma
0257     // separator of an address list).
0258     int eligible = pos;
0259     int recommended = pos;
0260
0261     // reflects start position of "current line" in byte array
0262     int start = 0;
0263
0264     HeaderContext ctx;
0265
0266     for (; true; ++pos) {
0267         if (pos - start > maxLen && eligible) {
0268             // Fold line preferably at recommended position, at eligible position
0269             // otherwise.
0270             const int fws = recommended ? recommended : eligible;
0271             hdr.insert(fws, '\n');
0272             // We started a new line, so reset.
0273             if (eligible <= fws) {
0274                 eligible = 0;
0275             } else {
0276                 ++eligible; // LF
0277             }
0278             recommended = 0;
0279             start = fws + 1/* LF */;
0280             continue;
0281         }
0282
0283         if (pos >= hdr.length()) {
0284             break;
0285         }
0286
0287         // account for already inserted FWS
0288         // (NOTE: we are not caring about broken ones here)
0289         if (hdr[pos] == '\n') {
0290             recommended = eligible = 0;
0291             start = pos + 1/* LF */;
0292         }
0293
0294         // Any white space character position is eligible for folding, except of
0295         // escape pair (i.e. BSP WSP must not be folded).
0296         if (hdr[pos] == ' ' && !ctx.isEscapePair && hdr[pos - 1] != '\n') {
0297             eligible = pos;
0298             if ((hdr[pos - 1] == ',' || hdr[pos - 1] == ';') && !ctx.isQuotedStr) {
0299                 recommended = pos;
0300             }
0301         }
0302
0303         ctx.push(hdr[pos]);
0304     }
0305
0306     return hdr;
0307 }
0308
namespace
{
/**
 * Removes quoted-string syntax from @p str in place.
 *
 * - every double quote that is not escaped inside a quoted section is
 *   dropped and toggles the "inside quotes" state;
 * - inside quotes a backslash is dropped and the character following it is
 *   kept verbatim, whatever it is;
 * - a backslash at the very end of a quoted section is dropped;
 * - everything else is kept.
 *
 * The string is compacted in a single pass instead of erasing one character
 * at a time, which keeps the running time linear in the length of @p str.
 *
 * @tparam StringType string class providing length(), at(), operator[] and
 *                    resize()
 * @tparam CharType   type used to build the characters compared against the
 *                    elements of @p str
 */
template < typename StringType, typename CharType > void removeQuotesGeneric(StringType &str)
{
    using Index = decltype(str.length());

    const auto quote = CharType('"');
    const auto backslash = CharType('\\');
    const Index len = str.length();

    Index out = 0; // next write position
    bool inQuote = false;
    for (Index in = 0; in < len; ++in) {
        auto c = str.at(in);
        if (c == quote) {
            inQuote = !inQuote;
            continue;
        }
        if (inQuote && c == backslash) {
            if (++in == len) {
                // dangling backslash at the end: nothing left to keep
                break;
            }
            // the escaped character is taken over without looking at it
            c = str.at(in);
        }
        // only write once something has been dropped, so that a string
        // without any quoting is never modified
        if (out != in) {
            str[out] = c;
        }
        ++out;
    }

    if (out != len) {
        str.resize(out);
    }
}
}
0327
0328 void KMime::removeQuotes(QByteArray &str)
0329 {
0330     removeQuotesGeneric<QByteArray, char>(str);
0331 }
0332
0333 void KMime::removeQuotes(QString &str)
0334 {
0335     removeQuotesGeneric<QString, QLatin1Char>(str);
0336 }
0337
namespace {
/**
 * Escapes and, if necessary, quotes @p str in place.
 *
 * Every backslash and every double quote gets a backslash put in front of
 * it. The result is wrapped in double quotes when @p forceQuotes is set or
 * when @p str contains at least one of the reserved characters
 * " ( ) , . : ; < = > @ [ \ ]
 *
 * @tparam StringType        string class providing length(), at(),
 *                           insert(index, character) and append(character)
 * @tparam CharConverterType type used to turn a plain char into something
 *                           comparable with the elements of @p str
 */
template<class StringType, class CharConverterType>
void addQuotes_impl(StringType &str, bool forceQuotes)
{
    // Must stay sorted by byte value for std::lower_bound(). The table is
    // spelled as a brace list rather than a string literal on purpose: a
    // literal would put its terminating NUL at the end of the searched range
    // and thereby break the sort order the binary search relies on.
    constexpr char reservedCharacters[] = {'"', '(', ')', ',', '.', ':', ';', '<', '=', '>', '@', '[', '\\', ']'};

    bool needsQuotes = false;
    for (decltype(str.length()) i = 0; i < str.length(); i++) {
        const auto cur = str.at(i);
        const auto it = std::lower_bound(std::begin(reservedCharacters), std::end(reservedCharacters), cur, [](char lhs, auto rhs) {
            return CharConverterType(lhs) < rhs;
        });
        if (it != std::end(reservedCharacters) && CharConverterType(*it) == cur) {
            needsQuotes = true;
        }
        if (cur == CharConverterType('\\') || cur == CharConverterType('\"')) {
            str.insert(i, CharConverterType('\\'));
            // step over the backslash just inserted; the loop increment then
            // steps over the escaped character itself
            i++;
        }
    }

    if (needsQuotes || forceQuotes) {
        str.insert(0, CharConverterType('\"'));
        str.append(CharConverterType('\"'));
    }
}
}
0365
0366 void KMime::addQuotes(QByteArray &str, bool forceQuotes)
0367 {
0368     addQuotes_impl<QByteArray, char>(str, forceQuotes);
0369 }
0370
0371 void KMime::addQuotes(QString &str, bool forceQuotes)
0372 {
0373     addQuotes_impl<QString, QLatin1Char>(str, forceQuotes);
0374 }
0375
0376 QString KMime::balanceBidiState(const QString &input)
0377 {
0378     const int LRO = 0x202D;
0379     const int RLO = 0x202E;
0380     const int LRE = 0x202A;
0381     const int RLE = 0x202B;
0382     const int PDF = 0x202C;
0383
0384     QString result = input;
0385
0386     int openDirChangers = 0;
0387     int numPDFsRemoved = 0;
0388     for (int i = 0; i < input.length(); i++) {
0389         const ushort &code = input.at(i).unicode();
0390         if (code == LRO || code == RLO || code == LRE || code == RLE) {
0391             openDirChangers++;
0392         } else if (code == PDF) {
0393             if (openDirChangers > 0) {
0394                 openDirChangers--;
0395             } else {
0396                 // One PDF too much, remove it
0397                 qCWarning(KMIME_LOG) << "Possible Unicode spoofing (unexpected PDF) detected in" << input;
0398                 result.remove(i - numPDFsRemoved, 1);
0399                 numPDFsRemoved++;
0400             }
0401         }
0402     }
0403
0404     if (openDirChangers > 0) {
0405         qCWarning(KMIME_LOG) << "Possible Unicode spoofing detected in" << input;
0406
0407         // At PDF chars to the end until the correct state is restored.
0408         // As a special exception, when encountering quoted strings, place the PDF before
0409         // the last quote.
0410         for (int i = openDirChangers; i > 0; i--) {
0411             if (result.endsWith(QLatin1Char('"'))) {
0412                 result.insert(result.length() - 1, QChar(PDF));
0413             } else {
0414                 result += QChar(PDF);
0415             }
0416         }
0417     }
0418
0419     return result;
0420 }
0421
0422 QString KMime::removeBidiControlChars(const QString &input)
0423 {
0424     const int LRO = 0x202D;
0425     const int RLO = 0x202E;
0426     const int LRE = 0x202A;
0427     const int RLE = 0x202B;
0428     QString result = input;
0429     result.remove(QChar(LRO));
0430     result.remove(QChar(RLO));
0431     result.remove(QChar(LRE));
0432     result.remove(QChar(RLE));
0433     return result;
0434 }