kmime/src/kmime_header_parsing.cpp

0001 /*  -*- c++ -*-
0002     kmime_header_parsing.cpp
0003
0004     KMime, the KDE Internet mail/usenet news message library.
0005     SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org>
0006
0007     SPDX-License-Identifier: LGPL-2.0-or-later
0008 */
0009
0010 #include "kmime_header_parsing.h"
0011
0012 #include "kmime_headerfactory_p.h"
0013 #include "kmime_headers.h"
0014 #include "kmime_util.h"
0015 #include "kmime_util_p.h"
0016 #include "kmime_codecs_p.h"
0017 #include "kmime_dateformatter.h"
0018 #include "kmime_debug.h"
0019 #include "kmime_warning_p.h"
0020
0021 #include <KCodecs>
0022
0023 #include <QMap>
0024 #include <QStringDecoder>
0025 #include <QTimeZone>
0026
0027 #include <cassert>
0028 #include <cctype> // for isdigit
0029
0030 using namespace KMime;
0031 using namespace KMime::Types;
0032
0033 namespace KMime
0034 {
0035
0036     namespace Types
0037     {
0038         // Optimization to avoid allocating QStrings when the value isn't encoded
0039         struct KMIME_EXPORT QStringOrQPair {
0040             QStringOrQPair() : qstring(), qpair(nullptr, 0) {}
0041             QString qstring;
0042             QPair<const char *, int> qpair;
0043         };
0044     } // namespace Types
0045
0046 namespace HeaderParsing
0047 {
0048
0049 // parse the encoded-word (scursor points to after the initial '=')
0050 bool parseEncodedWord(const char *&scursor, const char *const send,
0051                       QString &result, QByteArray &language,
0052                       QByteArray &usedCS, const QByteArray &defaultCS)
0053 {
0054     // make sure the caller already did a bit of the work.
0055     assert(*(scursor - 1) == '=');
0056
0057     //
0058     // STEP 1:
0059     // scan for the charset/language portion of the encoded-word
0060     //
0061
0062     char ch = *scursor++;
0063
0064     if (ch != '?') {
0065         // qCDebug(KMIME_LOG) << "first";
0066         //KMIME_WARN_PREMATURE_END_OF( EncodedWord );
0067         return false;
0068     }
0069
0070     // remember start of charset (ie. just after the initial "=?") and
0071     // language (just after the first '*') fields:
0072     const char *charsetStart = scursor;
0073     const char *languageStart = nullptr;
0074
0075     // find delimiting '?' (and the '*' separating charset and language
0076     // tags, if any):
0077     for (; scursor != send ; scursor++) {
0078         if (*scursor == '?') {
0079             break;
0080         } else if (*scursor == '*' && languageStart == nullptr) {
0081             languageStart = scursor + 1;
0082         }
0083     }
0084
0085     // not found? can't be an encoded-word!
0086     if (scursor == send || *scursor != '?') {
0087         // qCDebug(KMIME_LOG) << "second";
0088         KMIME_WARN_PREMATURE_END_OF(EncodedWord);
0089         return false;
0090     }
0091
0092     // extract the language information, if any (if languageStart is 0,
0093     // language will be null, too):
0094     QByteArray maybeLanguage(languageStart, scursor - languageStart);
0095     // extract charset information (keep in mind: the size given to the
0096     // ctor is one off due to the \0 terminator):
0097     QByteArray maybeCharset(charsetStart,
0098                             (languageStart ? languageStart - 1 : scursor) - charsetStart);
0099
0100     //
0101     // STEP 2:
0102     // scan for the encoding portion of the encoded-word
0103     //
0104
0105     // remember start of encoding (just _after_ the second '?'):
0106     scursor++;
0107     const char *encodingStart = scursor;
0108
0109     // find next '?' (ending the encoding tag):
0110     for (; scursor != send ; scursor++) {
0111         if (*scursor == '?') {
0112             break;
0113         }
0114     }
0115
0116     // not found? Can't be an encoded-word!
0117     if (scursor == send || *scursor != '?') {
0118         // qCDebug(KMIME_LOG) << "third";
0119         KMIME_WARN_PREMATURE_END_OF(EncodedWord);
0120         return false;
0121     }
0122
0123     // extract the encoding information:
0124     QByteArray maybeEncoding(encodingStart, scursor - encodingStart);
0125
0126     // qCDebug(KMIME_LOG) << "parseEncodedWord: found charset == \"" << maybeCharset
0127     //         << "\"; language == \"" << maybeLanguage
0128     //         << "\"; encoding == \"" << maybeEncoding << "\"";
0129
0130     //
0131     // STEP 3:
0132     // scan for encoded-text portion of encoded-word
0133     //
0134
0135     // remember start of encoded-text (just after the third '?'):
0136     scursor++;
0137     const char *encodedTextStart = scursor;
0138
0139     // find the '?=' sequence (ending the encoded-text):
0140     for (; scursor != send ; scursor++) {
0141         if (*scursor == '?') {
0142             if (scursor + 1 != send) {
0143                 if (*(scursor + 1) != '=') {     // We expect a '=' after the '?', but we got something else; ignore
0144                     KMIME_WARN << "Stray '?' in q-encoded word, ignoring this.";
0145                     continue;
0146                 } else { // yep, found a '?=' sequence
0147                     scursor += 2;
0148                     break;
0149                 }
0150             } else { // The '?' is the last char, but we need a '=' after it!
0151                 KMIME_WARN_PREMATURE_END_OF(EncodedWord);
0152                 return false;
0153             }
0154         }
0155     }
0156
0157     if (*(scursor - 2) != '?' || *(scursor - 1) != '=' ||
0158             scursor < encodedTextStart + 2) {
0159         KMIME_WARN_PREMATURE_END_OF(EncodedWord);
0160         return false;
0161     }
0162
0163     // set end sentinel for encoded-text:
0164     const char *const encodedTextEnd = scursor - 2;
0165
0166     //
0167     // STEP 4:
0168     // setup decoders for the transfer encoding and the charset
0169     //
0170
0171     // try if there's a codec for the encoding found:
0172     KCodecs::Codec *codec = KCodecs::Codec::codecForName(maybeEncoding);
0173     if (!codec) {
0174         KMIME_WARN_UNKNOWN(Encoding, maybeEncoding);
0175         return false;
0176     }
0177
0178     // get an instance of a corresponding decoder:
0179     KCodecs::Decoder *dec = codec->makeDecoder();
0180     assert(dec);
0181
0182     // try if there's a (text)codec for the charset found:
0183     QStringDecoder textCodec;
0184     if (maybeCharset.isEmpty()) {
0185         textCodec = QStringDecoder(defaultCS.constData());
0186         if (!textCodec.isValid()) {
0187             textCodec = QStringDecoder(QStringDecoder::Latin1);
0188         }
0189         usedCS = cachedCharset(defaultCS);
0190     } else {
0191         textCodec = QStringDecoder(maybeCharset.constData());
0192         if (textCodec.isValid()) {    //no suitable codec found => use default charset
0193             usedCS = cachedCharset(defaultCS);
0194         } else {
0195             textCodec = QStringDecoder(QStringDecoder::Latin1);
0196             usedCS = cachedCharset(maybeCharset);
0197         }
0198     }
0199
0200     if (!textCodec.isValid()) {
0201         KMIME_WARN_UNKNOWN(Charset, maybeCharset);
0202         delete dec;
0203         return false;
0204     };
0205
0206     // qCDebug(KMIME_LOG) << "mimeName(): \"" << textCodec->name() << "\"";
0207
0208     // allocate a temporary buffer to store the 8bit text:
0209     int encodedTextLength = encodedTextEnd - encodedTextStart;
0210     QByteArray buffer;
0211     buffer.resize(codec->maxDecodedSizeFor(encodedTextLength));
0212     char *bbegin = buffer.data();
0213     char *bend = bbegin + buffer.length();
0214
0215     //
0216     // STEP 5:
0217     // do the actual decoding
0218     //
0219
0220     if (!dec->decode(encodedTextStart, encodedTextEnd, bbegin, bend)) {
0221         KMIME_WARN << codec->name() << "codec lies about its maxDecodedSizeFor("
0222                    << encodedTextLength << ")\nresult may be truncated";
0223     }
0224
0225     result = textCodec.decode(QByteArrayView(buffer.data(), bbegin - buffer.data()));
0226
0227     // qCDebug(KMIME_LOG) << "result now: \"" << result << "\"";
0228     // cleanup:
0229     delete dec;
0230     language = maybeLanguage;
0231
0232     return true;
0233 }
0234
// Advances scursor past any run of space, tab, CR and LF characters,
// stopping at the first other character or at send.
static inline void eatWhiteSpace(const char *&scursor, const char *const send)
{
    for (; scursor != send; ++scursor) {
        switch (*scursor) {
        case ' ':
        case '\t':
        case '\r':
        case '\n':
            continue; // still whitespace: advance via the for-increment
        default:
            return;   // first non-whitespace character: leave cursor on it
        }
    }
}
0243
0244 bool parseAtom(const char*&scursor, const char *const send,
0245                QByteArray &result, bool allow8Bit)
0246 {
0247     QPair<const char *, int> maybeResult;
0248
0249     if (parseAtom(scursor, send, maybeResult, allow8Bit)) {
0250         result = QByteArray(maybeResult.first, maybeResult.second);
0251         return true;
0252     }
0253
0254     return false;
0255 }
0256
0257 bool parseAtom(const char*&scursor, const char *const send,
0258                QPair<const char *, int> &result, bool allow8Bit)
0259 {
0260     bool success = false;
0261     const char *start = scursor;
0262
0263     while (scursor != send) {
0264         signed char ch = *scursor++;
0265         if (ch > 0 && isAText(ch)) {
0266             // AText: OK
0267             success = true;
0268         } else if (allow8Bit && ch < 0) {
0269             // 8bit char: not OK, but be tolerant.
0270             KMIME_WARN_8BIT(ch);
0271             success = true;
0272         } else {
0273             // CTL or special - marking the end of the atom:
0274             // re-set sursor to point to the offending
0275             // char and return:
0276             scursor--;
0277             break;
0278         }
0279     }
0280     result.first = start;
0281     result.second = scursor - start;
0282     return success;
0283 }
0284
0285 bool parseToken(const char*&scursor, const char *const send,
0286                 QByteArray &result, ParseTokenFlags flags)
0287 {
0288     QPair<const char *, int> maybeResult;
0289
0290     if (parseToken(scursor, send, maybeResult, flags)) {
0291         result = QByteArray(maybeResult.first, maybeResult.second);
0292         return true;
0293     }
0294
0295     return false;
0296 }
0297
0298 bool parseToken(const char*&scursor, const char *const send,
0299                 QPair<const char *, int> &result, ParseTokenFlags flags)
0300 {
0301     bool success = false;
0302     const char *start = scursor;
0303
0304     while (scursor != send) {
0305         signed char ch = *scursor++;
0306         if (ch > 0 && isTText(ch)) {
0307             // TText: OK
0308             success = true;
0309         } else if ((flags & ParseTokenAllow8Bit) && ch < 0) {
0310             // 8bit char: not OK, but be tolerant.
0311             KMIME_WARN_8BIT(ch);
0312             success = true;
0313         } else if ((flags & ParseTokenRelaxedTText) && ch == '/') {
0314             success = true;
0315         } else {
0316             // CTL or tspecial - marking the end of the atom:
0317             // re-set sursor to point to the offending
0318             // char and return:
0319             scursor--;
0320             break;
0321         }
0322     }
0323     result.first = start;
0324     result.second = scursor - start;
0325     return success;
0326 }
0327
// Reads the next input character into the variable `ch` and advances
// `scursor`. If the input is exhausted (`scursor == send`) it emits a
// premature-end warning and makes the ENCLOSING FUNCTION return false.
// Requires `scursor`, `send` and an assignable `ch` to be in scope.
// Wrapped in do { ... } while (0) so that "READ_ch_OR_FAIL;" is exactly one
// statement and stays correct inside unbraced if/else bodies.
#define READ_ch_OR_FAIL do {                                    \
        if ( scursor == send ) {                                \
            KMIME_WARN_PREMATURE_END_OF( GenericQuotedString ); \
            return false;                                       \
        }                                                       \
        ch = *scursor++;                                        \
    } while (0)
0334
0335 // known issues:
0336 //
0337 // - doesn't handle quoted CRLF
0338
0339 bool parseGenericQuotedString(const char *&scursor, const char *const send,
0340                               QString &result, bool isCRLF,
0341                               const char openChar, const char closeChar)
0342 {
0343     // We are in a quoted-string or domain-literal or comment and the
0344     // cursor points to the first char after the openChar.
0345     // We will apply unfolding and quoted-pair removal.
0346     // We return when we either encounter the end or unescaped openChar
0347     // or closeChar.
0348     assert(*(scursor - 1) == openChar || *(scursor - 1) == closeChar);
0349
0350     while (scursor != send) {
0351         char ch = *scursor++;
0352
0353         if (ch == closeChar || ch == openChar) {
0354             // end of quoted-string or another opening char:
0355             // let caller decide what to do.
0356             return true;
0357         }
0358
0359         switch (ch) {
0360         case '\\':      // quoted-pair
0361             // misses "\" CRLF LWSP-char handling, see rfc822, 3.4.5
0362             READ_ch_OR_FAIL;
0363             KMIME_WARN_IF_8BIT(ch);
0364             result += QLatin1Char(ch);
0365             break;
0366         case '\r':
0367             // ###
0368             // The case of lonely '\r' is easy to solve, as they're
0369             // not part of Unix Line-ending conventions.
0370             // But I see a problem if we are given Unix-native
0371             // line-ending-mails, where we cannot determine anymore
0372             // whether a given '\n' was part of a CRLF or was occurring
0373             // on it's own.
0374             READ_ch_OR_FAIL;
0375             if (ch != '\n') {
0376                 // CR on it's own...
0377                 KMIME_WARN_LONE(CR);
0378                 result += QLatin1Char('\r');
0379                 scursor--; // points to after the '\r' again
0380             } else {
0381                 // CRLF encountered.
0382                 // lookahead: check for folding
0383                 READ_ch_OR_FAIL;
0384                 if (ch == ' ' || ch == '\t') {
0385                     // correct folding;
0386                     // position cursor behind the CRLF WSP (unfolding)
0387                     // and add the WSP to the result
0388                     result += QLatin1Char(ch);
0389                 } else {
0390                     // this is the "shouldn't happen"-case. There is a CRLF
0391                     // inside a quoted-string without it being part of FWS.
0392                     // We take it verbatim.
0393                     KMIME_WARN_NON_FOLDING(CRLF);
0394                     result += QLatin1StringView("\r\n");
0395                     // the cursor is decremented again, so's we need not
0396                     // duplicate the whole switch here. "ch" could've been
0397                     // everything (incl. openChar or closeChar).
0398                     scursor--;
0399                 }
0400             }
0401             break;
0402         case '\n':
0403             // Note: CRLF has been handled above already!
0404             // ### LF needs special treatment, depending on whether isCRLF
0405             // is true (we can be sure a lonely '\n' was meant this way) or
0406             // false ('\n' alone could have meant LF or CRLF in the original
0407             // message. This parser assumes CRLF iff the LF is followed by
0408             // either WSP (folding) or NULL (premature end of quoted-string;
0409             // Should be fixed, since NULL is allowed as per rfc822).
0410             READ_ch_OR_FAIL;
0411             if (!isCRLF && (ch == ' ' || ch == '\t')) {
0412                 // folding
0413                 // correct folding
0414                 result += QLatin1Char(ch);
0415             } else {
0416                 // non-folding
0417                 KMIME_WARN_LONE(LF);
0418                 result += QLatin1Char('\n');
0419                 // pos is decremented, so's we need not duplicate the whole
0420                 // switch here. ch could've been everything (incl. <">, "\").
0421                 scursor--;
0422             }
0423             break;
0424         case '=': {
0425             // ### Work around broken clients that send encoded words in quoted-strings
0426             //     For example, older KMail versions.
0427             if (scursor == send) {
0428                 break;
0429             }
0430
0431             const char *oldscursor = scursor;
0432             QString tmp;
0433             QByteArray lang;
0434             QByteArray charset;
0435             if (*scursor++ == '?') {
0436                 --scursor;
0437                 if (parseEncodedWord(scursor, send, tmp, lang, charset)) {
0438                     result += tmp;
0439                     //qDebug() << " tmp " << tmp;
0440                     if (scursor == send) {
0441                         break;
0442                     } else if (*scursor++ == ' ') { //Workaround Bug 362650 thunderbird add space for each new line
0443                         if (scursor == send) {
0444                             --scursor;
0445                             break;
0446                         } else if (*scursor++ == '=') {
0447                             if (scursor == send) {
0448                                 --scursor;
0449                                 --scursor;
0450                                 break;
0451                             } else if (*scursor++ == '?') {
0452                                 --scursor;
0453                                 --scursor;
0454                                 break;
0455                             }
0456                         } else {
0457                             --scursor;
0458                             --scursor;
0459                         }
0460                     } else {
0461                         --scursor;
0462                     }
0463
0464                     break;
0465                 } else {
0466                     scursor = oldscursor;
0467                 }
0468             } else {
0469                 scursor = oldscursor;
0470             }
0471             // fall through
0472             [[fallthrough]];
0473         }
0474         default:
0475             KMIME_WARN_IF_8BIT(ch);
0476             result += QLatin1Char(ch);
0477         }
0478     }
0479
0480     return false;
0481 }
0482
0483 // known issues:
0484 //
0485 // - doesn't handle encoded-word inside comments.
0486
0487 bool parseComment(const char *&scursor, const char *const send,
0488                   QString &result, bool isCRLF, bool reallySave)
0489 {
0490     int commentNestingDepth = 1;
0491     const char *afterLastClosingParenPos = nullptr;
0492     QString maybeCmnt;
0493     const char *oldscursor = scursor;
0494
0495     assert(*(scursor - 1) == '(');
0496
0497     while (commentNestingDepth) {
0498         QString cmntPart;
0499         if (parseGenericQuotedString(scursor, send, cmntPart, isCRLF, '(', ')')) {
0500             assert(*(scursor - 1) == ')' || *(scursor - 1) == '(');
0501             // see the kdoc for above function for the possible conditions
0502             // we have to check:
0503             switch (*(scursor - 1)) {
0504             case ')':
0505                 if (reallySave) {
0506                     // add the chunk that's now surely inside the comment.
0507                     result += maybeCmnt;
0508                     result += cmntPart;
0509                     if (commentNestingDepth > 1) {
0510                         // don't add the outermost ')'...
0511                         result += QLatin1Char(')');
0512                     }
0513                     maybeCmnt.clear();
0514                 }
0515                 afterLastClosingParenPos = scursor;
0516                 --commentNestingDepth;
0517                 break;
0518             case '(':
0519                 if (reallySave) {
0520                     // don't add to "result" yet, because we might find that we
0521                     // are already outside the (broken) comment...
0522                     maybeCmnt += cmntPart;
0523                     maybeCmnt += QLatin1Char('(');
0524                 }
0525                 ++commentNestingDepth;
0526                 break;
0527             default: assert(0);
0528             } // switch
0529         } else {
0530             // !parseGenericQuotedString, ie. premature end
0531             if (afterLastClosingParenPos) {
0532                 scursor = afterLastClosingParenPos;
0533             } else {
0534                 scursor = oldscursor;
0535             }
0536             return false;
0537         }
0538     } // while
0539
0540     return true;
0541 }
0542
0543 // known issues: none.
0544
// Parses a phrase: a sequence of atoms, quoted-strings and encoded-words,
// with comments skipped and '.' tolerated after the first word.
//
// The words are APPENDED to result. Consecutive words are joined by a single
// space, except that no space is inserted between two directly adjacent
// encoded-words. Whitespace following each element is consumed.
//
// Returns true if at least one word was found. When parsing stops at
// something that is not part of a phrase after a word was already found,
// scursor is reset to just after the last successfully parsed element.
// Returns false if no word could be parsed at all.
bool parsePhrase(const char *&scursor, const char *const send,
                 QString &result, bool isCRLF)
{
    // Kind of content recognised so far; becomes Phrase as soon as a second
    // word is appended.
    enum {
        None, Phrase, Atom, EncodedWord, QuotedString
    } found = None;

    QString tmp;
    QByteArray lang;
    QByteArray charset;
    QPair<const char *, int> tmpAtom;
    // position just after the last element that was accepted
    const char *successfullyParsed = nullptr;
    // only used by the encoded-word branch
    const char *oldscursor;
    // used to suppress whitespace between adjacent encoded-words
    // (rfc2047, 6.2):
    bool lastWasEncodedWord = false;

    while (scursor != send) {
        char ch = *scursor++;
        switch (ch) {
        case '.': // broken, but allow for interop's sake
            if (found == None) {
                // a phrase must not start with '.': put it back and fail
                --scursor;
                return false;
            } else {
                // keep one space after the dot if whitespace follows it
                if (scursor != send && (*scursor == ' ' || *scursor == '\t')) {
                  result += QLatin1StringView(". ");
                } else {
                    result += QLatin1Char('.');
                }
                successfullyParsed = scursor;
            }
            break;
        case '"': // quoted-string
            tmp.clear();
            if (parseGenericQuotedString(scursor, send, tmp, isCRLF, '"', '"')) {
                successfullyParsed = scursor;
                assert(*(scursor - 1) == '"');
                switch (found) {
                case None:
                    found = QuotedString;
                    break;
                case Phrase:
                case Atom:
                case EncodedWord:
                case QuotedString:
                    found = Phrase;
                    result += QLatin1Char(' ');   // rfc822, 3.4.4
                    break;
                default:
                    assert(0);
                }
                lastWasEncodedWord = false;
                result += tmp;
            } else {
                // premature end of quoted string.
                // What to do? Return leading '"' as special? Return as quoted-string?
                // We do the latter if we already found something, else signal failure.
                if (found == None) {
                    return false;
                } else {
                    result += QLatin1Char(' ');   // rfc822, 3.4.4
                    result += tmp;
                    return true;
                }
            }
            break;
        case '(': // comment
            // parse it, but ignore content:
            tmp.clear();
            if (parseComment(scursor, send, tmp, isCRLF,
                             false /*don't bother with the content*/)) {
                successfullyParsed = scursor;
                lastWasEncodedWord = false; // strictly interpreting rfc2047, 6.2
            } else {
                // unterminated comment: keep what we have, if anything
                if (found == None) {
                    return false;
                } else {
                    scursor = successfullyParsed;
                    return true;
                }
            }
            break;
        case '=': // encoded-word
            tmp.clear();
            // remember the position so we can retry as an atom on failure
            oldscursor = scursor;
            lang.clear();
            charset.clear();
            if (parseEncodedWord(scursor, send, tmp, lang, charset)) {
                successfullyParsed = scursor;
                switch (found) {
                case None:
                    found = EncodedWord;
                    break;
                case Phrase:
                case EncodedWord:
                case Atom:
                case QuotedString:
                    if (!lastWasEncodedWord) {
                        result += QLatin1Char(' ');   // rfc822, 3.4.4
                    }
                    found = Phrase;
                    break;
                default: assert(0);
                }
                lastWasEncodedWord = true;
                result += tmp;
                break;
            } else {
                // parse as atom:
                scursor = oldscursor;
            }
            [[fallthrough]];
            // fall through...

        default: //atom
            // step back onto ch so the atom parser sees it
            scursor--;
            if (parseAtom(scursor, send, tmpAtom, true /* allow 8bit */)) {
                successfullyParsed = scursor;
                switch (found) {
                case None:
                    found = Atom;
                    break;
                case Phrase:
                case Atom:
                case EncodedWord:
                case QuotedString:
                    found = Phrase;
                    result += QLatin1Char(' ');   // rfc822, 3.4.4
                    break;
                default:
                    assert(0);
                }
                lastWasEncodedWord = false;
                result += QLatin1StringView(tmpAtom.first, tmpAtom.second);
            } else {
                // not an atom character: the phrase ends here
                if (found == None) {
                    return false;
                } else {
                    scursor = successfullyParsed;
                    return true;
                }
            }
        }
        eatWhiteSpace(scursor, send);
    }

    return found != None;
}
0695
0696 bool parseDotAtom(const char *&scursor, const char *const send,
0697                   QByteArray &result, bool isCRLF)
0698 {
0699     eatCFWS(scursor, send, isCRLF);
0700
0701     // always points to just after the last atom parsed:
0702     const char *successfullyParsed;
0703
0704     QByteArray tmp;
0705     if (!parseAtom(scursor, send, tmp, false /* no 8bit */)) {
0706         return false;
0707     }
0708     result += tmp;
0709     successfullyParsed = scursor;
0710
0711     while (scursor != send) {
0712
0713         // end of header or no '.' -> return
0714         if (scursor == send || *scursor != '.') {
0715             return true;
0716         }
0717         scursor++; // eat '.'
0718
0719         if (scursor == send || !isAText(*scursor)) {
0720             // end of header or no AText, but this time following a '.'!:
0721             // reset cursor to just after last successfully parsed char and
0722             // return:
0723             scursor = successfullyParsed;
0724             return true;
0725         }
0726
0727         // try to parse the next atom:
0728         QByteArray maybeAtom;
0729         if (!parseAtom(scursor, send, maybeAtom, false /*no 8bit*/)) {
0730             scursor = successfullyParsed;
0731             return true;
0732         }
0733
0734         result += '.';
0735         result += maybeAtom;
0736         successfullyParsed = scursor;
0737     }
0738
0739     scursor = successfullyParsed;
0740     return true;
0741 }
0742
0743 void eatCFWS(const char *&scursor, const char *const send, bool isCRLF)
0744 {
0745     QString dummy;
0746
0747     while (scursor != send) {
0748         const char *oldscursor = scursor;
0749
0750         char ch = *scursor++;
0751
0752         switch (ch) {
0753         case ' ':
0754         case '\t': // whitespace
0755         case '\r':
0756         case '\n': // folding
0757             continue;
0758
0759         case '(': // comment
0760             if (parseComment(scursor, send, dummy, isCRLF, false /*don't save*/)) {
0761                 continue;
0762             }
0763             scursor = oldscursor;
0764             return;
0765
0766         default:
0767             scursor = oldscursor;
0768             return;
0769         }
0770     }
0771 }
0772
0773 bool parseDomain(const char *&scursor, const char *const send,
0774                  QString &result, bool isCRLF)
0775 {
0776     eatCFWS(scursor, send, isCRLF);
0777     if (scursor == send) {
0778         return false;
0779     }
0780
0781     // domain := dot-atom / domain-literal / atom *("." atom)
0782     //
0783     // equivalent to:
0784     // domain = dot-atom / domain-literal,
0785     // since parseDotAtom does allow CFWS between atoms and dots
0786
0787     if (*scursor == '[') {
0788         // domain-literal:
0789         QString maybeDomainLiteral;
0790         // eat '[':
0791         scursor++;
0792         while (parseGenericQuotedString(scursor, send, maybeDomainLiteral,
0793                                         isCRLF, '[', ']')) {
0794             if (scursor == send) {
0795                 // end of header: check for closing ']':
0796                 if (*(scursor - 1) == ']') {
0797                     // OK, last char was ']':
0798                     result = maybeDomainLiteral;
0799                     return true;
0800                 } else {
0801                     // not OK, domain-literal wasn't closed:
0802                     return false;
0803                 }
0804             }
0805             // we hit openChar in parseGenericQuotedString.
0806             // include it in maybeDomainLiteral and keep on parsing:
0807             if (*(scursor - 1) == '[') {
0808                 maybeDomainLiteral += QLatin1Char('[');
0809                 continue;
0810             }
0811             // OK, real end of domain-literal:
0812             result = maybeDomainLiteral;
0813             return true;
0814         }
0815     } else {
0816         // dot-atom:
0817         QByteArray maybeDotAtom;
0818         if (parseDotAtom(scursor, send, maybeDotAtom, isCRLF)) {
0819             // Domain may end with '.', if so preserve it'
0820             if (scursor != send && *scursor == '.') {
0821                 maybeDotAtom += '.';
0822                 scursor++;
0823             }
0824             result = QString::fromLatin1(maybeDotAtom);
0825             return true;
0826         }
0827     }
0828     return false;
0829 }
0830
0831 bool parseObsRoute(const char *&scursor, const char *const send,
0832                    QStringList &result, bool isCRLF, bool save)
0833 {
0834     while (scursor != send) {
0835         eatCFWS(scursor, send, isCRLF);
0836         if (scursor == send) {
0837             return false;
0838         }
0839
0840         // empty entry:
0841         if (*scursor == ',') {
0842             scursor++;
0843             if (save) {
0844                 result.append(QString());
0845             }
0846             continue;
0847         }
0848
0849         // empty entry ending the list:
0850         if (*scursor == ':') {
0851             scursor++;
0852             if (save) {
0853                 result.append(QString());
0854             }
0855             return true;
0856         }
0857
0858         // each non-empty entry must begin with '@':
0859         if (*scursor != '@') {
0860             return false;
0861         } else {
0862             scursor++;
0863         }
0864
0865         QString maybeDomain;
0866         if (!parseDomain(scursor, send, maybeDomain, isCRLF)) {
0867             return false;
0868         }
0869         if (save) {
0870             result.append(maybeDomain);
0871         }
0872
0873         // eat the following (optional) comma:
0874         eatCFWS(scursor, send, isCRLF);
0875         if (scursor == send) {
0876             return false;
0877         }
0878         if (*scursor == ':') {
0879             scursor++;
0880             return true;
0881         }
0882         if (*scursor == ',') {
0883             scursor++;
0884         }
0885     }
0886
0887     return false;
0888 }
0889
0890 bool parseAddrSpec(const char *&scursor, const char *const send,
0891                    AddrSpec &result, bool isCRLF)
0892 {
0893     //
0894     // STEP 1:
0895     // local-part := dot-atom / quoted-string / word *("." word)
0896     //
0897     // this is equivalent to:
0898     // local-part := word *("." word)
0899
0900     QString maybeLocalPart;
0901     QString tmp;
0902     QPair<const char *, int> tmpAtom;
0903
0904     while (scursor != send) {
0905         // first, eat any whitespace
0906         eatCFWS(scursor, send, isCRLF);
0907
0908         char ch = *scursor++;
0909         switch (ch) {
0910         case '.': // dot
0911             maybeLocalPart += QLatin1Char('.');
0912             break;
0913
0914         case '@':
0915             goto SAW_AT_SIGN;
0916             break;
0917
0918         case '"': // quoted-string
0919             tmp.clear();
0920             if (parseGenericQuotedString(scursor, send, tmp, isCRLF, '"', '"')) {
0921                 maybeLocalPart += tmp;
0922             } else {
0923                 return false;
0924             }
0925             break;
0926
0927         default: // atom
0928             scursor--; // re-set scursor to point to ch again
0929             if (parseAtom(scursor, send, tmpAtom, false /* no 8bit */)) {
0930               maybeLocalPart +=
0931                   QLatin1StringView(tmpAtom.first, tmpAtom.second);
0932             } else {
0933                 return false; // parseAtom can only fail if the first char is non-atext.
0934             }
0935             break;
0936         }
0937     }
0938
0939     return false;
0940
0941     //
0942     // STEP 2:
0943     // domain
0944     //
0945
0946 SAW_AT_SIGN:
0947
0948     assert(*(scursor - 1) == '@');
0949
0950     QString maybeDomain;
0951     if (!parseDomain(scursor, send, maybeDomain, isCRLF)) {
0952         return false;
0953     }
0954
0955     result.localPart = maybeLocalPart;
0956     result.domain = maybeDomain;
0957
0958     return true;
0959 }
0960
0961 bool parseAngleAddr(const char *&scursor, const char *const send,
0962                     AddrSpec &result, bool isCRLF)
0963 {
0964     // first, we need an opening angle bracket:
0965     eatCFWS(scursor, send, isCRLF);
0966     if (scursor == send || *scursor != '<') {
0967         return false;
0968     }
0969     scursor++; // eat '<'
0970
0971     eatCFWS(scursor, send, isCRLF);
0972     if (scursor == send) {
0973         return false;
0974     }
0975
0976     if (*scursor == '@' || *scursor == ',') {
0977         // obs-route: parse, but ignore:
0978         KMIME_WARN << "obsolete source route found! ignoring.";
0979         QStringList dummy;
0980         if (!parseObsRoute(scursor, send, dummy,
0981                            isCRLF, false /* don't save */)) {
0982             return false;
0983         }
0984         // angle-addr isn't complete until after the '>':
0985         if (scursor == send) {
0986             return false;
0987         }
0988     }
0989
0990     // parse addr-spec:
0991     AddrSpec maybeAddrSpec;
0992     if (!parseAddrSpec(scursor, send, maybeAddrSpec, isCRLF)) {
0993         return false;
0994     }
0995
0996     eatCFWS(scursor, send, isCRLF);
0997     if (scursor == send || *scursor != '>') {
0998         return false;
0999     }
1000     scursor++;
1001
1002     result = maybeAddrSpec;
1003     return true;
1004
1005 }
1006
1007 static QString stripQuotes(const QString &input)
1008 {
1009     const QLatin1Char quotes('"');
1010     if (input.startsWith(quotes) && input.endsWith(quotes)) {
1011         QString stripped(input.mid(1, input.size() - 2));
1012         return stripped;
1013     } else {
1014         return input;
1015     }
1016 }
1017
1018 bool parseMailbox(const char *&scursor, const char *const send,
1019                   Mailbox &result, bool isCRLF)
1020 {
1021     eatCFWS(scursor, send, isCRLF);
1022     if (scursor == send) {
1023         return false;
1024     }
1025
1026     AddrSpec maybeAddrSpec;
1027     QString maybeDisplayName;
1028
1029     // first, try if it's a vanilla addr-spec:
1030     const char *oldscursor = scursor;
1031     if (parseAddrSpec(scursor, send, maybeAddrSpec, isCRLF)) {
1032         result.setAddress(maybeAddrSpec);
1033         // check for the obsolete form of display-name (as comment):
1034         eatWhiteSpace(scursor, send);
1035         if (scursor != send && *scursor == '(') {
1036             scursor++;
1037             if (!parseComment(scursor, send, maybeDisplayName, isCRLF, true /*keep*/)) {
1038                 return false;
1039             }
1040         }
1041         result.setName(stripQuotes(maybeDisplayName));
1042         return true;
1043     }
1044     scursor = oldscursor;
1045
1046     // second, see if there's a display-name:
1047     if (!parsePhrase(scursor, send, maybeDisplayName, isCRLF)) {
1048         // failed: reset cursor, note absent display-name
1049         maybeDisplayName.clear();
1050         scursor = oldscursor;
1051     } else {
1052         // succeeded: eat CFWS
1053         eatCFWS(scursor, send, isCRLF);
1054         if (scursor == send) {
1055             return false;
1056         }
1057     }
1058
1059     // third, parse the angle-addr:
1060     if (!parseAngleAddr(scursor, send, maybeAddrSpec, isCRLF)) {
1061         return false;
1062     }
1063
1064     if (maybeDisplayName.isNull()) {
1065         // check for the obsolete form of display-name (as comment):
1066         eatWhiteSpace(scursor, send);
1067         if (scursor != send && *scursor == '(') {
1068             scursor++;
1069             if (!parseComment(scursor, send, maybeDisplayName, isCRLF, true /*keep*/)) {
1070                 return false;
1071             }
1072         }
1073     }
1074
1075     result.setName(stripQuotes(maybeDisplayName));
1076     result.setAddress(maybeAddrSpec);
1077     return true;
1078 }
1079
1080 bool parseGroup(const char *&scursor, const char *const send,
1081                 Address &result, bool isCRLF)
1082 {
1083     // group         := display-name ":" [ mailbox-list / CFWS ] ";" [CFWS]
1084     //
1085     // equivalent to:
1086     // group   := display-name ":" [ obs-mbox-list ] ";"
1087
1088     eatCFWS(scursor, send, isCRLF);
1089     if (scursor == send) {
1090         return false;
1091     }
1092
1093     // get display-name:
1094     QString maybeDisplayName;
1095     if (!parsePhrase(scursor, send, maybeDisplayName, isCRLF)) {
1096         return false;
1097     }
1098
1099     // get ":":
1100     eatCFWS(scursor, send, isCRLF);
1101     if (scursor == send || *scursor != ':') {
1102         return false;
1103     }
1104
1105     // KDE5 TODO: Don't expose displayName as public, but rather add setter for it that
1106     //            automatically calls removeBidiControlChars
1107     result.displayName = removeBidiControlChars(maybeDisplayName);
1108
1109     // get obs-mbox-list (may contain empty entries):
1110     scursor++;
1111     while (scursor != send) {
1112         eatCFWS(scursor, send, isCRLF);
1113         if (scursor == send) {
1114             return false;
1115         }
1116
1117         // empty entry:
1118         if (*scursor == ',') {
1119             scursor++;
1120             continue;
1121         }
1122
1123         // empty entry ending the list:
1124         if (*scursor == ';') {
1125             scursor++;
1126             return true;
1127         }
1128
1129         Mailbox maybeMailbox;
1130         if (!parseMailbox(scursor, send, maybeMailbox, isCRLF)) {
1131             return false;
1132         }
1133         result.mailboxList.append(maybeMailbox);
1134
1135         eatCFWS(scursor, send, isCRLF);
1136         // premature end:
1137         if (scursor == send) {
1138             return false;
1139         }
1140         // regular end of the list:
1141         if (*scursor == ';') {
1142             scursor++;
1143             return true;
1144         }
1145         // eat regular list entry separator:
1146         if (*scursor == ',') {
1147             scursor++;
1148         }
1149     }
1150     return false;
1151 }
1152
1153 bool parseAddress(const char *&scursor, const char *const send,
1154                   Address &result, bool isCRLF)
1155 {
1156     // address       := mailbox / group
1157
1158     eatCFWS(scursor, send, isCRLF);
1159     if (scursor == send) {
1160         return false;
1161     }
1162
1163     // first try if it's a single mailbox:
1164     Mailbox maybeMailbox;
1165     const char *oldscursor = scursor;
1166     if (parseMailbox(scursor, send, maybeMailbox, isCRLF)) {
1167         // yes, it is:
1168         result.displayName.clear();
1169         result.mailboxList.append(maybeMailbox);
1170         return true;
1171     }
1172     scursor = oldscursor;
1173
1174     Address maybeAddress;
1175
1176     // no, it's not a single mailbox. Try if it's a group:
1177     if (!parseGroup(scursor, send, maybeAddress, isCRLF)) {
1178         return false;
1179     }
1180
1181     result = maybeAddress;
1182     return true;
1183 }
1184
1185 bool parseAddressList(const char *&scursor, const char *const send,
1186                       AddressList &result, bool isCRLF)
1187 {
1188     while (scursor != send) {
1189         eatCFWS(scursor, send, isCRLF);
1190         // end of header: this is OK.
1191         if (scursor == send) {
1192             return true;
1193         }
1194         // empty entry: ignore:
1195         if (*scursor == ',') {
1196             scursor++;
1197             continue;
1198         }
1199         // broken clients might use ';' as list delimiter, accept that as well
1200         if (*scursor == ';') {
1201             scursor++;
1202             continue;
1203         }
1204
1205         // parse one entry
1206         Address maybeAddress;
1207         if (!parseAddress(scursor, send, maybeAddress, isCRLF)) {
1208             return false;
1209         }
1210         result.append(maybeAddress);
1211
1212         eatCFWS(scursor, send, isCRLF);
1213         // end of header: this is OK.
1214         if (scursor == send) {
1215             return true;
1216         }
1217         // comma separating entries: eat it.
1218         if (*scursor == ',') {
1219             scursor++;
1220         }
1221     }
1222     return true;
1223 }
1224
1225 static bool parseParameter(const char *&scursor, const char *const send,
1226                            QPair<QString, QStringOrQPair> &result, bool isCRLF)
1227 {
1228     // parameter = regular-parameter / extended-parameter
1229     // regular-parameter = regular-parameter-name "=" value
1230     // extended-parameter =
1231     // value = token / quoted-string
1232     //
1233     // note that rfc2231 handling is out of the scope of this function.
1234     // Therefore we return the attribute as QByteArray and the value as
1235     // (start,length) tuple if we see that the value is encoded
1236     // (trailing asterisk), for parseParameterList to decode...
1237
1238     eatCFWS(scursor, send, isCRLF);
1239     if (scursor == send) {
1240         return false;
1241     }
1242
1243     //
1244     // parse the parameter name:
1245     //
1246     QByteArray tmpAttr;
1247     if (!parseToken(scursor, send, tmpAttr, ParseTokenNoFlag)) {
1248         return false;
1249     }
1250     // FIXME: we could use QMap<QByteArray, ...> in the API for parameters
1251     QString maybeAttribute = QString::fromLatin1(tmpAttr);
1252
1253     eatCFWS(scursor, send, isCRLF);
1254     // premature end: not OK (haven't seen '=' yet).
1255     if (scursor == send || *scursor != '=') {
1256         return false;
1257     }
1258     scursor++; // eat '='
1259
1260     eatCFWS(scursor, send, isCRLF);
1261     if (scursor == send) {
1262         // don't choke on attribute=, meaning the value was omitted:
1263         if (maybeAttribute.endsWith(QLatin1Char('*'))) {
1264             KMIME_WARN << "attribute ends with \"*\", but value is empty!"
1265                        "Chopping away \"*\".";
1266             maybeAttribute.chop(1);
1267         }
1268         result = qMakePair(maybeAttribute.toLower(), QStringOrQPair());
1269         return true;
1270     }
1271
1272     const char *oldscursor = scursor;
1273
1274     //
1275     // parse the parameter value:
1276     //
1277     QStringOrQPair maybeValue;
1278     if (*scursor == '"') {
1279         // value is a quoted-string:
1280         scursor++;
1281         if (maybeAttribute.endsWith(QLatin1Char('*'))) {
1282             // attributes ending with "*" designate extended-parameters,
1283             // which cannot have quoted-strings as values. So we remove the
1284             // trailing "*" to not confuse upper layers.
1285             KMIME_WARN << "attribute ends with \"*\", but value is a quoted-string!"
1286                        "Chopping away \"*\".";
1287             maybeAttribute.chop(1);
1288         }
1289
1290         if (!parseGenericQuotedString(scursor, send, maybeValue.qstring, isCRLF)) {
1291             scursor = oldscursor;
1292             result = qMakePair(maybeAttribute.toLower(), QStringOrQPair());
1293             return false; // this case needs further processing by upper layers!!
1294         }
1295     } else {
1296         // value is a token:
1297         if (!parseToken(scursor, send, maybeValue.qpair, ParseTokenRelaxedTText)) {
1298             scursor = oldscursor;
1299             result = qMakePair(maybeAttribute.toLower(), QStringOrQPair());
1300             return false; // this case needs further processing by upper layers!!
1301         }
1302     }
1303
1304     result = qMakePair(maybeAttribute.toLower(), maybeValue);
1305     return true;
1306 }
1307
1308 static bool parseRawParameterList(const char *&scursor, const char *const send,
1309                                   QMap<QString, QStringOrQPair> &result,
1310                                   bool isCRLF)
1311 {
1312     // we use parseParameter() consecutively to obtain a map of raw
1313     // attributes to raw values. "Raw" here means that we don't do
1314     // rfc2231 decoding and concatenation. This is left to
1315     // parseParameterList(), which will call this function.
1316     //
1317     // The main reason for making this chunk of code a separate
1318     // (private) method is that we can deal with broken parameters
1319     // _here_ and leave the rfc2231 handling solely to
1320     // parseParameterList(), which will still be enough work.
1321     while (scursor != send) {
1322         eatCFWS(scursor, send, isCRLF);
1323         // empty entry ending the list: OK.
1324         if (scursor == send) {
1325             return true;
1326         }
1327         // empty list entry: ignore.
1328         if (*scursor == ';') {
1329             scursor++;
1330             continue;
1331         }
1332         QPair<QString, QStringOrQPair> maybeParameter;
1333         if (!parseParameter(scursor, send, maybeParameter, isCRLF)) {
1334             // we need to do a bit of work if the attribute is not
1335             // NULL. These are the cases marked with "needs further
1336             // processing" in parseParameter(). Specifically, parsing of the
1337             // token or the quoted-string, which should represent the value,
1338             // failed. We take the easy way out and simply search for the
1339             // next ';' to start parsing again. (Another option would be to
1340             // take the text between '=' and ';' as value)
1341             if (maybeParameter.first.isNull()) {
1342                 return false;
1343             }
1344             while (scursor != send) {
1345                 if (*scursor++ == ';') {
1346                     goto IS_SEMICOLON;
1347                 }
1348             }
1349             // scursor == send case: end of list.
1350             return true;
1351         IS_SEMICOLON:
1352             // *scursor == ';' case: parse next entry.
1353             continue;
1354         }
1355         // successful parsing brings us here:
1356         result.insert(maybeParameter.first, maybeParameter.second);
1357
1358         eatCFWS(scursor, send, isCRLF);
1359         // end of header: ends list.
1360         if (scursor == send) {
1361             return true;
1362         }
1363         // regular separator: eat it.
1364         if (*scursor == ';') {
1365             scursor++;
1366         }
1367     }
1368     return true;
1369 }
1370
1371 static void decodeRFC2231Value(KCodecs::Codec *&rfc2231Codec,
1372                                QStringDecoder &textcodec,
1373                                bool isContinuation, QString &value,
1374                                QPair<const char *, int> &source, QByteArray &charset)
1375 {
1376     //
1377     // parse the raw value into (charset,language,text):
1378     //
1379
1380     const char *decBegin = source.first;
1381     const char *decCursor = decBegin;
1382     const char *decEnd = decCursor + source.second;
1383
1384     if (!isContinuation) {
1385         // find the first single quote
1386         while (decCursor != decEnd) {
1387             if (*decCursor == '\'') {
1388                 break;
1389             } else {
1390                 decCursor++;
1391             }
1392         }
1393
1394         if (decCursor == decEnd) {
1395             // there wasn't a single single quote at all!
1396             // take the whole value to be in latin-1:
1397             KMIME_WARN << "No charset in extended-initial-value."
1398                        "Assuming \"iso-8859-1\".";
1399             value += QString::fromLatin1(decBegin, source.second);
1400             return;
1401         }
1402
1403         charset = QByteArray(decBegin, decCursor - decBegin);
1404
1405         const char *oldDecCursor = ++decCursor;
1406         // find the second single quote (we ignore the language tag):
1407         while (decCursor != decEnd) {
1408             if (*decCursor == '\'') {
1409                 break;
1410             } else {
1411                 decCursor++;
1412             }
1413         }
1414         if (decCursor == decEnd) {
1415             KMIME_WARN << "No language in extended-initial-value."
1416                        "Trying to recover.";
1417             decCursor = oldDecCursor;
1418         } else {
1419             decCursor++;
1420         }
1421
1422         // decCursor now points to the start of the
1423         // "extended-other-values":
1424
1425         //
1426         // get the decoders:
1427         //
1428
1429         textcodec = QStringDecoder(charset.constData());
1430         if (!textcodec.isValid()) {
1431             KMIME_WARN_UNKNOWN(Charset, charset);
1432         }
1433     }
1434
1435     if (!rfc2231Codec) {
1436         rfc2231Codec = KCodecs::Codec::codecForName("x-kmime-rfc2231");
1437         assert(rfc2231Codec);
1438     }
1439
1440     if (!textcodec.isValid()) {
1441         value += QString::fromLatin1(decCursor, decEnd - decCursor);
1442         return;
1443     }
1444
1445     KCodecs::Decoder *dec = rfc2231Codec->makeDecoder();
1446     assert(dec);
1447
1448     //
1449     // do the decoding:
1450     //
1451
1452     QByteArray buffer;
1453     buffer.resize(rfc2231Codec->maxDecodedSizeFor(decEnd - decCursor));
1454     QByteArray::Iterator bit = buffer.begin();
1455     QByteArray::ConstIterator bend = buffer.end();
1456
1457     if (!dec->decode(decCursor, decEnd, bit, bend)) {
1458         KMIME_WARN << rfc2231Codec->name()
1459                    << "codec lies about its maxDecodedSizeFor()"
1460                    << Qt::endl
1461                    << "result may be truncated";
1462     }
1463
1464     value += textcodec.decode(QByteArrayView(buffer.begin(), bit - buffer.begin()));
1465
1466     // qCDebug(KMIME_LOG) << "value now: \"" << value << "\"";
1467     // cleanup:
1468     delete dec;
1469 }
1470
1471 // known issues:
1472 //  - permutes rfc2231 continuations when the total number of parts
1473 //    exceeds 10 (other-sections then becomes *xy, ie. two digits)
1474
1475 bool parseParameterListWithCharset(const char *&scursor,
1476                                    const char *const send,
1477                                    QMap<QString, QString> &result,
1478                                    QByteArray &charset, bool isCRLF)
1479 {
1480 // parse the list into raw attribute-value pairs:
1481     QMap<QString, QStringOrQPair> rawParameterList;
1482     if (!parseRawParameterList(scursor, send, rawParameterList, isCRLF)) {
1483         return false;
1484     }
1485
1486     if (rawParameterList.isEmpty()) {
1487         return true;
1488     }
1489
1490     // decode rfc 2231 continuations and alternate charset encoding:
1491
1492     // NOTE: this code assumes that what QMapIterator delivers is sorted
1493     // by the key!
1494
1495     KCodecs::Codec *rfc2231Codec = nullptr;
1496     QStringDecoder textcodec;
1497     QString attribute;
1498     QString value;
1499     enum Mode {
1500         NoMode = 0x0, Continued = 0x1, Encoded = 0x2
1501     };
1502
1503     enum EncodingMode {
1504         NoEncoding,
1505         RFC2047,
1506         RFC2231
1507     };
1508
1509     QMap<QString, QStringOrQPair>::Iterator it;
1510     QMap<QString, QStringOrQPair>::Iterator end = rawParameterList.end();
1511
1512     for (it = rawParameterList.begin() ; it != end ; ++it) {
1513         if (attribute.isNull() || !it.key().startsWith(attribute)) {
1514             //
1515             // new attribute:
1516             //
1517
1518             // store the last attribute/value pair in the result map now:
1519             if (!attribute.isNull()) {
1520                 result.insert(attribute, value);
1521             }
1522             // and extract the information from the new raw attribute:
1523             value.clear();
1524             attribute = it.key();
1525             int mode = NoMode;
1526             EncodingMode encodingMode = NoEncoding;
1527
1528             // is the value rfc2331-encoded?
1529             if (attribute.endsWith(QLatin1Char('*'))) {
1530                 attribute.chop(1);
1531                 mode |= Encoded;
1532                 encodingMode = RFC2231;
1533             }
1534             // is the value rfc2047-encoded?
1535             if (!(*it).qstring.isNull() &&
1536                 (*it).qstring.contains(QLatin1StringView("=?"))) {
1537               mode |= Encoded;
1538               encodingMode = RFC2047;
1539             }
1540             // is the value continued?
1541             if (attribute.endsWith(QLatin1StringView("*0"))) {
1542               attribute.chop(2);
1543               mode |= Continued;
1544             }
1545             //
1546             // decode if necessary:
1547             //
1548             if (mode & Encoded) {
1549                 if (encodingMode == RFC2231) {
1550                     decodeRFC2231Value(rfc2231Codec, textcodec,
1551                                        false, /* isn't continuation */
1552                                        value, (*it).qpair, charset);
1553                 } else if (encodingMode == RFC2047) {
1554                     value += KCodecs::decodeRFC2047String((*it).qstring.toLatin1(), &charset);
1555                 }
1556             } else {
1557                 // not encoded.
1558                 if ((*it).qpair.first) {
1559                     value += QString::fromLatin1((*it).qpair.first, (*it).qpair.second);
1560                 } else {
1561                     value += (*it).qstring;
1562                 }
1563             }
1564
1565             //
1566             // shortcut-processing when the value isn't encoded:
1567             //
1568
1569             if (!(mode & Continued)) {
1570                 // save result already:
1571                 result.insert(attribute, value);
1572                 // force begin of a new attribute:
1573                 attribute.clear();
1574             }
1575         } else { // it.key().startsWith( attribute )
1576             //
1577             // continuation
1578             //
1579
1580             // ignore the section and trust QMap to have sorted the keys:
1581             if (it.key().endsWith(QLatin1Char('*'))) {
1582                 // encoded
1583                 decodeRFC2231Value(rfc2231Codec, textcodec,
1584                                    true, /* is continuation */
1585                                    value, (*it).qpair, charset);
1586             } else {
1587                 // not encoded
1588                 if ((*it).qpair.first) {
1589                     value += QString::fromLatin1((*it).qpair.first, (*it).qpair.second);
1590                 } else {
1591                     value += (*it).qstring;
1592                 }
1593             }
1594         }
1595     }
1596     // write last attr/value pair:
1597     if (!attribute.isNull()) {
1598         result.insert(attribute, value);
1599     }
1600
1601     return true;
1602 }
1603
1604 bool parseParameterList(const char *&scursor, const char *const send,
1605                         QMap<QString, QString> &result, bool isCRLF)
1606 {
1607     QByteArray charset;
1608     return parseParameterListWithCharset(scursor, send, result, charset, isCRLF);
1609 }
1610
// Three-letter day-of-week names, Sunday first. Each entry is NUL-terminated
// (3 characters + '\0').
static const char stdDayNames[][4] = {
    "Sun",
    "Mon", "Tue", "Wed", "Thu", "Fri",
    "Sat"
};
// number of entries in stdDayNames
static const int stdDayNamesLen = sizeof(stdDayNames) / sizeof(stdDayNames[0]);
1615
1616 static bool parseDayName(const char *&scursor, const char *const send)
1617 {
1618     // check bounds:
1619     if (send - scursor < 3) {
1620         return false;
1621     }
1622
1623     for (int i = 0 ; i < stdDayNamesLen ; ++i) {
1624         if (qstrnicmp(scursor, stdDayNames[i], 3) == 0) {
1625             scursor += 3;
1626             // qCDebug(KMIME_LOG) << "found" << stdDayNames[i];
1627             return true;
1628         }
1629     }
1630
1631     return false;
1632 }
1633
// Three-letter month names, January first. Each entry is NUL-terminated
// (3 characters + '\0'). The index of a name is what parseMonthName()
// reports, i.e. 0 for "Jan".
static const char stdMonthNames[][4] = {
    "Jan", "Feb", "Mar", "Apr",
    "May", "Jun", "Jul", "Aug",
    "Sep", "Oct", "Nov", "Dec"
};
// number of entries in stdMonthNames
static const int stdMonthNamesLen = sizeof(stdMonthNames) / sizeof(stdMonthNames[0]);
1640
1641 static bool parseMonthName(const char *&scursor, const char *const send,
1642                            int &result)
1643 {
1644     // check bounds:
1645     if (send - scursor < 3) {
1646         return false;
1647     }
1648
1649     for (result = 0 ; result < stdMonthNamesLen ; ++result) {
1650         if (qstrnicmp(scursor, stdMonthNames[result], 3) == 0) {
1651             scursor += 3;
1652             return true;
1653         }
1654     }
1655
1656     // not found:
1657     return false;
1658 }
1659
// Table of alphabetic time zone names and their offsets in seconds east of
// GMT. parseAlphaNumericTimeZone() searches it front to back and takes the
// first entry that matches, so every name must appear only once.
static const struct {
    const char tzName[5];   // NUL-terminated name, at most 4 characters
    long int secsEastOfGMT;
} timeZones[] = {
    // rfc 822 timezones:
    { "GMT", 0 },
    { "UT", 0 },
    { "EDT", -4 * 3600 },
    { "EST", -5 * 3600 },
    { "CDT", -5 * 3600 },
    { "CST", -6 * 3600 },
    { "MDT", -6 * 3600 },
    { "MST", -7 * 3600 },
    { "PDT", -7 * 3600 },
    { "PST", -8 * 3600 },
    // common, non-rfc-822 zones:
    { "CET", 1 * 3600 },
    { "MET", 1 * 3600 },
    { "UTC", 0 },
    { "CEST", 2 * 3600 },
    { "BST", 1 * 3600 },
    // rfc 822 military timezones:
    // NOTE(review): the offsets below follow rfc 822; rfc 5322 (section 4.3)
    // states that rfc 822 defined these in a non-standard way and recommends
    // treating them like "-0000". Kept as they were.
    { "Z", 0 },
    { "A", -1 * 3600 },
    { "B", -2 * 3600 },
    { "C", -3 * 3600 },
    { "D", -4 * 3600 },
    { "E", -5 * 3600 },
    { "F", -6 * 3600 },
    { "G", -7 * 3600 },
    { "H", -8 * 3600 },
    { "I", -9 * 3600 },
    // J is not used!
    { "K", -10 * 3600 },
    { "L", -11 * 3600 },
    { "M", -12 * 3600 },
    { "N", 1 * 3600 },
    { "O", 2 * 3600 },
    { "P", 3 * 3600 },
    { "Q", 4 * 3600 },
    { "R", 5 * 3600 },
    { "S", 6 * 3600 },
    { "T", 7 * 3600 },
    { "U", 8 * 3600 },
    { "V", 9 * 3600 },
    { "W", 10 * 3600 },
    { "X", 11 * 3600 },
    { "Y", 12 * 3600 },
};
// number of entries in timeZones
static const int timeZonesLen = sizeof timeZones / sizeof *timeZones;
1710
1711 static bool parseAlphaNumericTimeZone(const char *&scursor,
1712                                       const char *const send,
1713                                       long int &secsEastOfGMT,
1714                                       bool &timeZoneKnown)
1715 {
1716     // allow the timezone to be wrapped in quotes; bug 260761
1717     if (scursor < send && *scursor == '"') {
1718         scursor++;
1719
1720         if (scursor == send) {
1721             return false;
1722         }
1723     }
1724
1725     QPair<const char *, int> maybeTimeZone(nullptr, 0);
1726     if (!parseToken(scursor, send, maybeTimeZone, ParseTokenNoFlag)) {
1727         return false;
1728     }
1729     for (int i = 0 ; i < timeZonesLen ; ++i) {
1730         if (qstrnicmp(timeZones[i].tzName,
1731                       maybeTimeZone.first, maybeTimeZone.second) == 0) {
1732             scursor += maybeTimeZone.second;
1733             secsEastOfGMT = timeZones[i].secsEastOfGMT;
1734             timeZoneKnown = true;
1735
1736             if (scursor < send && *scursor == '"') {
1737                 scursor++;
1738             }
1739
1740             return true;
1741         }
1742     }
1743
1744     // don't choke just because we don't happen to know the time zone
1745     KMIME_WARN_UNKNOWN(time zone,
1746                        QByteArray(maybeTimeZone.first, maybeTimeZone.second));
1747     secsEastOfGMT = 0;
1748     timeZoneKnown = false;
1749     return true;
1750 }
1751
// Parses a run of decimal digits starting at scursor.
//
// \a result receives the parsed number (0 if there is no digit), scursor is
// moved behind the last digit consumed. Returns the number of digits parsed.
//
// The number is accumulated in unsigned arithmetic, so an over-long digit
// run wraps around instead of running into signed overflow.
int parseDigits(const char *&scursor, const char *const send, int &result)
{
    unsigned int accumulated = 0;
    int digits = 0;
    // isdigit() is only defined for unsigned char values (and EOF); header
    // data may contain bytes >= 0x80, which are negative as plain char.
    for (; scursor != send && isdigit(static_cast<unsigned char>(*scursor)) ; scursor++, digits++) {
        accumulated = accumulated * 10u + static_cast<unsigned int>(*scursor - '0');
    }
    result = static_cast<int>(accumulated);
    return digits;
}
1763
1764 static bool parseTimeOfDay(const char *&scursor, const char *const send,
1765                            int &hour, int &min, int &sec, bool isCRLF = false)
1766 {
1767     // time-of-day := 2DIGIT [CFWS] ":" [CFWS] 2DIGIT [ [CFWS] ":" 2DIGIT ]
1768
1769     //
1770     // 2DIGIT representing "hour":
1771     //
1772     if (!parseDigits(scursor, send, hour)) {
1773         return false;
1774     }
1775
1776     eatCFWS(scursor, send, isCRLF);
1777     if (scursor == send || *scursor != ':') {
1778         return false;
1779     }
1780     scursor++; // eat ':'
1781
1782     eatCFWS(scursor, send, isCRLF);
1783     if (scursor == send) {
1784         return false;
1785     }
1786
1787     //
1788     // 2DIGIT representing "minute":
1789     //
1790     if (!parseDigits(scursor, send, min)) {
1791         return false;
1792     }
1793
1794     eatCFWS(scursor, send, isCRLF);
1795     if (scursor == send) {
1796         return true; // seconds are optional
1797     }
1798
1799     //
1800     // let's see if we have a 2DIGIT representing "second":
1801     //
1802     if (*scursor == ':') {
1803         // yepp, there are seconds:
1804         scursor++; // eat ':'
1805         eatCFWS(scursor, send, isCRLF);
1806         if (scursor == send) {
1807             return false;
1808         }
1809
1810         if (!parseDigits(scursor, send, sec)) {
1811             return false;
1812         }
1813     } else {
1814         sec = 0;
1815     }
1816
1817     return true;
1818 }
1819
1820 bool parseTime(const char *&scursor, const char *send,
1821                int &hour, int &min, int &sec, long int &secsEastOfGMT,
1822                bool &timeZoneKnown, bool isCRLF)
1823 {
1824     // time := time-of-day CFWS ( zone / obs-zone )
1825     //
1826     // obs-zone    := "UT" / "GMT" /
1827     //                "EST" / "EDT" / ; -0500 / -0400
1828     //                "CST" / "CDT" / ; -0600 / -0500
1829     //                "MST" / "MDT" / ; -0700 / -0600
1830     //                "PST" / "PDT" / ; -0800 / -0700
1831     //                "A"-"I" / "a"-"i" /
1832     //                "K"-"Z" / "k"-"z"
1833
1834     eatCFWS(scursor, send, isCRLF);
1835     if (scursor == send) {
1836         return false;
1837     }
1838
1839     if (!parseTimeOfDay(scursor, send, hour, min, sec, isCRLF)) {
1840         return false;
1841     }
1842
1843     eatCFWS(scursor, send, isCRLF);
1844     // there might be no timezone but a year following
1845     if ((scursor == send) || isdigit(*scursor)) {
1846         timeZoneKnown = false;
1847         secsEastOfGMT = 0;
1848         return true; // allow missing timezone
1849     }
1850
1851     timeZoneKnown = true;
1852     if (*scursor == '+' || *scursor == '-') {
1853         // remember and eat '-'/'+':
1854         const char sign = *scursor++;
1855         // numerical timezone:
1856         int maybeTimeZone;
1857         const int tzDigits = parseDigits(scursor, send, maybeTimeZone);
1858         if (tzDigits != 4) {
1859             // Allow timezones in 02:00 format
1860             if (tzDigits == 2 && scursor != send && *scursor == ':') {
1861                 scursor++;
1862                 int maybeTimeZone2;
1863                 if (parseDigits(scursor, send, maybeTimeZone2) != 2) {
1864                     return false;
1865                 }
1866                 maybeTimeZone = maybeTimeZone * 100 + maybeTimeZone2;
1867             } else {
1868                 return false;
1869             }
1870         }
1871         secsEastOfGMT = 60 * (maybeTimeZone / 100 * 60 + maybeTimeZone % 100);
1872         if (sign == '-') {
1873             secsEastOfGMT *= -1;
1874             if (secsEastOfGMT == 0) {
1875                 timeZoneKnown = false; // -0000 means indetermined tz
1876             }
1877         }
1878     } else {
1879         // maybe alphanumeric timezone:
1880         if (!parseAlphaNumericTimeZone(scursor, send, secsEastOfGMT, timeZoneKnown)) {
1881             return false;
1882         }
1883     }
1884     return true;
1885 }
1886
1887 bool parseQDateTime(const char *&scursor, const char *const send,
1888                    QDateTime &result, bool isCRLF)
1889 {
1890     eatCFWS(scursor, send, isCRLF);
1891     if (scursor == send) {
1892         return false;
1893     }
1894     // In qt6 yy == 1900 ! => for sure we use 2000 here.
1895     result = QDateTime::fromString(QString::fromLatin1(scursor, 17), QStringLiteral("dd/MM/yy HH:mm:ss"));
1896     QDate resultDate = result.date();
1897     resultDate.setDate(resultDate.year() + 100, resultDate.month(), resultDate.day());
1898     result.setDate(resultDate);
1899     return result.isValid();
1900 }
1901
1902 bool parseDateTime(const char *&scursor, const char *const send,
1903                    QDateTime &result, bool isCRLF)
1904 {
1905     // Parsing date-time; strict mode:
1906     //
1907     // date-time   := [ [CFWS] day-name [CFWS] "," ]                      ; wday
1908     // (expanded)     [CFWS] 1*2DIGIT CFWS month-name CFWS 2*DIGIT [CFWS] ; date
1909     //                time
1910     //
1911     // day-name    := "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun"
1912     // month-name  := "Jan" / "Feb" / "Mar" / "Apr" / "May" / "Jun" /
1913     //                "Jul" / "Aug" / "Sep" / "Oct" / "Nov" / "Dec"
1914
1915     result = QDateTime();
1916
1917     eatCFWS(scursor, send, isCRLF);
1918     if (scursor == send) {
1919         return false;
1920     }
1921
1922     //
1923     // let's see if there's a day-of-week:
1924     //
1925     if (parseDayName(scursor, send)) {
1926         eatCFWS(scursor, send, isCRLF);
1927         if (scursor == send) {
1928             return false;
1929         }
1930         // day-name should be followed by ',' but we treat it as optional:
1931         if (*scursor == ',') {
1932             scursor++; // eat ','
1933             eatCFWS(scursor, send, isCRLF);
1934         }
1935     }
1936
1937     int maybeMonth = -1;
1938     bool asctimeFormat = false;
1939
1940     // ANSI-C asctime() format is: Wed Jun 30 21:49:08 1993
1941     if (!isdigit(*scursor) && parseMonthName(scursor, send, maybeMonth)) {
1942         asctimeFormat = true;
1943         eatCFWS(scursor, send, isCRLF);
1944     }
1945
1946     //
1947     // 1*2DIGIT representing "day" (of month):
1948     //
1949     int maybeDay;
1950     if (!parseDigits(scursor, send, maybeDay)) {
1951         return false;
1952     }
1953
1954     eatCFWS(scursor, send, isCRLF);
1955     if (scursor == send) {
1956         return false;
1957     }
1958
1959     // ignore ","; bug 54098
1960     if (*scursor == ',') {
1961         scursor++;
1962     }
1963
1964     //
1965     // month-name:
1966     //
1967     if (!asctimeFormat && !parseMonthName(scursor, send, maybeMonth)) {
1968         return false;
1969     }
1970     if (scursor == send) {
1971         return false;
1972     }
1973     assert(maybeMonth >= 0); assert(maybeMonth <= 11);
1974     ++maybeMonth; // 0-11 -> 1-12
1975
1976     eatCFWS(scursor, send, isCRLF);
1977     if (scursor == send) {
1978         return false;
1979     }
1980
1981     // check for "year HH:MM:SS" or only "HH:MM:SS" (or "H:MM:SS")
1982     bool timeAfterYear = true;
1983     if ((send - scursor > 3) && ((scursor[1] == ':') || (scursor[2] == ':'))) {
1984         timeAfterYear = false;  // first read time, then year
1985     }
1986
1987     //
1988     // 2*DIGIT representing "year":
1989     //
1990     int maybeYear = 0;
1991
1992     if (timeAfterYear && !parseDigits(scursor, send, maybeYear)) {
1993         return false;
1994     }
1995
1996     eatCFWS(scursor, send, isCRLF);
1997     int maybeHour;
1998     int maybeMinute;
1999     int maybeSecond;
2000     long int secsEastOfGMT = 0;
2001     QDate maybeDate;
2002     QTime maybeTime;
2003     if (scursor != send) {
2004         //
2005         // time
2006         //
2007         bool timeZoneKnown = true;
2008
2009         if (!parseTime(scursor, send,
2010                        maybeHour, maybeMinute, maybeSecond,
2011                        secsEastOfGMT, timeZoneKnown, isCRLF)) {
2012             return false;
2013         }
2014
2015         // in asctime() the year follows the time
2016         if (!timeAfterYear) {
2017             eatCFWS(scursor, send, isCRLF);
2018             if (scursor == send) {
2019                 return false;
2020             }
2021
2022             if (!parseDigits(scursor, send, maybeYear)) {
2023                 return false;
2024             }
2025         }
2026
2027         // RFC 2822 4.3 processing:
2028         if (maybeYear < 50) {
2029             maybeYear += 2000;
2030         } else if (maybeYear < 1000) {
2031             maybeYear += 1900;
2032         }
2033         // else keep as is
2034         if (maybeYear < 1900) {
2035             return false; // rfc2822, 3.3
2036         }
2037
2038         maybeDate = QDate(maybeYear, maybeMonth, maybeDay);
2039         maybeTime = QTime(maybeHour, maybeMinute, maybeSecond);
2040
2041         if (!maybeDate.isValid() || !maybeTime.isValid()) {
2042             return false;
2043         }
2044     } else {
2045         maybeDate = QDate(maybeYear, maybeMonth, maybeDay);
2046         maybeTime = QTime(0, 0, 0);
2047     }
2048
2049     result = QDateTime(maybeDate, maybeTime, QTimeZone::fromSecondsAheadOfUtc(secsEastOfGMT));
2050     if (!result.isValid()) {
2051         return false;
2052     }
2053     return true;
2054 }
2055
2056 namespace {
2057
2058 Headers::Base *extractHeader(QByteArrayView head, const int headerStart, int &endOfFieldBody)
2059 {
2060     Headers::Base *header = {};
2061
2062     int startOfFieldBody = head.indexOf(':', headerStart);
2063     if (startOfFieldBody < 0) {
2064         return nullptr;
2065     }
2066
2067     const char *rawType = head.constData() + headerStart;
2068     const size_t rawTypeLen = startOfFieldBody - headerStart;
2069
2070     startOfFieldBody++; //skip the ':'
2071     if (startOfFieldBody < head.size() - 1 &&  head[startOfFieldBody] == ' ') { // skip the space after the ':', if there's any
2072         startOfFieldBody++;
2073     }
2074
2075     bool folded = false;
2076     endOfFieldBody = findHeaderLineEnd(head, startOfFieldBody, &folded);
2077
2078     // We might get an invalid mail without a field name, don't crash on that.
2079     if (rawTypeLen > 0) {
2080         header = HeaderFactory::createHeader(rawType, rawTypeLen);
2081     }
2082     if (!header) {
2083         //qCWarning(KMIME_LOG)() << "Returning Generic header of type" << rawType;
2084         header = new Headers::Generic(rawType, rawTypeLen);
2085     }
2086     if (folded) {
2087         const auto unfoldedBody = unfoldHeader(head.constData() + startOfFieldBody, endOfFieldBody - startOfFieldBody);
2088         header->from7BitString(unfoldedBody);
2089     } else {
2090         header->from7BitString(head.constData() + startOfFieldBody, endOfFieldBody - startOfFieldBody);
2091     }
2092
2093     return header;
2094 }
2095
2096 }
2097
2098 std::unique_ptr<KMime::Headers::Base> parseNextHeader(QByteArrayView &head)
2099 {
2100     int endOfFieldBody = 0;
2101     std::unique_ptr<KMime::Headers::Base> header(extractHeader(head, 0, endOfFieldBody));
2102     if (header) {
2103         head = head.mid(endOfFieldBody + 1);
2104     } else {
2105         head = {};
2106     }
2107
2108     return header;
2109 }
2110
2111 void extractHeaderAndBody(const QByteArray &content, QByteArray &header, QByteArray &body)
2112 {
2113     header.clear();
2114     body.clear();
2115
2116     // empty header
2117     if (content.startsWith('\n')) {
2118         body = content.right(content.length() - 1);
2119         return;
2120     }
2121
2122     int pos = content.indexOf("\n\n", 0);
2123     if (pos > -1) {
2124         header = content.left(++pos);    //header *must* end with "\n" !!
2125         body = content.mid(pos + 1);
2126         if (body.startsWith("\n")) {
2127             body = "\n" + body;
2128         }
2129     } else {
2130         header = content;
2131     }
2132 }
2133
2134 QList<Headers::Base *> parseHeaders(const QByteArray &head) {
2135     QList<Headers::Base *> ret;
2136
2137     int cursor = 0;
2138     while (cursor < head.size()) {
2139         const int headerStart = cursor;
2140         int endOfFieldBody;
2141         if (auto header = extractHeader(head, headerStart, endOfFieldBody)) {
2142             ret << header;
2143             cursor = endOfFieldBody + 1;
2144         } else {
2145             break;
2146         }
2147     }
2148
2149     return ret;
2150 }
2151
2152 } // namespace HeaderParsing
2153
2154 } // namespace KMime