kmime/src/kmime_header_parsing.cpp

0001 /*  -*- c++ -*-
0002     kmime_header_parsing.cpp
0003
0004     KMime, the KDE Internet mail/usenet news message library.
0005     SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org>
0006
0007     SPDX-License-Identifier: LGPL-2.0-or-later
0008 */
0009
0010 #include "kmime_header_parsing.h"
0011
0012 #include "kmime_headerfactory_p.h"
0013 #include "kmime_headers.h"
0014 #include "kmime_util.h"
0015 #include "kmime_util_p.h"
0016 #include "kmime_codecs_p.h"
0017 #include "kmime_dateformatter.h"
0018 #include "kmime_debug.h"
0019 #include "kmime_warning_p.h"
0020
0021 #include <KCodecs>
0022
0023 #include <QMap>
0024 #include <QStringDecoder>
0025 #include <QTimeZone>
0026
0027 #include <cassert>
0028 #include <cctype> // for isdigit
0029
0030 using namespace KMime;
0031 using namespace KMime::Types;
0032
0033 namespace KMime
0034 {
0035
0036     namespace Types
0037     {
0038         // Optimization to avoid allocating QStrings when the value isn't encoded
0039         struct KMIME_EXPORT QStringOrQPair {
0040             QStringOrQPair() : qstring(), qpair(nullptr, 0) {}
0041             QString qstring;
0042             QPair<const char *, int> qpair;
0043         };
0044     } // namespace Types
0045
0046 namespace HeaderParsing
0047 {
0048
0049 // parse the encoded-word (scursor points to after the initial '=')
0050 bool parseEncodedWord(const char *&scursor, const char *const send,
0051                       QString &result, QByteArray &language,
0052                       QByteArray &usedCS, const QByteArray &defaultCS)
0053 {
0054     // make sure the caller already did a bit of the work.
0055     assert(*(scursor - 1) == '=');
0056
0057     //
0058     // STEP 1:
0059     // scan for the charset/language portion of the encoded-word
0060     //
0061
0062     char ch = *scursor++;
0063
0064     if (ch != '?') {
0065         // qCDebug(KMIME_LOG) << "first";
0066         //KMIME_WARN_PREMATURE_END_OF( EncodedWord );
0067         return false;
0068     }
0069
0070     // remember start of charset (ie. just after the initial "=?") and
0071     // language (just after the first '*') fields:
0072     const char *charsetStart = scursor;
0073     const char *languageStart = nullptr;
0074
0075     // find delimiting '?' (and the '*' separating charset and language
0076     // tags, if any):
0077     for (; scursor != send ; scursor++) {
0078         if (*scursor == '?') {
0079             break;
0080         } else if (*scursor == '*' && languageStart == nullptr) {
0081             languageStart = scursor + 1;
0082         }
0083     }
0084
0085     // not found? can't be an encoded-word!
0086     if (scursor == send || *scursor != '?') {
0087         // qCDebug(KMIME_LOG) << "second";
0088         KMIME_WARN_PREMATURE_END_OF(EncodedWord);
0089         return false;
0090     }
0091
0092     // extract the language information, if any (if languageStart is 0,
0093     // language will be null, too):
0094     QByteArray maybeLanguage(languageStart, scursor - languageStart);
0095     // extract charset information (keep in mind: the size given to the
0096     // ctor is one off due to the \0 terminator):
0097     QByteArray maybeCharset(charsetStart,
0098                             (languageStart ? languageStart - 1 : scursor) - charsetStart);
0099
0100     //
0101     // STEP 2:
0102     // scan for the encoding portion of the encoded-word
0103     //
0104
0105     // remember start of encoding (just _after_ the second '?'):
0106     scursor++;
0107     const char *encodingStart = scursor;
0108
0109     // find next '?' (ending the encoding tag):
0110     for (; scursor != send ; scursor++) {
0111         if (*scursor == '?') {
0112             break;
0113         }
0114     }
0115
0116     // not found? Can't be an encoded-word!
0117     if (scursor == send || *scursor != '?') {
0118         // qCDebug(KMIME_LOG) << "third";
0119         KMIME_WARN_PREMATURE_END_OF(EncodedWord);
0120         return false;
0121     }
0122
0123     // extract the encoding information:
0124     QByteArray maybeEncoding(encodingStart, scursor - encodingStart);
0125
0126     // qCDebug(KMIME_LOG) << "parseEncodedWord: found charset == \"" << maybeCharset
0127     //         << "\"; language == \"" << maybeLanguage
0128     //         << "\"; encoding == \"" << maybeEncoding << "\"";
0129
0130     //
0131     // STEP 3:
0132     // scan for encoded-text portion of encoded-word
0133     //
0134
0135     // remember start of encoded-text (just after the third '?'):
0136     scursor++;
0137     const char *encodedTextStart = scursor;
0138
0139     // find the '?=' sequence (ending the encoded-text):
0140     for (; scursor != send ; scursor++) {
0141         if (*scursor == '?') {
0142             if (scursor + 1 != send) {
0143                 if (*(scursor + 1) != '=') {     // We expect a '=' after the '?', but we got something else; ignore
0144                     KMIME_WARN << "Stray '?' in q-encoded word, ignoring this.";
0145                     continue;
0146                 } else { // yep, found a '?=' sequence
0147                     scursor += 2;
0148                     break;
0149                 }
0150             } else { // The '?' is the last char, but we need a '=' after it!
0151                 KMIME_WARN_PREMATURE_END_OF(EncodedWord);
0152                 return false;
0153             }
0154         }
0155     }
0156
0157     if (*(scursor - 2) != '?' || *(scursor - 1) != '=' ||
0158             scursor < encodedTextStart + 2) {
0159         KMIME_WARN_PREMATURE_END_OF(EncodedWord);
0160         return false;
0161     }
0162
0163     // set end sentinel for encoded-text:
0164     const char *const encodedTextEnd = scursor - 2;
0165
0166     //
0167     // STEP 4:
0168     // setup decoders for the transfer encoding and the charset
0169     //
0170
0171     // try if there's a codec for the encoding found:
0172     KCodecs::Codec *codec = KCodecs::Codec::codecForName(maybeEncoding);
0173     if (!codec) {
0174         KMIME_WARN_UNKNOWN(Encoding, maybeEncoding);
0175         return false;
0176     }
0177
0178     // get an instance of a corresponding decoder:
0179     KCodecs::Decoder *dec = codec->makeDecoder();
0180     assert(dec);
0181
0182     // try if there's a (text)codec for the charset found:
0183     QStringDecoder textCodec;
0184     if (maybeCharset.isEmpty()) {
0185         textCodec = QStringDecoder(defaultCS.constData());
0186         if (!textCodec.isValid()) {
0187             textCodec = QStringDecoder(QStringDecoder::Latin1);
0188         }
0189         usedCS = cachedCharset(defaultCS);
0190     } else {
0191         textCodec = QStringDecoder(maybeCharset.constData());
0192         if (textCodec.isValid()) {    //no suitable codec found => use default charset
0193             usedCS = cachedCharset(defaultCS);
0194         } else {
0195             textCodec = QStringDecoder(QStringDecoder::Latin1);
0196             usedCS = cachedCharset(maybeCharset);
0197         }
0198     }
0199
0200     if (!textCodec.isValid()) {
0201         KMIME_WARN_UNKNOWN(Charset, maybeCharset);
0202         delete dec;
0203         return false;
0204     };
0205
0206     // qCDebug(KMIME_LOG) << "mimeName(): \"" << textCodec->name() << "\"";
0207
0208     // allocate a temporary buffer to store the 8bit text:
0209     int encodedTextLength = encodedTextEnd - encodedTextStart;
0210     QByteArray buffer;
0211     buffer.resize(codec->maxDecodedSizeFor(encodedTextLength));
0212     char *bbegin = buffer.data();
0213     char *bend = bbegin + buffer.length();
0214
0215     //
0216     // STEP 5:
0217     // do the actual decoding
0218     //
0219
0220     if (!dec->decode(encodedTextStart, encodedTextEnd, bbegin, bend)) {
0221         KMIME_WARN << codec->name() << "codec lies about its maxDecodedSizeFor("
0222                    << encodedTextLength << ")\nresult may be truncated";
0223     }
0224
0225     result = textCodec.decode(QByteArrayView(buffer.data(), bbegin - buffer.data()));
0226
0227     // qCDebug(KMIME_LOG) << "result now: \"" << result << "\"";
0228     // cleanup:
0229     delete dec;
0230     language = maybeLanguage;
0231
0232     return true;
0233 }
0234
// Moves scursor forward over any run of space, tab, CR and LF characters.
// Stops at the first other character or at send, whichever comes first.
static inline void eatWhiteSpace(const char *&scursor, const char *const send)
{
    for (; scursor != send; ++scursor) {
        switch (*scursor) {
        case ' ':
        case '\n':
        case '\t':
        case '\r':
            continue; // still whitespace: keep skipping
        default:
            return;
        }
    }
}
0243
0244 bool parseAtom(const char*&scursor, const char *const send,
0245                QByteArray &result, bool allow8Bit)
0246 {
0247     QPair<const char *, int> maybeResult;
0248
0249     if (parseAtom(scursor, send, maybeResult, allow8Bit)) {
0250         result = QByteArray(maybeResult.first, maybeResult.second);
0251         return true;
0252     }
0253
0254     return false;
0255 }
0256
0257 bool parseAtom(const char*&scursor, const char *const send,
0258                QPair<const char *, int> &result, bool allow8Bit)
0259 {
0260     bool success = false;
0261     const char *start = scursor;
0262
0263     while (scursor != send) {
0264         signed char ch = *scursor++;
0265         if (ch > 0 && isAText(ch)) {
0266             // AText: OK
0267             success = true;
0268         } else if (allow8Bit && ch < 0) {
0269             // 8bit char: not OK, but be tolerant.
0270             KMIME_WARN_8BIT(ch);
0271             success = true;
0272         } else {
0273             // CTL or special - marking the end of the atom:
0274             // re-set sursor to point to the offending
0275             // char and return:
0276             scursor--;
0277             break;
0278         }
0279     }
0280     result.first = start;
0281     result.second = scursor - start;
0282     return success;
0283 }
0284
0285 bool parseToken(const char*&scursor, const char *const send,
0286                 QByteArray &result, ParseTokenFlags flags)
0287 {
0288     QPair<const char *, int> maybeResult;
0289
0290     if (parseToken(scursor, send, maybeResult, flags)) {
0291         result = QByteArray(maybeResult.first, maybeResult.second);
0292         return true;
0293     }
0294
0295     return false;
0296 }
0297
0298 bool parseToken(const char*&scursor, const char *const send,
0299                 QPair<const char *, int> &result, ParseTokenFlags flags)
0300 {
0301     bool success = false;
0302     const char *start = scursor;
0303
0304     while (scursor != send) {
0305         signed char ch = *scursor++;
0306         if (ch > 0 && isTText(ch)) {
0307             // TText: OK
0308             success = true;
0309         } else if ((flags & ParseTokenAllow8Bit) && ch < 0) {
0310             // 8bit char: not OK, but be tolerant.
0311             KMIME_WARN_8BIT(ch);
0312             success = true;
0313         } else if ((flags & ParseTokenRelaxedTText) && ch == '/') {
0314             success = true;
0315         } else {
0316             // CTL or tspecial - marking the end of the atom:
0317             // re-set sursor to point to the offending
0318             // char and return:
0319             scursor--;
0320             break;
0321         }
0322     }
0323     result.first = start;
0324     result.second = scursor - start;
0325     return success;
0326 }
0327
0328 #define READ_ch_OR_FAIL if ( scursor == send ) {        \
0329         KMIME_WARN_PREMATURE_END_OF( GenericQuotedString ); \
0330         return false;                                       \
0331     } else {                                              \
0332         ch = *scursor++;                                    \
0333     }
0334
0335 // known issues:
0336 //
0337 // - doesn't handle quoted CRLF
0338
0339 bool parseGenericQuotedString(const char *&scursor, const char *const send,
0340                               QString &result, bool isCRLF,
0341                               const char openChar, const char closeChar)
0342 {
0343     // We are in a quoted-string or domain-literal or comment and the
0344     // cursor points to the first char after the openChar.
0345     // We will apply unfolding and quoted-pair removal.
0346     // We return when we either encounter the end or unescaped openChar
0347     // or closeChar.
0348     assert(*(scursor - 1) == openChar || *(scursor - 1) == closeChar);
0349
0350     while (scursor != send) {
0351         char ch = *scursor++;
0352
0353         if (ch == closeChar || ch == openChar) {
0354             // end of quoted-string or another opening char:
0355             // let caller decide what to do.
0356             return true;
0357         }
0358
0359         switch (ch) {
0360         case '\\':      // quoted-pair
0361             // misses "\" CRLF LWSP-char handling, see rfc822, 3.4.5
0362             READ_ch_OR_FAIL;
0363             KMIME_WARN_IF_8BIT(ch);
0364             result += QLatin1Char(ch);
0365             break;
0366         case '\r':
0367             // ###
0368             // The case of lonely '\r' is easy to solve, as they're
0369             // not part of Unix Line-ending conventions.
0370             // But I see a problem if we are given Unix-native
0371             // line-ending-mails, where we cannot determine anymore
0372             // whether a given '\n' was part of a CRLF or was occurring
0373             // on it's own.
0374             READ_ch_OR_FAIL;
0375             if (ch != '\n') {
0376                 // CR on it's own...
0377                 KMIME_WARN_LONE(CR);
0378                 result += QLatin1Char('\r');
0379                 scursor--; // points to after the '\r' again
0380             } else {
0381                 // CRLF encountered.
0382                 // lookahead: check for folding
0383                 READ_ch_OR_FAIL;
0384                 if (ch == ' ' || ch == '\t') {
0385                     // correct folding;
0386                     // position cursor behind the CRLF WSP (unfolding)
0387                     // and add the WSP to the result
0388                     result += QLatin1Char(ch);
0389                 } else {
0390                     // this is the "shouldn't happen"-case. There is a CRLF
0391                     // inside a quoted-string without it being part of FWS.
0392                     // We take it verbatim.
0393                     KMIME_WARN_NON_FOLDING(CRLF);
0394                     result += QLatin1String("\r\n");
0395                     // the cursor is decremented again, so's we need not
0396                     // duplicate the whole switch here. "ch" could've been
0397                     // everything (incl. openChar or closeChar).
0398                     scursor--;
0399                 }
0400             }
0401             break;
0402         case '\n':
0403             // Note: CRLF has been handled above already!
0404             // ### LF needs special treatment, depending on whether isCRLF
0405             // is true (we can be sure a lonely '\n' was meant this way) or
0406             // false ('\n' alone could have meant LF or CRLF in the original
0407             // message. This parser assumes CRLF iff the LF is followed by
0408             // either WSP (folding) or NULL (premature end of quoted-string;
0409             // Should be fixed, since NULL is allowed as per rfc822).
0410             READ_ch_OR_FAIL;
0411             if (!isCRLF && (ch == ' ' || ch == '\t')) {
0412                 // folding
0413                 // correct folding
0414                 result += QLatin1Char(ch);
0415             } else {
0416                 // non-folding
0417                 KMIME_WARN_LONE(LF);
0418                 result += QLatin1Char('\n');
0419                 // pos is decremented, so's we need not duplicate the whole
0420                 // switch here. ch could've been everything (incl. <">, "\").
0421                 scursor--;
0422             }
0423             break;
0424         case '=': {
0425             // ### Work around broken clients that send encoded words in quoted-strings
0426             //     For example, older KMail versions.
0427             if (scursor == send) {
0428                 break;
0429             }
0430
0431             const char *oldscursor = scursor;
0432             QString tmp;
0433             QByteArray lang;
0434             QByteArray charset;
0435             if (*scursor++ == '?') {
0436                 --scursor;
0437                 if (parseEncodedWord(scursor, send, tmp, lang, charset)) {
0438                     result += tmp;
0439                     //qDebug() << " tmp " << tmp;
0440                     if (scursor == send) {
0441                         break;
0442                     } else if (*scursor++ == ' ') { //Workaround Bug 362650 thunderbird add space for each new line
0443                         if (scursor == send) {
0444                             --scursor;
0445                             break;
0446                         } else if (*scursor++ == '=') {
0447                             if (scursor == send) {
0448                                 --scursor;
0449                                 --scursor;
0450                                 break;
0451                             } else if (*scursor++ == '?') {
0452                                 --scursor;
0453                                 --scursor;
0454                                 break;
0455                             }
0456                         } else {
0457                             --scursor;
0458                             --scursor;
0459                         }
0460                     } else {
0461                         --scursor;
0462                     }
0463
0464                     break;
0465                 } else {
0466                     scursor = oldscursor;
0467                 }
0468             } else {
0469                 scursor = oldscursor;
0470             }
0471             // fall through
0472             [[fallthrough]];
0473         }
0474         default:
0475             KMIME_WARN_IF_8BIT(ch);
0476             result += QLatin1Char(ch);
0477         }
0478     }
0479
0480     return false;
0481 }
0482
0483 // known issues:
0484 //
0485 // - doesn't handle encoded-word inside comments.
0486
0487 bool parseComment(const char *&scursor, const char *const send,
0488                   QString &result, bool isCRLF, bool reallySave)
0489 {
0490     int commentNestingDepth = 1;
0491     const char *afterLastClosingParenPos = nullptr;
0492     QString maybeCmnt;
0493     const char *oldscursor = scursor;
0494
0495     assert(*(scursor - 1) == '(');
0496
0497     while (commentNestingDepth) {
0498         QString cmntPart;
0499         if (parseGenericQuotedString(scursor, send, cmntPart, isCRLF, '(', ')')) {
0500             assert(*(scursor - 1) == ')' || *(scursor - 1) == '(');
0501             // see the kdoc for above function for the possible conditions
0502             // we have to check:
0503             switch (*(scursor - 1)) {
0504             case ')':
0505                 if (reallySave) {
0506                     // add the chunk that's now surely inside the comment.
0507                     result += maybeCmnt;
0508                     result += cmntPart;
0509                     if (commentNestingDepth > 1) {
0510                         // don't add the outermost ')'...
0511                         result += QLatin1Char(')');
0512                     }
0513                     maybeCmnt.clear();
0514                 }
0515                 afterLastClosingParenPos = scursor;
0516                 --commentNestingDepth;
0517                 break;
0518             case '(':
0519                 if (reallySave) {
0520                     // don't add to "result" yet, because we might find that we
0521                     // are already outside the (broken) comment...
0522                     maybeCmnt += cmntPart;
0523                     maybeCmnt += QLatin1Char('(');
0524                 }
0525                 ++commentNestingDepth;
0526                 break;
0527             default: assert(0);
0528             } // switch
0529         } else {
0530             // !parseGenericQuotedString, ie. premature end
0531             if (afterLastClosingParenPos) {
0532                 scursor = afterLastClosingParenPos;
0533             } else {
0534                 scursor = oldscursor;
0535             }
0536             return false;
0537         }
0538     } // while
0539
0540     return true;
0541 }
0542
0543 // known issues: none.
0544
0545 bool parsePhrase(const char *&scursor, const char *const send,
0546                  QString &result, bool isCRLF)
0547 {
0548     enum {
0549         None, Phrase, Atom, EncodedWord, QuotedString
0550     } found = None;
0551
0552     QString tmp;
0553     QByteArray lang;
0554     QByteArray charset;
0555     QPair<const char *, int> tmpAtom;
0556     const char *successfullyParsed = nullptr;
0557     // only used by the encoded-word branch
0558     const char *oldscursor;
0559     // used to suppress whitespace between adjacent encoded-words
0560     // (rfc2047, 6.2):
0561     bool lastWasEncodedWord = false;
0562
0563     while (scursor != send) {
0564         char ch = *scursor++;
0565         switch (ch) {
0566         case '.': // broken, but allow for intorop's sake
0567             if (found == None) {
0568                 --scursor;
0569                 return false;
0570             } else {
0571                 if (scursor != send && (*scursor == ' ' || *scursor == '\t')) {
0572                     result += QLatin1String(". ");
0573                 } else {
0574                     result += QLatin1Char('.');
0575                 }
0576                 successfullyParsed = scursor;
0577             }
0578             break;
0579         case '"': // quoted-string
0580             tmp.clear();
0581             if (parseGenericQuotedString(scursor, send, tmp, isCRLF, '"', '"')) {
0582                 successfullyParsed = scursor;
0583                 assert(*(scursor - 1) == '"');
0584                 switch (found) {
0585                 case None:
0586                     found = QuotedString;
0587                     break;
0588                 case Phrase:
0589                 case Atom:
0590                 case EncodedWord:
0591                 case QuotedString:
0592                     found = Phrase;
0593                     result += QLatin1Char(' ');   // rfc822, 3.4.4
0594                     break;
0595                 default:
0596                     assert(0);
0597                 }
0598                 lastWasEncodedWord = false;
0599                 result += tmp;
0600             } else {
0601                 // premature end of quoted string.
0602                 // What to do? Return leading '"' as special? Return as quoted-string?
0603                 // We do the latter if we already found something, else signal failure.
0604                 if (found == None) {
0605                     return false;
0606                 } else {
0607                     result += QLatin1Char(' ');   // rfc822, 3.4.4
0608                     result += tmp;
0609                     return true;
0610                 }
0611             }
0612             break;
0613         case '(': // comment
0614             // parse it, but ignore content:
0615             tmp.clear();
0616             if (parseComment(scursor, send, tmp, isCRLF,
0617                              false /*don't bother with the content*/)) {
0618                 successfullyParsed = scursor;
0619                 lastWasEncodedWord = false; // strictly interpreting rfc2047, 6.2
0620             } else {
0621                 if (found == None) {
0622                     return false;
0623                 } else {
0624                     scursor = successfullyParsed;
0625                     return true;
0626                 }
0627             }
0628             break;
0629         case '=': // encoded-word
0630             tmp.clear();
0631             oldscursor = scursor;
0632             lang.clear();
0633             charset.clear();
0634             if (parseEncodedWord(scursor, send, tmp, lang, charset)) {
0635                 successfullyParsed = scursor;
0636                 switch (found) {
0637                 case None:
0638                     found = EncodedWord;
0639                     break;
0640                 case Phrase:
0641                 case EncodedWord:
0642                 case Atom:
0643                 case QuotedString:
0644                     if (!lastWasEncodedWord) {
0645                         result += QLatin1Char(' ');   // rfc822, 3.4.4
0646                     }
0647                     found = Phrase;
0648                     break;
0649                 default: assert(0);
0650                 }
0651                 lastWasEncodedWord = true;
0652                 result += tmp;
0653                 break;
0654             } else {
0655                 // parse as atom:
0656                 scursor = oldscursor;
0657             }
0658             [[fallthrough]];
0659             // fall though...
0660
0661         default: //atom
0662             scursor--;
0663             if (parseAtom(scursor, send, tmpAtom, true /* allow 8bit */)) {
0664                 successfullyParsed = scursor;
0665                 switch (found) {
0666                 case None:
0667                     found = Atom;
0668                     break;
0669                 case Phrase:
0670                 case Atom:
0671                 case EncodedWord:
0672                 case QuotedString:
0673                     found = Phrase;
0674                     result += QLatin1Char(' ');   // rfc822, 3.4.4
0675                     break;
0676                 default:
0677                     assert(0);
0678                 }
0679                 lastWasEncodedWord = false;
0680                 result += QLatin1String(tmpAtom.first, tmpAtom.second);
0681             } else {
0682                 if (found == None) {
0683                     return false;
0684                 } else {
0685                     scursor = successfullyParsed;
0686                     return true;
0687                 }
0688             }
0689         }
0690         eatWhiteSpace(scursor, send);
0691     }
0692
0693     return found != None;
0694 }
0695
0696 bool parseDotAtom(const char *&scursor, const char *const send,
0697                   QByteArray &result, bool isCRLF)
0698 {
0699     eatCFWS(scursor, send, isCRLF);
0700
0701     // always points to just after the last atom parsed:
0702     const char *successfullyParsed;
0703
0704     QByteArray tmp;
0705     if (!parseAtom(scursor, send, tmp, false /* no 8bit */)) {
0706         return false;
0707     }
0708     result += tmp;
0709     successfullyParsed = scursor;
0710
0711     while (scursor != send) {
0712
0713         // end of header or no '.' -> return
0714         if (scursor == send || *scursor != '.') {
0715             return true;
0716         }
0717         scursor++; // eat '.'
0718
0719         if (scursor == send || !isAText(*scursor)) {
0720             // end of header or no AText, but this time following a '.'!:
0721             // reset cursor to just after last successfully parsed char and
0722             // return:
0723             scursor = successfullyParsed;
0724             return true;
0725         }
0726
0727         // try to parse the next atom:
0728         QByteArray maybeAtom;
0729         if (!parseAtom(scursor, send, maybeAtom, false /*no 8bit*/)) {
0730             scursor = successfullyParsed;
0731             return true;
0732         }
0733
0734         result += '.';
0735         result += maybeAtom;
0736         successfullyParsed = scursor;
0737     }
0738
0739     scursor = successfullyParsed;
0740     return true;
0741 }
0742
0743 void eatCFWS(const char *&scursor, const char *const send, bool isCRLF)
0744 {
0745     QString dummy;
0746
0747     while (scursor != send) {
0748         const char *oldscursor = scursor;
0749
0750         char ch = *scursor++;
0751
0752         switch (ch) {
0753         case ' ':
0754         case '\t': // whitespace
0755         case '\r':
0756         case '\n': // folding
0757             continue;
0758
0759         case '(': // comment
0760             if (parseComment(scursor, send, dummy, isCRLF, false /*don't save*/)) {
0761                 continue;
0762             }
0763             scursor = oldscursor;
0764             return;
0765
0766         default:
0767             scursor = oldscursor;
0768             return;
0769         }
0770     }
0771 }
0772
0773 bool parseDomain(const char *&scursor, const char *const send,
0774                  QString &result, bool isCRLF)
0775 {
0776     eatCFWS(scursor, send, isCRLF);
0777     if (scursor == send) {
0778         return false;
0779     }
0780
0781     // domain := dot-atom / domain-literal / atom *("." atom)
0782     //
0783     // equivalent to:
0784     // domain = dot-atom / domain-literal,
0785     // since parseDotAtom does allow CFWS between atoms and dots
0786
0787     if (*scursor == '[') {
0788         // domain-literal:
0789         QString maybeDomainLiteral;
0790         // eat '[':
0791         scursor++;
0792         while (parseGenericQuotedString(scursor, send, maybeDomainLiteral,
0793                                         isCRLF, '[', ']')) {
0794             if (scursor == send) {
0795                 // end of header: check for closing ']':
0796                 if (*(scursor - 1) == ']') {
0797                     // OK, last char was ']':
0798                     result = maybeDomainLiteral;
0799                     return true;
0800                 } else {
0801                     // not OK, domain-literal wasn't closed:
0802                     return false;
0803                 }
0804             }
0805             // we hit openChar in parseGenericQuotedString.
0806             // include it in maybeDomainLiteral and keep on parsing:
0807             if (*(scursor - 1) == '[') {
0808                 maybeDomainLiteral += QLatin1Char('[');
0809                 continue;
0810             }
0811             // OK, real end of domain-literal:
0812             result = maybeDomainLiteral;
0813             return true;
0814         }
0815     } else {
0816         // dot-atom:
0817         QByteArray maybeDotAtom;
0818         if (parseDotAtom(scursor, send, maybeDotAtom, isCRLF)) {
0819             // Domain may end with '.', if so preserve it'
0820             if (scursor != send && *scursor == '.') {
0821                 maybeDotAtom += '.';
0822                 scursor++;
0823             }
0824             result = QString::fromLatin1(maybeDotAtom);
0825             return true;
0826         }
0827     }
0828     return false;
0829 }
0830
0831 bool parseObsRoute(const char *&scursor, const char *const send,
0832                    QStringList &result, bool isCRLF, bool save)
0833 {
0834     while (scursor != send) {
0835         eatCFWS(scursor, send, isCRLF);
0836         if (scursor == send) {
0837             return false;
0838         }
0839
0840         // empty entry:
0841         if (*scursor == ',') {
0842             scursor++;
0843             if (save) {
0844                 result.append(QString());
0845             }
0846             continue;
0847         }
0848
0849         // empty entry ending the list:
0850         if (*scursor == ':') {
0851             scursor++;
0852             if (save) {
0853                 result.append(QString());
0854             }
0855             return true;
0856         }
0857
0858         // each non-empty entry must begin with '@':
0859         if (*scursor != '@') {
0860             return false;
0861         } else {
0862             scursor++;
0863         }
0864
0865         QString maybeDomain;
0866         if (!parseDomain(scursor, send, maybeDomain, isCRLF)) {
0867             return false;
0868         }
0869         if (save) {
0870             result.append(maybeDomain);
0871         }
0872
0873         // eat the following (optional) comma:
0874         eatCFWS(scursor, send, isCRLF);
0875         if (scursor == send) {
0876             return false;
0877         }
0878         if (*scursor == ':') {
0879             scursor++;
0880             return true;
0881         }
0882         if (*scursor == ',') {
0883             scursor++;
0884         }
0885     }
0886
0887     return false;
0888 }
0889
0890 bool parseAddrSpec(const char *&scursor, const char *const send,
0891                    AddrSpec &result, bool isCRLF)
0892 {
0893     //
0894     // STEP 1:
0895     // local-part := dot-atom / quoted-string / word *("." word)
0896     //
0897     // this is equivalent to:
0898     // local-part := word *("." word)
0899
0900     QString maybeLocalPart;
0901     QString tmp;
0902     QPair<const char *, int> tmpAtom;
0903
0904     while (scursor != send) {
0905         // first, eat any whitespace
0906         eatCFWS(scursor, send, isCRLF);
0907
0908         char ch = *scursor++;
0909         switch (ch) {
0910         case '.': // dot
0911             maybeLocalPart += QLatin1Char('.');
0912             break;
0913
0914         case '@':
0915             goto SAW_AT_SIGN;
0916             break;
0917
0918         case '"': // quoted-string
0919             tmp.clear();
0920             if (parseGenericQuotedString(scursor, send, tmp, isCRLF, '"', '"')) {
0921                 maybeLocalPart += tmp;
0922             } else {
0923                 return false;
0924             }
0925             break;
0926
0927         default: // atom
0928             scursor--; // re-set scursor to point to ch again
0929             if (parseAtom(scursor, send, tmpAtom, false /* no 8bit */)) {
0930                 maybeLocalPart += QLatin1String(tmpAtom.first, tmpAtom.second);
0931             } else {
0932                 return false; // parseAtom can only fail if the first char is non-atext.
0933             }
0934             break;
0935         }
0936     }
0937
0938     return false;
0939
0940     //
0941     // STEP 2:
0942     // domain
0943     //
0944
0945 SAW_AT_SIGN:
0946
0947     assert(*(scursor - 1) == '@');
0948
0949     QString maybeDomain;
0950     if (!parseDomain(scursor, send, maybeDomain, isCRLF)) {
0951         return false;
0952     }
0953
0954     result.localPart = maybeLocalPart;
0955     result.domain = maybeDomain;
0956
0957     return true;
0958 }
0959
0960 bool parseAngleAddr(const char *&scursor, const char *const send,
0961                     AddrSpec &result, bool isCRLF)
0962 {
0963     // first, we need an opening angle bracket:
0964     eatCFWS(scursor, send, isCRLF);
0965     if (scursor == send || *scursor != '<') {
0966         return false;
0967     }
0968     scursor++; // eat '<'
0969
0970     eatCFWS(scursor, send, isCRLF);
0971     if (scursor == send) {
0972         return false;
0973     }
0974
0975     if (*scursor == '@' || *scursor == ',') {
0976         // obs-route: parse, but ignore:
0977         KMIME_WARN << "obsolete source route found! ignoring.";
0978         QStringList dummy;
0979         if (!parseObsRoute(scursor, send, dummy,
0980                            isCRLF, false /* don't save */)) {
0981             return false;
0982         }
0983         // angle-addr isn't complete until after the '>':
0984         if (scursor == send) {
0985             return false;
0986         }
0987     }
0988
0989     // parse addr-spec:
0990     AddrSpec maybeAddrSpec;
0991     if (!parseAddrSpec(scursor, send, maybeAddrSpec, isCRLF)) {
0992         return false;
0993     }
0994
0995     eatCFWS(scursor, send, isCRLF);
0996     if (scursor == send || *scursor != '>') {
0997         return false;
0998     }
0999     scursor++;
1000
1001     result = maybeAddrSpec;
1002     return true;
1003
1004 }
1005
1006 static QString stripQuotes(const QString &input)
1007 {
1008     const QLatin1Char quotes('"');
1009     if (input.startsWith(quotes) && input.endsWith(quotes)) {
1010         QString stripped(input.mid(1, input.size() - 2));
1011         return stripped;
1012     } else {
1013         return input;
1014     }
1015 }
1016
1017 bool parseMailbox(const char *&scursor, const char *const send,
1018                   Mailbox &result, bool isCRLF)
1019 {
1020     eatCFWS(scursor, send, isCRLF);
1021     if (scursor == send) {
1022         return false;
1023     }
1024
1025     AddrSpec maybeAddrSpec;
1026     QString maybeDisplayName;
1027
1028     // first, try if it's a vanilla addr-spec:
1029     const char *oldscursor = scursor;
1030     if (parseAddrSpec(scursor, send, maybeAddrSpec, isCRLF)) {
1031         result.setAddress(maybeAddrSpec);
1032         // check for the obsolete form of display-name (as comment):
1033         eatWhiteSpace(scursor, send);
1034         if (scursor != send && *scursor == '(') {
1035             scursor++;
1036             if (!parseComment(scursor, send, maybeDisplayName, isCRLF, true /*keep*/)) {
1037                 return false;
1038             }
1039         }
1040         result.setName(stripQuotes(maybeDisplayName));
1041         return true;
1042     }
1043     scursor = oldscursor;
1044
1045     // second, see if there's a display-name:
1046     if (!parsePhrase(scursor, send, maybeDisplayName, isCRLF)) {
1047         // failed: reset cursor, note absent display-name
1048         maybeDisplayName.clear();
1049         scursor = oldscursor;
1050     } else {
1051         // succeeded: eat CFWS
1052         eatCFWS(scursor, send, isCRLF);
1053         if (scursor == send) {
1054             return false;
1055         }
1056     }
1057
1058     // third, parse the angle-addr:
1059     if (!parseAngleAddr(scursor, send, maybeAddrSpec, isCRLF)) {
1060         return false;
1061     }
1062
1063     if (maybeDisplayName.isNull()) {
1064         // check for the obsolete form of display-name (as comment):
1065         eatWhiteSpace(scursor, send);
1066         if (scursor != send && *scursor == '(') {
1067             scursor++;
1068             if (!parseComment(scursor, send, maybeDisplayName, isCRLF, true /*keep*/)) {
1069                 return false;
1070             }
1071         }
1072     }
1073
1074     result.setName(stripQuotes(maybeDisplayName));
1075     result.setAddress(maybeAddrSpec);
1076     return true;
1077 }
1078
1079 bool parseGroup(const char *&scursor, const char *const send,
1080                 Address &result, bool isCRLF)
1081 {
1082     // group         := display-name ":" [ mailbox-list / CFWS ] ";" [CFWS]
1083     //
1084     // equivalent to:
1085     // group   := display-name ":" [ obs-mbox-list ] ";"
1086
1087     eatCFWS(scursor, send, isCRLF);
1088     if (scursor == send) {
1089         return false;
1090     }
1091
1092     // get display-name:
1093     QString maybeDisplayName;
1094     if (!parsePhrase(scursor, send, maybeDisplayName, isCRLF)) {
1095         return false;
1096     }
1097
1098     // get ":":
1099     eatCFWS(scursor, send, isCRLF);
1100     if (scursor == send || *scursor != ':') {
1101         return false;
1102     }
1103
1104     // KDE5 TODO: Don't expose displayName as public, but rather add setter for it that
1105     //            automatically calls removeBidiControlChars
1106     result.displayName = removeBidiControlChars(maybeDisplayName);
1107
1108     // get obs-mbox-list (may contain empty entries):
1109     scursor++;
1110     while (scursor != send) {
1111         eatCFWS(scursor, send, isCRLF);
1112         if (scursor == send) {
1113             return false;
1114         }
1115
1116         // empty entry:
1117         if (*scursor == ',') {
1118             scursor++;
1119             continue;
1120         }
1121
1122         // empty entry ending the list:
1123         if (*scursor == ';') {
1124             scursor++;
1125             return true;
1126         }
1127
1128         Mailbox maybeMailbox;
1129         if (!parseMailbox(scursor, send, maybeMailbox, isCRLF)) {
1130             return false;
1131         }
1132         result.mailboxList.append(maybeMailbox);
1133
1134         eatCFWS(scursor, send, isCRLF);
1135         // premature end:
1136         if (scursor == send) {
1137             return false;
1138         }
1139         // regular end of the list:
1140         if (*scursor == ';') {
1141             scursor++;
1142             return true;
1143         }
1144         // eat regular list entry separator:
1145         if (*scursor == ',') {
1146             scursor++;
1147         }
1148     }
1149     return false;
1150 }
1151
1152 bool parseAddress(const char *&scursor, const char *const send,
1153                   Address &result, bool isCRLF)
1154 {
1155     // address       := mailbox / group
1156
1157     eatCFWS(scursor, send, isCRLF);
1158     if (scursor == send) {
1159         return false;
1160     }
1161
1162     // first try if it's a single mailbox:
1163     Mailbox maybeMailbox;
1164     const char *oldscursor = scursor;
1165     if (parseMailbox(scursor, send, maybeMailbox, isCRLF)) {
1166         // yes, it is:
1167         result.displayName.clear();
1168         result.mailboxList.append(maybeMailbox);
1169         return true;
1170     }
1171     scursor = oldscursor;
1172
1173     Address maybeAddress;
1174
1175     // no, it's not a single mailbox. Try if it's a group:
1176     if (!parseGroup(scursor, send, maybeAddress, isCRLF)) {
1177         return false;
1178     }
1179
1180     result = maybeAddress;
1181     return true;
1182 }
1183
1184 bool parseAddressList(const char *&scursor, const char *const send,
1185                       AddressList &result, bool isCRLF)
1186 {
1187     while (scursor != send) {
1188         eatCFWS(scursor, send, isCRLF);
1189         // end of header: this is OK.
1190         if (scursor == send) {
1191             return true;
1192         }
1193         // empty entry: ignore:
1194         if (*scursor == ',') {
1195             scursor++;
1196             continue;
1197         }
1198         // broken clients might use ';' as list delimiter, accept that as well
1199         if (*scursor == ';') {
1200             scursor++;
1201             continue;
1202         }
1203
1204         // parse one entry
1205         Address maybeAddress;
1206         if (!parseAddress(scursor, send, maybeAddress, isCRLF)) {
1207             return false;
1208         }
1209         result.append(maybeAddress);
1210
1211         eatCFWS(scursor, send, isCRLF);
1212         // end of header: this is OK.
1213         if (scursor == send) {
1214             return true;
1215         }
1216         // comma separating entries: eat it.
1217         if (*scursor == ',') {
1218             scursor++;
1219         }
1220     }
1221     return true;
1222 }
1223
1224 static bool parseParameter(const char *&scursor, const char *const send,
1225                            QPair<QString, QStringOrQPair> &result, bool isCRLF)
1226 {
1227     // parameter = regular-parameter / extended-parameter
1228     // regular-parameter = regular-parameter-name "=" value
1229     // extended-parameter =
1230     // value = token / quoted-string
1231     //
1232     // note that rfc2231 handling is out of the scope of this function.
1233     // Therefore we return the attribute as QByteArray and the value as
1234     // (start,length) tuple if we see that the value is encoded
1235     // (trailing asterisk), for parseParameterList to decode...
1236
1237     eatCFWS(scursor, send, isCRLF);
1238     if (scursor == send) {
1239         return false;
1240     }
1241
1242     //
1243     // parse the parameter name:
1244     //
1245     QByteArray tmpAttr;
1246     if (!parseToken(scursor, send, tmpAttr, ParseTokenNoFlag)) {
1247         return false;
1248     }
1249     // FIXME: we could use QMap<QByteArray, ...> in the API for parameters
1250     QString maybeAttribute = QString::fromLatin1(tmpAttr);
1251
1252     eatCFWS(scursor, send, isCRLF);
1253     // premature end: not OK (haven't seen '=' yet).
1254     if (scursor == send || *scursor != '=') {
1255         return false;
1256     }
1257     scursor++; // eat '='
1258
1259     eatCFWS(scursor, send, isCRLF);
1260     if (scursor == send) {
1261         // don't choke on attribute=, meaning the value was omitted:
1262         if (maybeAttribute.endsWith(QLatin1Char('*'))) {
1263             KMIME_WARN << "attribute ends with \"*\", but value is empty!"
1264                        "Chopping away \"*\".";
1265             maybeAttribute.chop(1);
1266         }
1267         result = qMakePair(maybeAttribute.toLower(), QStringOrQPair());
1268         return true;
1269     }
1270
1271     const char *oldscursor = scursor;
1272
1273     //
1274     // parse the parameter value:
1275     //
1276     QStringOrQPair maybeValue;
1277     if (*scursor == '"') {
1278         // value is a quoted-string:
1279         scursor++;
1280         if (maybeAttribute.endsWith(QLatin1Char('*'))) {
1281             // attributes ending with "*" designate extended-parameters,
1282             // which cannot have quoted-strings as values. So we remove the
1283             // trailing "*" to not confuse upper layers.
1284             KMIME_WARN << "attribute ends with \"*\", but value is a quoted-string!"
1285                        "Chopping away \"*\".";
1286             maybeAttribute.chop(1);
1287         }
1288
1289         if (!parseGenericQuotedString(scursor, send, maybeValue.qstring, isCRLF)) {
1290             scursor = oldscursor;
1291             result = qMakePair(maybeAttribute.toLower(), QStringOrQPair());
1292             return false; // this case needs further processing by upper layers!!
1293         }
1294     } else {
1295         // value is a token:
1296         if (!parseToken(scursor, send, maybeValue.qpair, ParseTokenRelaxedTText)) {
1297             scursor = oldscursor;
1298             result = qMakePair(maybeAttribute.toLower(), QStringOrQPair());
1299             return false; // this case needs further processing by upper layers!!
1300         }
1301     }
1302
1303     result = qMakePair(maybeAttribute.toLower(), maybeValue);
1304     return true;
1305 }
1306
1307 static bool parseRawParameterList(const char *&scursor, const char *const send,
1308                                   QMap<QString, QStringOrQPair> &result,
1309                                   bool isCRLF)
1310 {
1311     // we use parseParameter() consecutively to obtain a map of raw
1312     // attributes to raw values. "Raw" here means that we don't do
1313     // rfc2231 decoding and concatenation. This is left to
1314     // parseParameterList(), which will call this function.
1315     //
1316     // The main reason for making this chunk of code a separate
1317     // (private) method is that we can deal with broken parameters
1318     // _here_ and leave the rfc2231 handling solely to
1319     // parseParameterList(), which will still be enough work.
1320     while (scursor != send) {
1321         eatCFWS(scursor, send, isCRLF);
1322         // empty entry ending the list: OK.
1323         if (scursor == send) {
1324             return true;
1325         }
1326         // empty list entry: ignore.
1327         if (*scursor == ';') {
1328             scursor++;
1329             continue;
1330         }
1331         QPair<QString, QStringOrQPair> maybeParameter;
1332         if (!parseParameter(scursor, send, maybeParameter, isCRLF)) {
1333             // we need to do a bit of work if the attribute is not
1334             // NULL. These are the cases marked with "needs further
1335             // processing" in parseParameter(). Specifically, parsing of the
1336             // token or the quoted-string, which should represent the value,
1337             // failed. We take the easy way out and simply search for the
1338             // next ';' to start parsing again. (Another option would be to
1339             // take the text between '=' and ';' as value)
1340             if (maybeParameter.first.isNull()) {
1341                 return false;
1342             }
1343             while (scursor != send) {
1344                 if (*scursor++ == ';') {
1345                     goto IS_SEMICOLON;
1346                 }
1347             }
1348             // scursor == send case: end of list.
1349             return true;
1350         IS_SEMICOLON:
1351             // *scursor == ';' case: parse next entry.
1352             continue;
1353         }
1354         // successful parsing brings us here:
1355         result.insert(maybeParameter.first, maybeParameter.second);
1356
1357         eatCFWS(scursor, send, isCRLF);
1358         // end of header: ends list.
1359         if (scursor == send) {
1360             return true;
1361         }
1362         // regular separator: eat it.
1363         if (*scursor == ';') {
1364             scursor++;
1365         }
1366     }
1367     return true;
1368 }
1369
1370 static void decodeRFC2231Value(KCodecs::Codec *&rfc2231Codec,
1371                                QStringDecoder &textcodec,
1372                                bool isContinuation, QString &value,
1373                                QPair<const char *, int> &source, QByteArray &charset)
1374 {
1375     //
1376     // parse the raw value into (charset,language,text):
1377     //
1378
1379     const char *decBegin = source.first;
1380     const char *decCursor = decBegin;
1381     const char *decEnd = decCursor + source.second;
1382
1383     if (!isContinuation) {
1384         // find the first single quote
1385         while (decCursor != decEnd) {
1386             if (*decCursor == '\'') {
1387                 break;
1388             } else {
1389                 decCursor++;
1390             }
1391         }
1392
1393         if (decCursor == decEnd) {
1394             // there wasn't a single single quote at all!
1395             // take the whole value to be in latin-1:
1396             KMIME_WARN << "No charset in extended-initial-value."
1397                        "Assuming \"iso-8859-1\".";
1398             value += QString::fromLatin1(decBegin, source.second);
1399             return;
1400         }
1401
1402         charset = QByteArray(decBegin, decCursor - decBegin);
1403
1404         const char *oldDecCursor = ++decCursor;
1405         // find the second single quote (we ignore the language tag):
1406         while (decCursor != decEnd) {
1407             if (*decCursor == '\'') {
1408                 break;
1409             } else {
1410                 decCursor++;
1411             }
1412         }
1413         if (decCursor == decEnd) {
1414             KMIME_WARN << "No language in extended-initial-value."
1415                        "Trying to recover.";
1416             decCursor = oldDecCursor;
1417         } else {
1418             decCursor++;
1419         }
1420
1421         // decCursor now points to the start of the
1422         // "extended-other-values":
1423
1424         //
1425         // get the decoders:
1426         //
1427
1428         textcodec = QStringDecoder(charset.constData());
1429         if (!textcodec.isValid()) {
1430             KMIME_WARN_UNKNOWN(Charset, charset);
1431         }
1432     }
1433
1434     if (!rfc2231Codec) {
1435         rfc2231Codec = KCodecs::Codec::codecForName("x-kmime-rfc2231");
1436         assert(rfc2231Codec);
1437     }
1438
1439     if (!textcodec.isValid()) {
1440         value += QString::fromLatin1(decCursor, decEnd - decCursor);
1441         return;
1442     }
1443
1444     KCodecs::Decoder *dec = rfc2231Codec->makeDecoder();
1445     assert(dec);
1446
1447     //
1448     // do the decoding:
1449     //
1450
1451     QByteArray buffer;
1452     buffer.resize(rfc2231Codec->maxDecodedSizeFor(decEnd - decCursor));
1453     QByteArray::Iterator bit = buffer.begin();
1454     QByteArray::ConstIterator bend = buffer.end();
1455
1456     if (!dec->decode(decCursor, decEnd, bit, bend)) {
1457         KMIME_WARN << rfc2231Codec->name()
1458                    << "codec lies about its maxDecodedSizeFor()"
1459                    << Qt::endl
1460                    << "result may be truncated";
1461     }
1462
1463     value += textcodec.decode(QByteArrayView(buffer.begin(), bit - buffer.begin()));
1464
1465     // qCDebug(KMIME_LOG) << "value now: \"" << value << "\"";
1466     // cleanup:
1467     delete dec;
1468 }
1469
1470 // known issues:
1471 //  - permutes rfc2231 continuations when the total number of parts
1472 //    exceeds 10 (other-sections then becomes *xy, ie. two digits)
1473
1474 bool parseParameterListWithCharset(const char *&scursor,
1475                                    const char *const send,
1476                                    QMap<QString, QString> &result,
1477                                    QByteArray &charset, bool isCRLF)
1478 {
1479 // parse the list into raw attribute-value pairs:
1480     QMap<QString, QStringOrQPair> rawParameterList;
1481     if (!parseRawParameterList(scursor, send, rawParameterList, isCRLF)) {
1482         return false;
1483     }
1484
1485     if (rawParameterList.isEmpty()) {
1486         return true;
1487     }
1488
1489     // decode rfc 2231 continuations and alternate charset encoding:
1490
1491     // NOTE: this code assumes that what QMapIterator delivers is sorted
1492     // by the key!
1493
1494     KCodecs::Codec *rfc2231Codec = nullptr;
1495     QStringDecoder textcodec;
1496     QString attribute;
1497     QString value;
1498     enum Mode {
1499         NoMode = 0x0, Continued = 0x1, Encoded = 0x2
1500     };
1501
1502     enum EncodingMode {
1503         NoEncoding,
1504         RFC2047,
1505         RFC2231
1506     };
1507
1508     QMap<QString, QStringOrQPair>::Iterator it;
1509     QMap<QString, QStringOrQPair>::Iterator end = rawParameterList.end();
1510
1511     for (it = rawParameterList.begin() ; it != end ; ++it) {
1512         if (attribute.isNull() || !it.key().startsWith(attribute)) {
1513             //
1514             // new attribute:
1515             //
1516
1517             // store the last attribute/value pair in the result map now:
1518             if (!attribute.isNull()) {
1519                 result.insert(attribute, value);
1520             }
1521             // and extract the information from the new raw attribute:
1522             value.clear();
1523             attribute = it.key();
1524             int mode = NoMode;
1525             EncodingMode encodingMode = NoEncoding;
1526
1527             // is the value rfc2331-encoded?
1528             if (attribute.endsWith(QLatin1Char('*'))) {
1529                 attribute.chop(1);
1530                 mode |= Encoded;
1531                 encodingMode = RFC2231;
1532             }
1533             // is the value rfc2047-encoded?
1534             if (!(*it).qstring.isNull() && (*it).qstring.contains(QLatin1String("=?"))) {
1535                 mode |= Encoded;
1536                 encodingMode = RFC2047;
1537             }
1538             // is the value continued?
1539             if (attribute.endsWith(QLatin1String("*0"))) {
1540                 attribute.chop(2);
1541                 mode |= Continued;
1542             }
1543             //
1544             // decode if necessary:
1545             //
1546             if (mode & Encoded) {
1547                 if (encodingMode == RFC2231) {
1548                     decodeRFC2231Value(rfc2231Codec, textcodec,
1549                                        false, /* isn't continuation */
1550                                        value, (*it).qpair, charset);
1551                 } else if (encodingMode == RFC2047) {
1552                     value += KCodecs::decodeRFC2047String((*it).qstring.toLatin1(), &charset);
1553                 }
1554             } else {
1555                 // not encoded.
1556                 if ((*it).qpair.first) {
1557                     value += QString::fromLatin1((*it).qpair.first, (*it).qpair.second);
1558                 } else {
1559                     value += (*it).qstring;
1560                 }
1561             }
1562
1563             //
1564             // shortcut-processing when the value isn't encoded:
1565             //
1566
1567             if (!(mode & Continued)) {
1568                 // save result already:
1569                 result.insert(attribute, value);
1570                 // force begin of a new attribute:
1571                 attribute.clear();
1572             }
1573         } else { // it.key().startsWith( attribute )
1574             //
1575             // continuation
1576             //
1577
1578             // ignore the section and trust QMap to have sorted the keys:
1579             if (it.key().endsWith(QLatin1Char('*'))) {
1580                 // encoded
1581                 decodeRFC2231Value(rfc2231Codec, textcodec,
1582                                    true, /* is continuation */
1583                                    value, (*it).qpair, charset);
1584             } else {
1585                 // not encoded
1586                 if ((*it).qpair.first) {
1587                     value += QString::fromLatin1((*it).qpair.first, (*it).qpair.second);
1588                 } else {
1589                     value += (*it).qstring;
1590                 }
1591             }
1592         }
1593     }
1594     // write last attr/value pair:
1595     if (!attribute.isNull()) {
1596         result.insert(attribute, value);
1597     }
1598
1599     return true;
1600 }
1601
1602 bool parseParameterList(const char *&scursor, const char *const send,
1603                         QMap<QString, QString> &result, bool isCRLF)
1604 {
1605     QByteArray charset;
1606     return parseParameterListWithCharset(scursor, send, result, charset, isCRLF);
1607 }
1608
// Three-letter English day-of-week abbreviations, Sunday first.
static const char stdDayNames[][4] = {
    "Sun",
    "Mon",
    "Tue",
    "Wed",
    "Thu",
    "Fri",
    "Sat",
};
// Number of entries in stdDayNames.
static const int stdDayNamesLen = sizeof(stdDayNames) / sizeof(stdDayNames[0]);
1613
1614 static bool parseDayName(const char *&scursor, const char *const send)
1615 {
1616     // check bounds:
1617     if (send - scursor < 3) {
1618         return false;
1619     }
1620
1621     for (int i = 0 ; i < stdDayNamesLen ; ++i) {
1622         if (qstrnicmp(scursor, stdDayNames[i], 3) == 0) {
1623             scursor += 3;
1624             // qCDebug(KMIME_LOG) << "found" << stdDayNames[i];
1625             return true;
1626         }
1627     }
1628
1629     return false;
1630 }
1631
// Three-letter English month abbreviations, January first, so that the
// array index is the zero-based month number.
static const char stdMonthNames[][4] = {
    "Jan", "Feb", "Mar",
    "Apr", "May", "Jun",
    "Jul", "Aug", "Sep",
    "Oct", "Nov", "Dec",
};
// Number of entries in stdMonthNames.
static const int stdMonthNamesLen =
    sizeof(stdMonthNames) / sizeof(stdMonthNames[0]);
1638
1639 static bool parseMonthName(const char *&scursor, const char *const send,
1640                            int &result)
1641 {
1642     // check bounds:
1643     if (send - scursor < 3) {
1644         return false;
1645     }
1646
1647     for (result = 0 ; result < stdMonthNamesLen ; ++result) {
1648         if (qstrnicmp(scursor, stdMonthNames[result], 3) == 0) {
1649             scursor += 3;
1650             return true;
1651         }
1652     }
1653
1654     // not found:
1655     return false;
1656 }
1657
// Alphabetic time zone names and their offset from GMT in seconds
// (positive means east of GMT). The table is searched front to back, so
// for a duplicated name only the first entry would ever be used.
//
// NOTE(review): the single-letter military zones carry the signs given
// in rfc 822. RFC 2822/5322 consider those definitions unreliable and
// recommend treating such zones as "-0000"; the values are kept as they
// were to not change behavior for existing callers.
static const struct {
    const char tzName[5];
    long int secsEastOfGMT;
} timeZones[] = {
    // rfc 822 timezones:
    { "GMT", 0 },
    { "UT", 0 },
    { "EDT", -4 * 3600 },
    { "EST", -5 * 3600 },
    { "CDT", -5 * 3600 }, // was mislabelled "MST", shadowing the real MST entry below
    { "CST", -6 * 3600 },
    { "MDT", -6 * 3600 },
    { "MST", -7 * 3600 },
    { "PDT", -7 * 3600 },
    { "PST", -8 * 3600 },
    // common, non-rfc-822 zones:
    { "CET", 1 * 3600 },
    { "MET", 1 * 3600 },
    { "UTC", 0 },
    { "CEST", 2 * 3600 },
    { "BST", 1 * 3600 },
    // rfc 822 military timezones:
    { "Z", 0 },
    { "A", -1 * 3600 },
    { "B", -2 * 3600 },
    { "C", -3 * 3600 },
    { "D", -4 * 3600 },
    { "E", -5 * 3600 },
    { "F", -6 * 3600 },
    { "G", -7 * 3600 },
    { "H", -8 * 3600 },
    { "I", -9 * 3600 },
    // J is not used!
    { "K", -10 * 3600 },
    { "L", -11 * 3600 },
    { "M", -12 * 3600 },
    { "N", 1 * 3600 },
    { "O", 2 * 3600 },
    { "P", 3 * 3600 },
    { "Q", 4 * 3600 },
    { "R", 5 * 3600 },
    { "S", 6 * 3600 },
    { "T", 7 * 3600 },
    { "U", 8 * 3600 },
    { "V", 9 * 3600 },
    { "W", 10 * 3600 },
    { "X", 11 * 3600 },
    { "Y", 12 * 3600 },
};
// Number of entries in timeZones.
static const int timeZonesLen = sizeof timeZones / sizeof *timeZones;
1708
1709 static bool parseAlphaNumericTimeZone(const char *&scursor,
1710                                       const char *const send,
1711                                       long int &secsEastOfGMT,
1712                                       bool &timeZoneKnown)
1713 {
1714     // allow the timezone to be wrapped in quotes; bug 260761
1715     if (scursor < send && *scursor == '"') {
1716         scursor++;
1717
1718         if (scursor == send) {
1719             return false;
1720         }
1721     }
1722
1723     QPair<const char *, int> maybeTimeZone(nullptr, 0);
1724     if (!parseToken(scursor, send, maybeTimeZone, ParseTokenNoFlag)) {
1725         return false;
1726     }
1727     for (int i = 0 ; i < timeZonesLen ; ++i) {
1728         if (qstrnicmp(timeZones[i].tzName,
1729                       maybeTimeZone.first, maybeTimeZone.second) == 0) {
1730             scursor += maybeTimeZone.second;
1731             secsEastOfGMT = timeZones[i].secsEastOfGMT;
1732             timeZoneKnown = true;
1733
1734             if (scursor < send && *scursor == '"') {
1735                 scursor++;
1736             }
1737
1738             return true;
1739         }
1740     }
1741
1742     // don't choke just because we don't happen to know the time zone
1743     KMIME_WARN_UNKNOWN(time zone,
1744                        QByteArray(maybeTimeZone.first, maybeTimeZone.second));
1745     secsEastOfGMT = 0;
1746     timeZoneKnown = false;
1747     return true;
1748 }
1749
// parse a number and return the number of digits parsed:
//
// Consumes the run of ASCII digits at scursor (stopping at send) and
// stores its decimal value in result; result is 0 if there is no digit.
// A value too large for an int is clamped to the largest int instead of
// overflowing; the digits are still consumed and counted.
int parseDigits(const char *&scursor, const char *const send, int &result)
{
    // largest int value, spelled out so that no extra header is needed
    constexpr int maxValue = static_cast<int>(~0U >> 1);

    result = 0;
    int digits = 0;
    // isdigit() must not be handed a negative value, which a plain char
    // holding an 8-bit character can be; hence the cast.
    for (; scursor != send && isdigit(static_cast<unsigned char>(*scursor));
         ++scursor, ++digits) {
        const int digit = *scursor - '0';
        if (result > (maxValue - digit) / 10) {
            // appending this digit would overflow: saturate
            result = maxValue;
        } else {
            result = result * 10 + digit;
        }
    }
    return digits;
}
1761
1762 static bool parseTimeOfDay(const char *&scursor, const char *const send,
1763                            int &hour, int &min, int &sec, bool isCRLF = false)
1764 {
1765     // time-of-day := 2DIGIT [CFWS] ":" [CFWS] 2DIGIT [ [CFWS] ":" 2DIGIT ]
1766
1767     //
1768     // 2DIGIT representing "hour":
1769     //
1770     if (!parseDigits(scursor, send, hour)) {
1771         return false;
1772     }
1773
1774     eatCFWS(scursor, send, isCRLF);
1775     if (scursor == send || *scursor != ':') {
1776         return false;
1777     }
1778     scursor++; // eat ':'
1779
1780     eatCFWS(scursor, send, isCRLF);
1781     if (scursor == send) {
1782         return false;
1783     }
1784
1785     //
1786     // 2DIGIT representing "minute":
1787     //
1788     if (!parseDigits(scursor, send, min)) {
1789         return false;
1790     }
1791
1792     eatCFWS(scursor, send, isCRLF);
1793     if (scursor == send) {
1794         return true; // seconds are optional
1795     }
1796
1797     //
1798     // let's see if we have a 2DIGIT representing "second":
1799     //
1800     if (*scursor == ':') {
1801         // yepp, there are seconds:
1802         scursor++; // eat ':'
1803         eatCFWS(scursor, send, isCRLF);
1804         if (scursor == send) {
1805             return false;
1806         }
1807
1808         if (!parseDigits(scursor, send, sec)) {
1809             return false;
1810         }
1811     } else {
1812         sec = 0;
1813     }
1814
1815     return true;
1816 }
1817
1818 bool parseTime(const char *&scursor, const char *send,
1819                int &hour, int &min, int &sec, long int &secsEastOfGMT,
1820                bool &timeZoneKnown, bool isCRLF)
1821 {
1822     // time := time-of-day CFWS ( zone / obs-zone )
1823     //
1824     // obs-zone    := "UT" / "GMT" /
1825     //                "EST" / "EDT" / ; -0500 / -0400
1826     //                "CST" / "CDT" / ; -0600 / -0500
1827     //                "MST" / "MDT" / ; -0700 / -0600
1828     //                "PST" / "PDT" / ; -0800 / -0700
1829     //                "A"-"I" / "a"-"i" /
1830     //                "K"-"Z" / "k"-"z"
1831
1832     eatCFWS(scursor, send, isCRLF);
1833     if (scursor == send) {
1834         return false;
1835     }
1836
1837     if (!parseTimeOfDay(scursor, send, hour, min, sec, isCRLF)) {
1838         return false;
1839     }
1840
1841     eatCFWS(scursor, send, isCRLF);
1842     // there might be no timezone but a year following
1843     if ((scursor == send) || isdigit(*scursor)) {
1844         timeZoneKnown = false;
1845         secsEastOfGMT = 0;
1846         return true; // allow missing timezone
1847     }
1848
1849     timeZoneKnown = true;
1850     if (*scursor == '+' || *scursor == '-') {
1851         // remember and eat '-'/'+':
1852         const char sign = *scursor++;
1853         // numerical timezone:
1854         int maybeTimeZone;
1855         const int tzDigits = parseDigits(scursor, send, maybeTimeZone);
1856         if (tzDigits != 4) {
1857             // Allow timezones in 02:00 format
1858             if (tzDigits == 2 && scursor != send && *scursor == ':') {
1859                 scursor++;
1860                 int maybeTimeZone2;
1861                 if (parseDigits(scursor, send, maybeTimeZone2) != 2) {
1862                     return false;
1863                 }
1864                 maybeTimeZone = maybeTimeZone * 100 + maybeTimeZone2;
1865             } else {
1866                 return false;
1867             }
1868         }
1869         secsEastOfGMT = 60 * (maybeTimeZone / 100 * 60 + maybeTimeZone % 100);
1870         if (sign == '-') {
1871             secsEastOfGMT *= -1;
1872             if (secsEastOfGMT == 0) {
1873                 timeZoneKnown = false; // -0000 means indetermined tz
1874             }
1875         }
1876     } else {
1877         // maybe alphanumeric timezone:
1878         if (!parseAlphaNumericTimeZone(scursor, send, secsEastOfGMT, timeZoneKnown)) {
1879             return false;
1880         }
1881     }
1882     return true;
1883 }
1884
1885 bool parseQDateTime(const char *&scursor, const char *const send,
1886                    QDateTime &result, bool isCRLF)
1887 {
1888     eatCFWS(scursor, send, isCRLF);
1889     if (scursor == send) {
1890         return false;
1891     }
1892     // In qt6 yy == 1900 ! => for sure we use 2000 here.
1893     result = QDateTime::fromString(QString::fromLatin1(scursor, 17), QStringLiteral("dd/MM/yy HH:mm:ss"));
1894     QDate resultDate = result.date();
1895     resultDate.setDate(resultDate.year() + 100, resultDate.month(), resultDate.day());
1896     result.setDate(resultDate);
1897     return result.isValid();
1898 }
1899
1900 bool parseDateTime(const char *&scursor, const char *const send,
1901                    QDateTime &result, bool isCRLF)
1902 {
1903     // Parsing date-time; strict mode:
1904     //
1905     // date-time   := [ [CFWS] day-name [CFWS] "," ]                      ; wday
1906     // (expanded)     [CFWS] 1*2DIGIT CFWS month-name CFWS 2*DIGIT [CFWS] ; date
1907     //                time
1908     //
1909     // day-name    := "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun"
1910     // month-name  := "Jan" / "Feb" / "Mar" / "Apr" / "May" / "Jun" /
1911     //                "Jul" / "Aug" / "Sep" / "Oct" / "Nov" / "Dec"
1912
1913     result = QDateTime();
1914
1915     eatCFWS(scursor, send, isCRLF);
1916     if (scursor == send) {
1917         return false;
1918     }
1919
1920     //
1921     // let's see if there's a day-of-week:
1922     //
1923     if (parseDayName(scursor, send)) {
1924         eatCFWS(scursor, send, isCRLF);
1925         if (scursor == send) {
1926             return false;
1927         }
1928         // day-name should be followed by ',' but we treat it as optional:
1929         if (*scursor == ',') {
1930             scursor++; // eat ','
1931             eatCFWS(scursor, send, isCRLF);
1932         }
1933     }
1934
1935     int maybeMonth = -1;
1936     bool asctimeFormat = false;
1937
1938     // ANSI-C asctime() format is: Wed Jun 30 21:49:08 1993
1939     if (!isdigit(*scursor) && parseMonthName(scursor, send, maybeMonth)) {
1940         asctimeFormat = true;
1941         eatCFWS(scursor, send, isCRLF);
1942     }
1943
1944     //
1945     // 1*2DIGIT representing "day" (of month):
1946     //
1947     int maybeDay;
1948     if (!parseDigits(scursor, send, maybeDay)) {
1949         return false;
1950     }
1951
1952     eatCFWS(scursor, send, isCRLF);
1953     if (scursor == send) {
1954         return false;
1955     }
1956
1957     // ignore ","; bug 54098
1958     if (*scursor == ',') {
1959         scursor++;
1960     }
1961
1962     //
1963     // month-name:
1964     //
1965     if (!asctimeFormat && !parseMonthName(scursor, send, maybeMonth)) {
1966         return false;
1967     }
1968     if (scursor == send) {
1969         return false;
1970     }
1971     assert(maybeMonth >= 0); assert(maybeMonth <= 11);
1972     ++maybeMonth; // 0-11 -> 1-12
1973
1974     eatCFWS(scursor, send, isCRLF);
1975     if (scursor == send) {
1976         return false;
1977     }
1978
1979     // check for "year HH:MM:SS" or only "HH:MM:SS" (or "H:MM:SS")
1980     bool timeAfterYear = true;
1981     if ((send - scursor > 3) && ((scursor[1] == ':') || (scursor[2] == ':'))) {
1982         timeAfterYear = false;  // first read time, then year
1983     }
1984
1985     //
1986     // 2*DIGIT representing "year":
1987     //
1988     int maybeYear = 0;
1989
1990     if (timeAfterYear && !parseDigits(scursor, send, maybeYear)) {
1991         return false;
1992     }
1993
1994     eatCFWS(scursor, send, isCRLF);
1995     int maybeHour;
1996     int maybeMinute;
1997     int maybeSecond;
1998     long int secsEastOfGMT = 0;
1999     QDate maybeDate;
2000     QTime maybeTime;
2001     if (scursor != send) {
2002         //
2003         // time
2004         //
2005         bool timeZoneKnown = true;
2006
2007         if (!parseTime(scursor, send,
2008                        maybeHour, maybeMinute, maybeSecond,
2009                        secsEastOfGMT, timeZoneKnown, isCRLF)) {
2010             return false;
2011         }
2012
2013         // in asctime() the year follows the time
2014         if (!timeAfterYear) {
2015             eatCFWS(scursor, send, isCRLF);
2016             if (scursor == send) {
2017                 return false;
2018             }
2019
2020             if (!parseDigits(scursor, send, maybeYear)) {
2021                 return false;
2022             }
2023         }
2024
2025         // RFC 2822 4.3 processing:
2026         if (maybeYear < 50) {
2027             maybeYear += 2000;
2028         } else if (maybeYear < 1000) {
2029             maybeYear += 1900;
2030         }
2031         // else keep as is
2032         if (maybeYear < 1900) {
2033             return false; // rfc2822, 3.3
2034         }
2035
2036         maybeDate = QDate(maybeYear, maybeMonth, maybeDay);
2037         maybeTime = QTime(maybeHour, maybeMinute, maybeSecond);
2038
2039         if (!maybeDate.isValid() || !maybeTime.isValid()) {
2040             return false;
2041         }
2042     } else {
2043         maybeDate = QDate(maybeYear, maybeMonth, maybeDay);
2044         maybeTime = QTime(0, 0, 0);
2045     }
2046
2047     result = QDateTime(maybeDate, maybeTime, QTimeZone::fromSecondsAheadOfUtc(secsEastOfGMT));
2048     if (!result.isValid()) {
2049         return false;
2050     }
2051     return true;
2052 }
2053
2054 namespace {
2055
2056 Headers::Base *extractHeader(QByteArrayView head, const int headerStart, int &endOfFieldBody)
2057 {
2058     Headers::Base *header = {};
2059
2060     int startOfFieldBody = head.indexOf(':', headerStart);
2061     if (startOfFieldBody < 0) {
2062         return nullptr;
2063     }
2064
2065     const char *rawType = head.constData() + headerStart;
2066     const size_t rawTypeLen = startOfFieldBody - headerStart;
2067
2068     startOfFieldBody++; //skip the ':'
2069     if (startOfFieldBody < head.size() - 1 &&  head[startOfFieldBody] == ' ') { // skip the space after the ':', if there's any
2070         startOfFieldBody++;
2071     }
2072
2073     bool folded = false;
2074     endOfFieldBody = findHeaderLineEnd(head, startOfFieldBody, &folded);
2075
2076     // We might get an invalid mail without a field name, don't crash on that.
2077     if (rawTypeLen > 0) {
2078         header = HeaderFactory::createHeader(rawType, rawTypeLen);
2079     }
2080     if (!header) {
2081         //qCWarning(KMIME_LOG)() << "Returning Generic header of type" << rawType;
2082         header = new Headers::Generic(rawType, rawTypeLen);
2083     }
2084     if (folded) {
2085         const auto unfoldedBody = unfoldHeader(head.constData() + startOfFieldBody, endOfFieldBody - startOfFieldBody);
2086         header->from7BitString(unfoldedBody);
2087     } else {
2088         header->from7BitString(head.constData() + startOfFieldBody, endOfFieldBody - startOfFieldBody);
2089     }
2090
2091     return header;
2092 }
2093
2094 }
2095
2096 std::unique_ptr<KMime::Headers::Base> parseNextHeader(QByteArrayView &head)
2097 {
2098     int endOfFieldBody = 0;
2099     std::unique_ptr<KMime::Headers::Base> header(extractHeader(head, 0, endOfFieldBody));
2100     if (header) {
2101         head = head.mid(endOfFieldBody + 1);
2102     } else {
2103         head = {};
2104     }
2105
2106     return header;
2107 }
2108
2109 void extractHeaderAndBody(const QByteArray &content, QByteArray &header, QByteArray &body)
2110 {
2111     header.clear();
2112     body.clear();
2113
2114     // empty header
2115     if (content.startsWith('\n')) {
2116         body = content.right(content.length() - 1);
2117         return;
2118     }
2119
2120     int pos = content.indexOf("\n\n", 0);
2121     if (pos > -1) {
2122         header = content.left(++pos);    //header *must* end with "\n" !!
2123         body = content.mid(pos + 1);
2124         if (body.startsWith("\n")) {
2125             body = "\n" + body;
2126         }
2127     } else {
2128         header = content;
2129     }
2130 }
2131
2132 QList<Headers::Base *> parseHeaders(const QByteArray &head) {
2133     QList<Headers::Base *> ret;
2134
2135     int cursor = 0;
2136     while (cursor < head.size()) {
2137         const int headerStart = cursor;
2138         int endOfFieldBody;
2139         if (auto header = extractHeader(head, headerStart, endOfFieldBody)) {
2140             ret << header;
2141             cursor = endOfFieldBody + 1;
2142         } else {
2143             break;
2144         }
2145     }
2146
2147     return ret;
2148 }
2149
2150 } // namespace HeaderParsing
2151
2152 } // namespace KMime