File indexing completed on 2024-04-28 09:12:09
0001 /* -*- c++ -*- 0002 kmime_header_parsing.cpp 0003 0004 KMime, the KDE Internet mail/usenet news message library. 0005 SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org> 0006 0007 SPDX-License-Identifier: LGPL-2.0-or-later 0008 */ 0009 0010 #include "kmime_header_parsing.h" 0011 0012 #include "kmime_headerfactory_p.h" 0013 #include "kmime_headers.h" 0014 #include "kmime_util.h" 0015 #include "kmime_util_p.h" 0016 #include "kmime_codecs_p.h" 0017 #include "kmime_dateformatter.h" 0018 #include "kmime_debug.h" 0019 #include "kmime_warning_p.h" 0020 0021 #include <KCodecs> 0022 0023 #include <QMap> 0024 #include <QStringDecoder> 0025 #include <QTimeZone> 0026 0027 #include <cassert> 0028 #include <cctype> // for isdigit 0029 0030 using namespace KMime; 0031 using namespace KMime::Types; 0032 0033 namespace KMime 0034 { 0035 0036 namespace Types 0037 { 0038 // Optimization to avoid allocating QStrings when the value isn't encoded 0039 struct KMIME_EXPORT QStringOrQPair { 0040 QStringOrQPair() : qstring(), qpair(nullptr, 0) {} 0041 QString qstring; 0042 QPair<const char *, int> qpair; 0043 }; 0044 } // namespace Types 0045 0046 namespace HeaderParsing 0047 { 0048 0049 // parse the encoded-word (scursor points to after the initial '=') 0050 bool parseEncodedWord(const char *&scursor, const char *const send, 0051 QString &result, QByteArray &language, 0052 QByteArray &usedCS, const QByteArray &defaultCS) 0053 { 0054 // make sure the caller already did a bit of the work. 0055 assert(*(scursor - 1) == '='); 0056 0057 // 0058 // STEP 1: 0059 // scan for the charset/language portion of the encoded-word 0060 // 0061 0062 char ch = *scursor++; 0063 0064 if (ch != '?') { 0065 // qCDebug(KMIME_LOG) << "first"; 0066 //KMIME_WARN_PREMATURE_END_OF( EncodedWord ); 0067 return false; 0068 } 0069 0070 // remember start of charset (ie. just after the initial "=?") and 0071 // language (just after the first '*') fields: 0072 const char *charsetStart = scursor; 0073 const char *languageStart = nullptr; 0074 0075 // find delimiting '?' (and the '*' separating charset and language 0076 // tags, if any): 0077 for (; scursor != send ; scursor++) { 0078 if (*scursor == '?') { 0079 break; 0080 } else if (*scursor == '*' && languageStart == nullptr) { 0081 languageStart = scursor + 1; 0082 } 0083 } 0084 0085 // not found? can't be an encoded-word! 0086 if (scursor == send || *scursor != '?') { 0087 // qCDebug(KMIME_LOG) << "second"; 0088 KMIME_WARN_PREMATURE_END_OF(EncodedWord); 0089 return false; 0090 } 0091 0092 // extract the language information, if any (if languageStart is 0, 0093 // language will be null, too): 0094 QByteArray maybeLanguage(languageStart, scursor - languageStart); 0095 // extract charset information (keep in mind: the size given to the 0096 // ctor is one off due to the \0 terminator): 0097 QByteArray maybeCharset(charsetStart, 0098 (languageStart ? languageStart - 1 : scursor) - charsetStart); 0099 0100 // 0101 // STEP 2: 0102 // scan for the encoding portion of the encoded-word 0103 // 0104 0105 // remember start of encoding (just _after_ the second '?'): 0106 scursor++; 0107 const char *encodingStart = scursor; 0108 0109 // find next '?' (ending the encoding tag): 0110 for (; scursor != send ; scursor++) { 0111 if (*scursor == '?') { 0112 break; 0113 } 0114 } 0115 0116 // not found? Can't be an encoded-word! 0117 if (scursor == send || *scursor != '?') { 0118 // qCDebug(KMIME_LOG) << "third"; 0119 KMIME_WARN_PREMATURE_END_OF(EncodedWord); 0120 return false; 0121 } 0122 0123 // extract the encoding information: 0124 QByteArray maybeEncoding(encodingStart, scursor - encodingStart); 0125 0126 // qCDebug(KMIME_LOG) << "parseEncodedWord: found charset == \"" << maybeCharset 0127 // << "\"; language == \"" << maybeLanguage 0128 // << "\"; encoding == \"" << maybeEncoding << "\""; 0129 0130 // 0131 // STEP 3: 0132 // scan for encoded-text portion of encoded-word 0133 // 0134 0135 // remember start of encoded-text (just after the third '?'): 0136 scursor++; 0137 const char *encodedTextStart = scursor; 0138 0139 // find the '?=' sequence (ending the encoded-text): 0140 for (; scursor != send ; scursor++) { 0141 if (*scursor == '?') { 0142 if (scursor + 1 != send) { 0143 if (*(scursor + 1) != '=') { // We expect a '=' after the '?', but we got something else; ignore 0144 KMIME_WARN << "Stray '?' in q-encoded word, ignoring this."; 0145 continue; 0146 } else { // yep, found a '?=' sequence 0147 scursor += 2; 0148 break; 0149 } 0150 } else { // The '?' is the last char, but we need a '=' after it! 0151 KMIME_WARN_PREMATURE_END_OF(EncodedWord); 0152 return false; 0153 } 0154 } 0155 } 0156 0157 if (*(scursor - 2) != '?' || *(scursor - 1) != '=' || 0158 scursor < encodedTextStart + 2) { 0159 KMIME_WARN_PREMATURE_END_OF(EncodedWord); 0160 return false; 0161 } 0162 0163 // set end sentinel for encoded-text: 0164 const char *const encodedTextEnd = scursor - 2; 0165 0166 // 0167 // STEP 4: 0168 // setup decoders for the transfer encoding and the charset 0169 // 0170 0171 // try if there's a codec for the encoding found: 0172 KCodecs::Codec *codec = KCodecs::Codec::codecForName(maybeEncoding); 0173 if (!codec) { 0174 KMIME_WARN_UNKNOWN(Encoding, maybeEncoding); 0175 return false; 0176 } 0177 0178 // get an instance of a corresponding decoder: 0179 KCodecs::Decoder *dec = codec->makeDecoder(); 0180 assert(dec); 0181 0182 // try if there's a (text)codec for the charset found: 0183 QStringDecoder textCodec; 0184 if (maybeCharset.isEmpty()) { 0185 textCodec = QStringDecoder(defaultCS.constData()); 0186 if (!textCodec.isValid()) { 0187 textCodec = QStringDecoder(QStringDecoder::Latin1); 0188 } 0189 usedCS = cachedCharset(defaultCS); 0190 } else { 0191 textCodec = QStringDecoder(maybeCharset.constData()); 0192 if (textCodec.isValid()) { //no suitable codec found => use default charset 0193 usedCS = cachedCharset(defaultCS); 0194 } else { 0195 textCodec = QStringDecoder(QStringDecoder::Latin1); 0196 usedCS = cachedCharset(maybeCharset); 0197 } 0198 } 0199 0200 if (!textCodec.isValid()) { 0201 KMIME_WARN_UNKNOWN(Charset, maybeCharset); 0202 delete dec; 0203 return false; 0204 }; 0205 0206 // qCDebug(KMIME_LOG) << "mimeName(): \"" << textCodec->name() << "\""; 0207 0208 // allocate a temporary buffer to store the 8bit text: 0209 int encodedTextLength = encodedTextEnd - encodedTextStart; 0210 QByteArray buffer; 0211 buffer.resize(codec->maxDecodedSizeFor(encodedTextLength)); 0212 char *bbegin = buffer.data(); 0213 char *bend = bbegin + buffer.length(); 0214 0215 // 0216 // STEP 5: 0217 // do the actual decoding 0218 // 0219 0220 if (!dec->decode(encodedTextStart, encodedTextEnd, bbegin, bend)) { 0221 KMIME_WARN << codec->name() << "codec lies about its maxDecodedSizeFor(" 0222 << encodedTextLength << ")\nresult may be truncated"; 0223 } 0224 0225 result = textCodec.decode(QByteArrayView(buffer.data(), bbegin - buffer.data())); 0226 0227 // qCDebug(KMIME_LOG) << "result now: \"" << result << "\""; 0228 // cleanup: 0229 delete dec; 0230 language = maybeLanguage; 0231 0232 return true; 0233 } 0234 0235 static inline void eatWhiteSpace(const char *&scursor, const char *const send) 0236 { 0237 while (scursor != send && 0238 (*scursor == ' ' || *scursor == '\n' || 0239 *scursor == '\t' || *scursor == '\r')) { 0240 scursor++; 0241 } 0242 } 0243 0244 bool parseAtom(const char*&scursor, const char *const send, 0245 QByteArray &result, bool allow8Bit) 0246 { 0247 QPair<const char *, int> maybeResult; 0248 0249 if (parseAtom(scursor, send, maybeResult, allow8Bit)) { 0250 result = QByteArray(maybeResult.first, maybeResult.second); 0251 return true; 0252 } 0253 0254 return false; 0255 } 0256 0257 bool parseAtom(const char*&scursor, const char *const send, 0258 QPair<const char *, int> &result, bool allow8Bit) 0259 { 0260 bool success = false; 0261 const char *start = scursor; 0262 0263 while (scursor != send) { 0264 signed char ch = *scursor++; 0265 if (ch > 0 && isAText(ch)) { 0266 // AText: OK 0267 success = true; 0268 } else if (allow8Bit && ch < 0) { 0269 // 8bit char: not OK, but be tolerant. 0270 KMIME_WARN_8BIT(ch); 0271 success = true; 0272 } else { 0273 // CTL or special - marking the end of the atom: 0274 // re-set sursor to point to the offending 0275 // char and return: 0276 scursor--; 0277 break; 0278 } 0279 } 0280 result.first = start; 0281 result.second = scursor - start; 0282 return success; 0283 } 0284 0285 bool parseToken(const char*&scursor, const char *const send, 0286 QByteArray &result, ParseTokenFlags flags) 0287 { 0288 QPair<const char *, int> maybeResult; 0289 0290 if (parseToken(scursor, send, maybeResult, flags)) { 0291 result = QByteArray(maybeResult.first, maybeResult.second); 0292 return true; 0293 } 0294 0295 return false; 0296 } 0297 0298 bool parseToken(const char*&scursor, const char *const send, 0299 QPair<const char *, int> &result, ParseTokenFlags flags) 0300 { 0301 bool success = false; 0302 const char *start = scursor; 0303 0304 while (scursor != send) { 0305 signed char ch = *scursor++; 0306 if (ch > 0 && isTText(ch)) { 0307 // TText: OK 0308 success = true; 0309 } else if ((flags & ParseTokenAllow8Bit) && ch < 0) { 0310 // 8bit char: not OK, but be tolerant. 0311 KMIME_WARN_8BIT(ch); 0312 success = true; 0313 } else if ((flags & ParseTokenRelaxedTText) && ch == '/') { 0314 success = true; 0315 } else { 0316 // CTL or tspecial - marking the end of the atom: 0317 // re-set sursor to point to the offending 0318 // char and return: 0319 scursor--; 0320 break; 0321 } 0322 } 0323 result.first = start; 0324 result.second = scursor - start; 0325 return success; 0326 } 0327 0328 #define READ_ch_OR_FAIL if ( scursor == send ) { \ 0329 KMIME_WARN_PREMATURE_END_OF( GenericQuotedString ); \ 0330 return false; \ 0331 } else { \ 0332 ch = *scursor++; \ 0333 } 0334 0335 // known issues: 0336 // 0337 // - doesn't handle quoted CRLF 0338 0339 bool parseGenericQuotedString(const char *&scursor, const char *const send, 0340 QString &result, bool isCRLF, 0341 const char openChar, const char closeChar) 0342 { 0343 // We are in a quoted-string or domain-literal or comment and the 0344 // cursor points to the first char after the openChar. 0345 // We will apply unfolding and quoted-pair removal. 0346 // We return when we either encounter the end or unescaped openChar 0347 // or closeChar. 0348 assert(*(scursor - 1) == openChar || *(scursor - 1) == closeChar); 0349 0350 while (scursor != send) { 0351 char ch = *scursor++; 0352 0353 if (ch == closeChar || ch == openChar) { 0354 // end of quoted-string or another opening char: 0355 // let caller decide what to do. 0356 return true; 0357 } 0358 0359 switch (ch) { 0360 case '\\': // quoted-pair 0361 // misses "\" CRLF LWSP-char handling, see rfc822, 3.4.5 0362 READ_ch_OR_FAIL; 0363 KMIME_WARN_IF_8BIT(ch); 0364 result += QLatin1Char(ch); 0365 break; 0366 case '\r': 0367 // ### 0368 // The case of lonely '\r' is easy to solve, as they're 0369 // not part of Unix Line-ending conventions. 0370 // But I see a problem if we are given Unix-native 0371 // line-ending-mails, where we cannot determine anymore 0372 // whether a given '\n' was part of a CRLF or was occurring 0373 // on it's own. 0374 READ_ch_OR_FAIL; 0375 if (ch != '\n') { 0376 // CR on it's own... 0377 KMIME_WARN_LONE(CR); 0378 result += QLatin1Char('\r'); 0379 scursor--; // points to after the '\r' again 0380 } else { 0381 // CRLF encountered. 0382 // lookahead: check for folding 0383 READ_ch_OR_FAIL; 0384 if (ch == ' ' || ch == '\t') { 0385 // correct folding; 0386 // position cursor behind the CRLF WSP (unfolding) 0387 // and add the WSP to the result 0388 result += QLatin1Char(ch); 0389 } else { 0390 // this is the "shouldn't happen"-case. There is a CRLF 0391 // inside a quoted-string without it being part of FWS. 0392 // We take it verbatim. 0393 KMIME_WARN_NON_FOLDING(CRLF); 0394 result += QLatin1String("\r\n"); 0395 // the cursor is decremented again, so's we need not 0396 // duplicate the whole switch here. "ch" could've been 0397 // everything (incl. openChar or closeChar). 0398 scursor--; 0399 } 0400 } 0401 break; 0402 case '\n': 0403 // Note: CRLF has been handled above already! 0404 // ### LF needs special treatment, depending on whether isCRLF 0405 // is true (we can be sure a lonely '\n' was meant this way) or 0406 // false ('\n' alone could have meant LF or CRLF in the original 0407 // message. This parser assumes CRLF iff the LF is followed by 0408 // either WSP (folding) or NULL (premature end of quoted-string; 0409 // Should be fixed, since NULL is allowed as per rfc822). 0410 READ_ch_OR_FAIL; 0411 if (!isCRLF && (ch == ' ' || ch == '\t')) { 0412 // folding 0413 // correct folding 0414 result += QLatin1Char(ch); 0415 } else { 0416 // non-folding 0417 KMIME_WARN_LONE(LF); 0418 result += QLatin1Char('\n'); 0419 // pos is decremented, so's we need not duplicate the whole 0420 // switch here. ch could've been everything (incl. <">, "\"). 0421 scursor--; 0422 } 0423 break; 0424 case '=': { 0425 // ### Work around broken clients that send encoded words in quoted-strings 0426 // For example, older KMail versions. 0427 if (scursor == send) { 0428 break; 0429 } 0430 0431 const char *oldscursor = scursor; 0432 QString tmp; 0433 QByteArray lang; 0434 QByteArray charset; 0435 if (*scursor++ == '?') { 0436 --scursor; 0437 if (parseEncodedWord(scursor, send, tmp, lang, charset)) { 0438 result += tmp; 0439 //qDebug() << " tmp " << tmp; 0440 if (scursor == send) { 0441 break; 0442 } else if (*scursor++ == ' ') { //Workaround Bug 362650 thunderbird add space for each new line 0443 if (scursor == send) { 0444 --scursor; 0445 break; 0446 } else if (*scursor++ == '=') { 0447 if (scursor == send) { 0448 --scursor; 0449 --scursor; 0450 break; 0451 } else if (*scursor++ == '?') { 0452 --scursor; 0453 --scursor; 0454 break; 0455 } 0456 } else { 0457 --scursor; 0458 --scursor; 0459 } 0460 } else { 0461 --scursor; 0462 } 0463 0464 break; 0465 } else { 0466 scursor = oldscursor; 0467 } 0468 } else { 0469 scursor = oldscursor; 0470 } 0471 // fall through 0472 [[fallthrough]]; 0473 } 0474 default: 0475 KMIME_WARN_IF_8BIT(ch); 0476 result += QLatin1Char(ch); 0477 } 0478 } 0479 0480 return false; 0481 } 0482 0483 // known issues: 0484 // 0485 // - doesn't handle encoded-word inside comments. 0486 0487 bool parseComment(const char *&scursor, const char *const send, 0488 QString &result, bool isCRLF, bool reallySave) 0489 { 0490 int commentNestingDepth = 1; 0491 const char *afterLastClosingParenPos = nullptr; 0492 QString maybeCmnt; 0493 const char *oldscursor = scursor; 0494 0495 assert(*(scursor - 1) == '('); 0496 0497 while (commentNestingDepth) { 0498 QString cmntPart; 0499 if (parseGenericQuotedString(scursor, send, cmntPart, isCRLF, '(', ')')) { 0500 assert(*(scursor - 1) == ')' || *(scursor - 1) == '('); 0501 // see the kdoc for above function for the possible conditions 0502 // we have to check: 0503 switch (*(scursor - 1)) { 0504 case ')': 0505 if (reallySave) { 0506 // add the chunk that's now surely inside the comment. 0507 result += maybeCmnt; 0508 result += cmntPart; 0509 if (commentNestingDepth > 1) { 0510 // don't add the outermost ')'... 0511 result += QLatin1Char(')'); 0512 } 0513 maybeCmnt.clear(); 0514 } 0515 afterLastClosingParenPos = scursor; 0516 --commentNestingDepth; 0517 break; 0518 case '(': 0519 if (reallySave) { 0520 // don't add to "result" yet, because we might find that we 0521 // are already outside the (broken) comment... 0522 maybeCmnt += cmntPart; 0523 maybeCmnt += QLatin1Char('('); 0524 } 0525 ++commentNestingDepth; 0526 break; 0527 default: assert(0); 0528 } // switch 0529 } else { 0530 // !parseGenericQuotedString, ie. premature end 0531 if (afterLastClosingParenPos) { 0532 scursor = afterLastClosingParenPos; 0533 } else { 0534 scursor = oldscursor; 0535 } 0536 return false; 0537 } 0538 } // while 0539 0540 return true; 0541 } 0542 0543 // known issues: none. 0544 0545 bool parsePhrase(const char *&scursor, const char *const send, 0546 QString &result, bool isCRLF) 0547 { 0548 enum { 0549 None, Phrase, Atom, EncodedWord, QuotedString 0550 } found = None; 0551 0552 QString tmp; 0553 QByteArray lang; 0554 QByteArray charset; 0555 QPair<const char *, int> tmpAtom; 0556 const char *successfullyParsed = nullptr; 0557 // only used by the encoded-word branch 0558 const char *oldscursor; 0559 // used to suppress whitespace between adjacent encoded-words 0560 // (rfc2047, 6.2): 0561 bool lastWasEncodedWord = false; 0562 0563 while (scursor != send) { 0564 char ch = *scursor++; 0565 switch (ch) { 0566 case '.': // broken, but allow for intorop's sake 0567 if (found == None) { 0568 --scursor; 0569 return false; 0570 } else { 0571 if (scursor != send && (*scursor == ' ' || *scursor == '\t')) { 0572 result += QLatin1String(". "); 0573 } else { 0574 result += QLatin1Char('.'); 0575 } 0576 successfullyParsed = scursor; 0577 } 0578 break; 0579 case '"': // quoted-string 0580 tmp.clear(); 0581 if (parseGenericQuotedString(scursor, send, tmp, isCRLF, '"', '"')) { 0582 successfullyParsed = scursor; 0583 assert(*(scursor - 1) == '"'); 0584 switch (found) { 0585 case None: 0586 found = QuotedString; 0587 break; 0588 case Phrase: 0589 case Atom: 0590 case EncodedWord: 0591 case QuotedString: 0592 found = Phrase; 0593 result += QLatin1Char(' '); // rfc822, 3.4.4 0594 break; 0595 default: 0596 assert(0); 0597 } 0598 lastWasEncodedWord = false; 0599 result += tmp; 0600 } else { 0601 // premature end of quoted string. 0602 // What to do? Return leading '"' as special? Return as quoted-string? 0603 // We do the latter if we already found something, else signal failure. 0604 if (found == None) { 0605 return false; 0606 } else { 0607 result += QLatin1Char(' '); // rfc822, 3.4.4 0608 result += tmp; 0609 return true; 0610 } 0611 } 0612 break; 0613 case '(': // comment 0614 // parse it, but ignore content: 0615 tmp.clear(); 0616 if (parseComment(scursor, send, tmp, isCRLF, 0617 false /*don't bother with the content*/)) { 0618 successfullyParsed = scursor; 0619 lastWasEncodedWord = false; // strictly interpreting rfc2047, 6.2 0620 } else { 0621 if (found == None) { 0622 return false; 0623 } else { 0624 scursor = successfullyParsed; 0625 return true; 0626 } 0627 } 0628 break; 0629 case '=': // encoded-word 0630 tmp.clear(); 0631 oldscursor = scursor; 0632 lang.clear(); 0633 charset.clear(); 0634 if (parseEncodedWord(scursor, send, tmp, lang, charset)) { 0635 successfullyParsed = scursor; 0636 switch (found) { 0637 case None: 0638 found = EncodedWord; 0639 break; 0640 case Phrase: 0641 case EncodedWord: 0642 case Atom: 0643 case QuotedString: 0644 if (!lastWasEncodedWord) { 0645 result += QLatin1Char(' '); // rfc822, 3.4.4 0646 } 0647 found = Phrase; 0648 break; 0649 default: assert(0); 0650 } 0651 lastWasEncodedWord = true; 0652 result += tmp; 0653 break; 0654 } else { 0655 // parse as atom: 0656 scursor = oldscursor; 0657 } 0658 [[fallthrough]]; 0659 // fall though... 0660 0661 default: //atom 0662 scursor--; 0663 if (parseAtom(scursor, send, tmpAtom, true /* allow 8bit */)) { 0664 successfullyParsed = scursor; 0665 switch (found) { 0666 case None: 0667 found = Atom; 0668 break; 0669 case Phrase: 0670 case Atom: 0671 case EncodedWord: 0672 case QuotedString: 0673 found = Phrase; 0674 result += QLatin1Char(' '); // rfc822, 3.4.4 0675 break; 0676 default: 0677 assert(0); 0678 } 0679 lastWasEncodedWord = false; 0680 result += QLatin1String(tmpAtom.first, tmpAtom.second); 0681 } else { 0682 if (found == None) { 0683 return false; 0684 } else { 0685 scursor = successfullyParsed; 0686 return true; 0687 } 0688 } 0689 } 0690 eatWhiteSpace(scursor, send); 0691 } 0692 0693 return found != None; 0694 } 0695 0696 bool parseDotAtom(const char *&scursor, const char *const send, 0697 QByteArray &result, bool isCRLF) 0698 { 0699 eatCFWS(scursor, send, isCRLF); 0700 0701 // always points to just after the last atom parsed: 0702 const char *successfullyParsed; 0703 0704 QByteArray tmp; 0705 if (!parseAtom(scursor, send, tmp, false /* no 8bit */)) { 0706 return false; 0707 } 0708 result += tmp; 0709 successfullyParsed = scursor; 0710 0711 while (scursor != send) { 0712 0713 // end of header or no '.' -> return 0714 if (scursor == send || *scursor != '.') { 0715 return true; 0716 } 0717 scursor++; // eat '.' 0718 0719 if (scursor == send || !isAText(*scursor)) { 0720 // end of header or no AText, but this time following a '.'!: 0721 // reset cursor to just after last successfully parsed char and 0722 // return: 0723 scursor = successfullyParsed; 0724 return true; 0725 } 0726 0727 // try to parse the next atom: 0728 QByteArray maybeAtom; 0729 if (!parseAtom(scursor, send, maybeAtom, false /*no 8bit*/)) { 0730 scursor = successfullyParsed; 0731 return true; 0732 } 0733 0734 result += '.'; 0735 result += maybeAtom; 0736 successfullyParsed = scursor; 0737 } 0738 0739 scursor = successfullyParsed; 0740 return true; 0741 } 0742 0743 void eatCFWS(const char *&scursor, const char *const send, bool isCRLF) 0744 { 0745 QString dummy; 0746 0747 while (scursor != send) { 0748 const char *oldscursor = scursor; 0749 0750 char ch = *scursor++; 0751 0752 switch (ch) { 0753 case ' ': 0754 case '\t': // whitespace 0755 case '\r': 0756 case '\n': // folding 0757 continue; 0758 0759 case '(': // comment 0760 if (parseComment(scursor, send, dummy, isCRLF, false /*don't save*/)) { 0761 continue; 0762 } 0763 scursor = oldscursor; 0764 return; 0765 0766 default: 0767 scursor = oldscursor; 0768 return; 0769 } 0770 } 0771 } 0772 0773 bool parseDomain(const char *&scursor, const char *const send, 0774 QString &result, bool isCRLF) 0775 { 0776 eatCFWS(scursor, send, isCRLF); 0777 if (scursor == send) { 0778 return false; 0779 } 0780 0781 // domain := dot-atom / domain-literal / atom *("." atom) 0782 // 0783 // equivalent to: 0784 // domain = dot-atom / domain-literal, 0785 // since parseDotAtom does allow CFWS between atoms and dots 0786 0787 if (*scursor == '[') { 0788 // domain-literal: 0789 QString maybeDomainLiteral; 0790 // eat '[': 0791 scursor++; 0792 while (parseGenericQuotedString(scursor, send, maybeDomainLiteral, 0793 isCRLF, '[', ']')) { 0794 if (scursor == send) { 0795 // end of header: check for closing ']': 0796 if (*(scursor - 1) == ']') { 0797 // OK, last char was ']': 0798 result = maybeDomainLiteral; 0799 return true; 0800 } else { 0801 // not OK, domain-literal wasn't closed: 0802 return false; 0803 } 0804 } 0805 // we hit openChar in parseGenericQuotedString. 0806 // include it in maybeDomainLiteral and keep on parsing: 0807 if (*(scursor - 1) == '[') { 0808 maybeDomainLiteral += QLatin1Char('['); 0809 continue; 0810 } 0811 // OK, real end of domain-literal: 0812 result = maybeDomainLiteral; 0813 return true; 0814 } 0815 } else { 0816 // dot-atom: 0817 QByteArray maybeDotAtom; 0818 if (parseDotAtom(scursor, send, maybeDotAtom, isCRLF)) { 0819 // Domain may end with '.', if so preserve it' 0820 if (scursor != send && *scursor == '.') { 0821 maybeDotAtom += '.'; 0822 scursor++; 0823 } 0824 result = QString::fromLatin1(maybeDotAtom); 0825 return true; 0826 } 0827 } 0828 return false; 0829 } 0830 0831 bool parseObsRoute(const char *&scursor, const char *const send, 0832 QStringList &result, bool isCRLF, bool save) 0833 { 0834 while (scursor != send) { 0835 eatCFWS(scursor, send, isCRLF); 0836 if (scursor == send) { 0837 return false; 0838 } 0839 0840 // empty entry: 0841 if (*scursor == ',') { 0842 scursor++; 0843 if (save) { 0844 result.append(QString()); 0845 } 0846 continue; 0847 } 0848 0849 // empty entry ending the list: 0850 if (*scursor == ':') { 0851 scursor++; 0852 if (save) { 0853 result.append(QString()); 0854 } 0855 return true; 0856 } 0857 0858 // each non-empty entry must begin with '@': 0859 if (*scursor != '@') { 0860 return false; 0861 } else { 0862 scursor++; 0863 } 0864 0865 QString maybeDomain; 0866 if (!parseDomain(scursor, send, maybeDomain, isCRLF)) { 0867 return false; 0868 } 0869 if (save) { 0870 result.append(maybeDomain); 0871 } 0872 0873 // eat the following (optional) comma: 0874 eatCFWS(scursor, send, isCRLF); 0875 if (scursor == send) { 0876 return false; 0877 } 0878 if (*scursor == ':') { 0879 scursor++; 0880 return true; 0881 } 0882 if (*scursor == ',') { 0883 scursor++; 0884 } 0885 } 0886 0887 return false; 0888 } 0889 0890 bool parseAddrSpec(const char *&scursor, const char *const send, 0891 AddrSpec &result, bool isCRLF) 0892 { 0893 // 0894 // STEP 1: 0895 // local-part := dot-atom / quoted-string / word *("." word) 0896 // 0897 // this is equivalent to: 0898 // local-part := word *("." word) 0899 0900 QString maybeLocalPart; 0901 QString tmp; 0902 QPair<const char *, int> tmpAtom; 0903 0904 while (scursor != send) { 0905 // first, eat any whitespace 0906 eatCFWS(scursor, send, isCRLF); 0907 0908 char ch = *scursor++; 0909 switch (ch) { 0910 case '.': // dot 0911 maybeLocalPart += QLatin1Char('.'); 0912 break; 0913 0914 case '@': 0915 goto SAW_AT_SIGN; 0916 break; 0917 0918 case '"': // quoted-string 0919 tmp.clear(); 0920 if (parseGenericQuotedString(scursor, send, tmp, isCRLF, '"', '"')) { 0921 maybeLocalPart += tmp; 0922 } else { 0923 return false; 0924 } 0925 break; 0926 0927 default: // atom 0928 scursor--; // re-set scursor to point to ch again 0929 if (parseAtom(scursor, send, tmpAtom, false /* no 8bit */)) { 0930 maybeLocalPart += QLatin1String(tmpAtom.first, tmpAtom.second); 0931 } else { 0932 return false; // parseAtom can only fail if the first char is non-atext. 0933 } 0934 break; 0935 } 0936 } 0937 0938 return false; 0939 0940 // 0941 // STEP 2: 0942 // domain 0943 // 0944 0945 SAW_AT_SIGN: 0946 0947 assert(*(scursor - 1) == '@'); 0948 0949 QString maybeDomain; 0950 if (!parseDomain(scursor, send, maybeDomain, isCRLF)) { 0951 return false; 0952 } 0953 0954 result.localPart = maybeLocalPart; 0955 result.domain = maybeDomain; 0956 0957 return true; 0958 } 0959 0960 bool parseAngleAddr(const char *&scursor, const char *const send, 0961 AddrSpec &result, bool isCRLF) 0962 { 0963 // first, we need an opening angle bracket: 0964 eatCFWS(scursor, send, isCRLF); 0965 if (scursor == send || *scursor != '<') { 0966 return false; 0967 } 0968 scursor++; // eat '<' 0969 0970 eatCFWS(scursor, send, isCRLF); 0971 if (scursor == send) { 0972 return false; 0973 } 0974 0975 if (*scursor == '@' || *scursor == ',') { 0976 // obs-route: parse, but ignore: 0977 KMIME_WARN << "obsolete source route found! ignoring."; 0978 QStringList dummy; 0979 if (!parseObsRoute(scursor, send, dummy, 0980 isCRLF, false /* don't save */)) { 0981 return false; 0982 } 0983 // angle-addr isn't complete until after the '>': 0984 if (scursor == send) { 0985 return false; 0986 } 0987 } 0988 0989 // parse addr-spec: 0990 AddrSpec maybeAddrSpec; 0991 if (!parseAddrSpec(scursor, send, maybeAddrSpec, isCRLF)) { 0992 return false; 0993 } 0994 0995 eatCFWS(scursor, send, isCRLF); 0996 if (scursor == send || *scursor != '>') { 0997 return false; 0998 } 0999 scursor++; 1000 1001 result = maybeAddrSpec; 1002 return true; 1003 1004 } 1005 1006 static QString stripQuotes(const QString &input) 1007 { 1008 const QLatin1Char quotes('"'); 1009 if (input.startsWith(quotes) && input.endsWith(quotes)) { 1010 QString stripped(input.mid(1, input.size() - 2)); 1011 return stripped; 1012 } else { 1013 return input; 1014 } 1015 } 1016 1017 bool parseMailbox(const char *&scursor, const char *const send, 1018 Mailbox &result, bool isCRLF) 1019 { 1020 eatCFWS(scursor, send, isCRLF); 1021 if (scursor == send) { 1022 return false; 1023 } 1024 1025 AddrSpec maybeAddrSpec; 1026 QString maybeDisplayName; 1027 1028 // first, try if it's a vanilla addr-spec: 1029 const char *oldscursor = scursor; 1030 if (parseAddrSpec(scursor, send, maybeAddrSpec, isCRLF)) { 1031 result.setAddress(maybeAddrSpec); 1032 // check for the obsolete form of display-name (as comment): 1033 eatWhiteSpace(scursor, send); 1034 if (scursor != send && *scursor == '(') { 1035 scursor++; 1036 if (!parseComment(scursor, send, maybeDisplayName, isCRLF, true /*keep*/)) { 1037 return false; 1038 } 1039 } 1040 result.setName(stripQuotes(maybeDisplayName)); 1041 return true; 1042 } 1043 scursor = oldscursor; 1044 1045 // second, see if there's a display-name: 1046 if (!parsePhrase(scursor, send, maybeDisplayName, isCRLF)) { 1047 // failed: reset cursor, note absent display-name 1048 maybeDisplayName.clear(); 1049 scursor = oldscursor; 1050 } else { 1051 // succeeded: eat CFWS 1052 eatCFWS(scursor, send, isCRLF); 1053 if (scursor == send) { 1054 return false; 1055 } 1056 } 1057 1058 // third, parse the angle-addr: 1059 if (!parseAngleAddr(scursor, send, maybeAddrSpec, isCRLF)) { 1060 return false; 1061 } 1062 1063 if (maybeDisplayName.isNull()) { 1064 // check for the obsolete form of display-name (as comment): 1065 eatWhiteSpace(scursor, send); 1066 if (scursor != send && *scursor == '(') { 1067 scursor++; 1068 if (!parseComment(scursor, send, maybeDisplayName, isCRLF, true /*keep*/)) { 1069 return false; 1070 } 1071 } 1072 } 1073 1074 result.setName(stripQuotes(maybeDisplayName)); 1075 result.setAddress(maybeAddrSpec); 1076 return true; 1077 } 1078 1079 bool parseGroup(const char *&scursor, const char *const send, 1080 Address &result, bool isCRLF) 1081 { 1082 // group := display-name ":" [ mailbox-list / CFWS ] ";" [CFWS] 1083 // 1084 // equivalent to: 1085 // group := display-name ":" [ obs-mbox-list ] ";" 1086 1087 eatCFWS(scursor, send, isCRLF); 1088 if (scursor == send) { 1089 return false; 1090 } 1091 1092 // get display-name: 1093 QString maybeDisplayName; 1094 if (!parsePhrase(scursor, send, maybeDisplayName, isCRLF)) { 1095 return false; 1096 } 1097 1098 // get ":": 1099 eatCFWS(scursor, send, isCRLF); 1100 if (scursor == send || *scursor != ':') { 1101 return false; 1102 } 1103 1104 // KDE5 TODO: Don't expose displayName as public, but rather add setter for it that 1105 // automatically calls removeBidiControlChars 1106 result.displayName = removeBidiControlChars(maybeDisplayName); 1107 1108 // get obs-mbox-list (may contain empty entries): 1109 scursor++; 1110 while (scursor != send) { 1111 eatCFWS(scursor, send, isCRLF); 1112 if (scursor == send) { 1113 return false; 1114 } 1115 1116 // empty entry: 1117 if (*scursor == ',') { 1118 scursor++; 1119 continue; 1120 } 1121 1122 // empty entry ending the list: 1123 if (*scursor == ';') { 1124 scursor++; 1125 return true; 1126 } 1127 1128 Mailbox maybeMailbox; 1129 if (!parseMailbox(scursor, send, maybeMailbox, isCRLF)) { 1130 return false; 1131 } 1132 result.mailboxList.append(maybeMailbox); 1133 1134 eatCFWS(scursor, send, isCRLF); 1135 // premature end: 1136 if (scursor == send) { 1137 return false; 1138 } 1139 // regular end of the list: 1140 if (*scursor == ';') { 1141 scursor++; 1142 return true; 1143 } 1144 // eat regular list entry separator: 1145 if (*scursor == ',') { 1146 scursor++; 1147 } 1148 } 1149 return false; 1150 } 1151 1152 bool parseAddress(const char *&scursor, const char *const send, 1153 Address &result, bool isCRLF) 1154 { 1155 // address := mailbox / group 1156 1157 eatCFWS(scursor, send, isCRLF); 1158 if (scursor == send) { 1159 return false; 1160 } 1161 1162 // first try if it's a single mailbox: 1163 Mailbox maybeMailbox; 1164 const char *oldscursor = scursor; 1165 if (parseMailbox(scursor, send, maybeMailbox, isCRLF)) { 1166 // yes, it is: 1167 result.displayName.clear(); 1168 result.mailboxList.append(maybeMailbox); 1169 return true; 1170 } 1171 scursor = oldscursor; 1172 1173 Address maybeAddress; 1174 1175 // no, it's not a single mailbox. Try if it's a group: 1176 if (!parseGroup(scursor, send, maybeAddress, isCRLF)) { 1177 return false; 1178 } 1179 1180 result = maybeAddress; 1181 return true; 1182 } 1183 1184 bool parseAddressList(const char *&scursor, const char *const send, 1185 AddressList &result, bool isCRLF) 1186 { 1187 while (scursor != send) { 1188 eatCFWS(scursor, send, isCRLF); 1189 // end of header: this is OK. 1190 if (scursor == send) { 1191 return true; 1192 } 1193 // empty entry: ignore: 1194 if (*scursor == ',') { 1195 scursor++; 1196 continue; 1197 } 1198 // broken clients might use ';' as list delimiter, accept that as well 1199 if (*scursor == ';') { 1200 scursor++; 1201 continue; 1202 } 1203 1204 // parse one entry 1205 Address maybeAddress; 1206 if (!parseAddress(scursor, send, maybeAddress, isCRLF)) { 1207 return false; 1208 } 1209 result.append(maybeAddress); 1210 1211 eatCFWS(scursor, send, isCRLF); 1212 // end of header: this is OK. 1213 if (scursor == send) { 1214 return true; 1215 } 1216 // comma separating entries: eat it. 1217 if (*scursor == ',') { 1218 scursor++; 1219 } 1220 } 1221 return true; 1222 } 1223 1224 static bool parseParameter(const char *&scursor, const char *const send, 1225 QPair<QString, QStringOrQPair> &result, bool isCRLF) 1226 { 1227 // parameter = regular-parameter / extended-parameter 1228 // regular-parameter = regular-parameter-name "=" value 1229 // extended-parameter = 1230 // value = token / quoted-string 1231 // 1232 // note that rfc2231 handling is out of the scope of this function. 1233 // Therefore we return the attribute as QByteArray and the value as 1234 // (start,length) tuple if we see that the value is encoded 1235 // (trailing asterisk), for parseParameterList to decode... 1236 1237 eatCFWS(scursor, send, isCRLF); 1238 if (scursor == send) { 1239 return false; 1240 } 1241 1242 // 1243 // parse the parameter name: 1244 // 1245 QByteArray tmpAttr; 1246 if (!parseToken(scursor, send, tmpAttr, ParseTokenNoFlag)) { 1247 return false; 1248 } 1249 // FIXME: we could use QMap<QByteArray, ...> in the API for parameters 1250 QString maybeAttribute = QString::fromLatin1(tmpAttr); 1251 1252 eatCFWS(scursor, send, isCRLF); 1253 // premature end: not OK (haven't seen '=' yet). 1254 if (scursor == send || *scursor != '=') { 1255 return false; 1256 } 1257 scursor++; // eat '=' 1258 1259 eatCFWS(scursor, send, isCRLF); 1260 if (scursor == send) { 1261 // don't choke on attribute=, meaning the value was omitted: 1262 if (maybeAttribute.endsWith(QLatin1Char('*'))) { 1263 KMIME_WARN << "attribute ends with \"*\", but value is empty!" 1264 "Chopping away \"*\"."; 1265 maybeAttribute.chop(1); 1266 } 1267 result = qMakePair(maybeAttribute.toLower(), QStringOrQPair()); 1268 return true; 1269 } 1270 1271 const char *oldscursor = scursor; 1272 1273 // 1274 // parse the parameter value: 1275 // 1276 QStringOrQPair maybeValue; 1277 if (*scursor == '"') { 1278 // value is a quoted-string: 1279 scursor++; 1280 if (maybeAttribute.endsWith(QLatin1Char('*'))) { 1281 // attributes ending with "*" designate extended-parameters, 1282 // which cannot have quoted-strings as values. So we remove the 1283 // trailing "*" to not confuse upper layers. 1284 KMIME_WARN << "attribute ends with \"*\", but value is a quoted-string!" 1285 "Chopping away \"*\"."; 1286 maybeAttribute.chop(1); 1287 } 1288 1289 if (!parseGenericQuotedString(scursor, send, maybeValue.qstring, isCRLF)) { 1290 scursor = oldscursor; 1291 result = qMakePair(maybeAttribute.toLower(), QStringOrQPair()); 1292 return false; // this case needs further processing by upper layers!! 1293 } 1294 } else { 1295 // value is a token: 1296 if (!parseToken(scursor, send, maybeValue.qpair, ParseTokenRelaxedTText)) { 1297 scursor = oldscursor; 1298 result = qMakePair(maybeAttribute.toLower(), QStringOrQPair()); 1299 return false; // this case needs further processing by upper layers!! 1300 } 1301 } 1302 1303 result = qMakePair(maybeAttribute.toLower(), maybeValue); 1304 return true; 1305 } 1306 1307 static bool parseRawParameterList(const char *&scursor, const char *const send, 1308 QMap<QString, QStringOrQPair> &result, 1309 bool isCRLF) 1310 { 1311 // we use parseParameter() consecutively to obtain a map of raw 1312 // attributes to raw values. "Raw" here means that we don't do 1313 // rfc2231 decoding and concatenation. This is left to 1314 // parseParameterList(), which will call this function. 1315 // 1316 // The main reason for making this chunk of code a separate 1317 // (private) method is that we can deal with broken parameters 1318 // _here_ and leave the rfc2231 handling solely to 1319 // parseParameterList(), which will still be enough work. 1320 while (scursor != send) { 1321 eatCFWS(scursor, send, isCRLF); 1322 // empty entry ending the list: OK. 1323 if (scursor == send) { 1324 return true; 1325 } 1326 // empty list entry: ignore. 1327 if (*scursor == ';') { 1328 scursor++; 1329 continue; 1330 } 1331 QPair<QString, QStringOrQPair> maybeParameter; 1332 if (!parseParameter(scursor, send, maybeParameter, isCRLF)) { 1333 // we need to do a bit of work if the attribute is not 1334 // NULL. These are the cases marked with "needs further 1335 // processing" in parseParameter(). Specifically, parsing of the 1336 // token or the quoted-string, which should represent the value, 1337 // failed. We take the easy way out and simply search for the 1338 // next ';' to start parsing again. (Another option would be to 1339 // take the text between '=' and ';' as value) 1340 if (maybeParameter.first.isNull()) { 1341 return false; 1342 } 1343 while (scursor != send) { 1344 if (*scursor++ == ';') { 1345 goto IS_SEMICOLON; 1346 } 1347 } 1348 // scursor == send case: end of list. 1349 return true; 1350 IS_SEMICOLON: 1351 // *scursor == ';' case: parse next entry. 1352 continue; 1353 } 1354 // successful parsing brings us here: 1355 result.insert(maybeParameter.first, maybeParameter.second); 1356 1357 eatCFWS(scursor, send, isCRLF); 1358 // end of header: ends list. 1359 if (scursor == send) { 1360 return true; 1361 } 1362 // regular separator: eat it. 1363 if (*scursor == ';') { 1364 scursor++; 1365 } 1366 } 1367 return true; 1368 } 1369 1370 static void decodeRFC2231Value(KCodecs::Codec *&rfc2231Codec, 1371 QStringDecoder &textcodec, 1372 bool isContinuation, QString &value, 1373 QPair<const char *, int> &source, QByteArray &charset) 1374 { 1375 // 1376 // parse the raw value into (charset,language,text): 1377 // 1378 1379 const char *decBegin = source.first; 1380 const char *decCursor = decBegin; 1381 const char *decEnd = decCursor + source.second; 1382 1383 if (!isContinuation) { 1384 // find the first single quote 1385 while (decCursor != decEnd) { 1386 if (*decCursor == '\'') { 1387 break; 1388 } else { 1389 decCursor++; 1390 } 1391 } 1392 1393 if (decCursor == decEnd) { 1394 // there wasn't a single single quote at all! 1395 // take the whole value to be in latin-1: 1396 KMIME_WARN << "No charset in extended-initial-value." 1397 "Assuming \"iso-8859-1\"."; 1398 value += QString::fromLatin1(decBegin, source.second); 1399 return; 1400 } 1401 1402 charset = QByteArray(decBegin, decCursor - decBegin); 1403 1404 const char *oldDecCursor = ++decCursor; 1405 // find the second single quote (we ignore the language tag): 1406 while (decCursor != decEnd) { 1407 if (*decCursor == '\'') { 1408 break; 1409 } else { 1410 decCursor++; 1411 } 1412 } 1413 if (decCursor == decEnd) { 1414 KMIME_WARN << "No language in extended-initial-value." 1415 "Trying to recover."; 1416 decCursor = oldDecCursor; 1417 } else { 1418 decCursor++; 1419 } 1420 1421 // decCursor now points to the start of the 1422 // "extended-other-values": 1423 1424 // 1425 // get the decoders: 1426 // 1427 1428 textcodec = QStringDecoder(charset.constData()); 1429 if (!textcodec.isValid()) { 1430 KMIME_WARN_UNKNOWN(Charset, charset); 1431 } 1432 } 1433 1434 if (!rfc2231Codec) { 1435 rfc2231Codec = KCodecs::Codec::codecForName("x-kmime-rfc2231"); 1436 assert(rfc2231Codec); 1437 } 1438 1439 if (!textcodec.isValid()) { 1440 value += QString::fromLatin1(decCursor, decEnd - decCursor); 1441 return; 1442 } 1443 1444 KCodecs::Decoder *dec = rfc2231Codec->makeDecoder(); 1445 assert(dec); 1446 1447 // 1448 // do the decoding: 1449 // 1450 1451 QByteArray buffer; 1452 buffer.resize(rfc2231Codec->maxDecodedSizeFor(decEnd - decCursor)); 1453 QByteArray::Iterator bit = buffer.begin(); 1454 QByteArray::ConstIterator bend = buffer.end(); 1455 1456 if (!dec->decode(decCursor, decEnd, bit, bend)) { 1457 KMIME_WARN << rfc2231Codec->name() 1458 << "codec lies about its maxDecodedSizeFor()" 1459 << Qt::endl 1460 << "result may be truncated"; 1461 } 1462 1463 value += textcodec.decode(QByteArrayView(buffer.begin(), bit - buffer.begin())); 1464 1465 // qCDebug(KMIME_LOG) << "value now: \"" << value << "\""; 1466 // cleanup: 1467 delete dec; 1468 } 1469 1470 // known issues: 1471 // - permutes rfc2231 continuations when the total number of parts 1472 // exceeds 10 (other-sections then becomes *xy, ie. two digits) 1473 1474 bool parseParameterListWithCharset(const char *&scursor, 1475 const char *const send, 1476 QMap<QString, QString> &result, 1477 QByteArray &charset, bool isCRLF) 1478 { 1479 // parse the list into raw attribute-value pairs: 1480 QMap<QString, QStringOrQPair> rawParameterList; 1481 if (!parseRawParameterList(scursor, send, rawParameterList, isCRLF)) { 1482 return false; 1483 } 1484 1485 if (rawParameterList.isEmpty()) { 1486 return true; 1487 } 1488 1489 // decode rfc 2231 continuations and alternate charset encoding: 1490 1491 // NOTE: this code assumes that what QMapIterator delivers is sorted 1492 // by the key! 1493 1494 KCodecs::Codec *rfc2231Codec = nullptr; 1495 QStringDecoder textcodec; 1496 QString attribute; 1497 QString value; 1498 enum Mode { 1499 NoMode = 0x0, Continued = 0x1, Encoded = 0x2 1500 }; 1501 1502 enum EncodingMode { 1503 NoEncoding, 1504 RFC2047, 1505 RFC2231 1506 }; 1507 1508 QMap<QString, QStringOrQPair>::Iterator it; 1509 QMap<QString, QStringOrQPair>::Iterator end = rawParameterList.end(); 1510 1511 for (it = rawParameterList.begin() ; it != end ; ++it) { 1512 if (attribute.isNull() || !it.key().startsWith(attribute)) { 1513 // 1514 // new attribute: 1515 // 1516 1517 // store the last attribute/value pair in the result map now: 1518 if (!attribute.isNull()) { 1519 result.insert(attribute, value); 1520 } 1521 // and extract the information from the new raw attribute: 1522 value.clear(); 1523 attribute = it.key(); 1524 int mode = NoMode; 1525 EncodingMode encodingMode = NoEncoding; 1526 1527 // is the value rfc2331-encoded? 1528 if (attribute.endsWith(QLatin1Char('*'))) { 1529 attribute.chop(1); 1530 mode |= Encoded; 1531 encodingMode = RFC2231; 1532 } 1533 // is the value rfc2047-encoded? 1534 if (!(*it).qstring.isNull() && (*it).qstring.contains(QLatin1String("=?"))) { 1535 mode |= Encoded; 1536 encodingMode = RFC2047; 1537 } 1538 // is the value continued? 1539 if (attribute.endsWith(QLatin1String("*0"))) { 1540 attribute.chop(2); 1541 mode |= Continued; 1542 } 1543 // 1544 // decode if necessary: 1545 // 1546 if (mode & Encoded) { 1547 if (encodingMode == RFC2231) { 1548 decodeRFC2231Value(rfc2231Codec, textcodec, 1549 false, /* isn't continuation */ 1550 value, (*it).qpair, charset); 1551 } else if (encodingMode == RFC2047) { 1552 value += KCodecs::decodeRFC2047String((*it).qstring.toLatin1(), &charset); 1553 } 1554 } else { 1555 // not encoded. 1556 if ((*it).qpair.first) { 1557 value += QString::fromLatin1((*it).qpair.first, (*it).qpair.second); 1558 } else { 1559 value += (*it).qstring; 1560 } 1561 } 1562 1563 // 1564 // shortcut-processing when the value isn't encoded: 1565 // 1566 1567 if (!(mode & Continued)) { 1568 // save result already: 1569 result.insert(attribute, value); 1570 // force begin of a new attribute: 1571 attribute.clear(); 1572 } 1573 } else { // it.key().startsWith( attribute ) 1574 // 1575 // continuation 1576 // 1577 1578 // ignore the section and trust QMap to have sorted the keys: 1579 if (it.key().endsWith(QLatin1Char('*'))) { 1580 // encoded 1581 decodeRFC2231Value(rfc2231Codec, textcodec, 1582 true, /* is continuation */ 1583 value, (*it).qpair, charset); 1584 } else { 1585 // not encoded 1586 if ((*it).qpair.first) { 1587 value += QString::fromLatin1((*it).qpair.first, (*it).qpair.second); 1588 } else { 1589 value += (*it).qstring; 1590 } 1591 } 1592 } 1593 } 1594 // write last attr/value pair: 1595 if (!attribute.isNull()) { 1596 result.insert(attribute, value); 1597 } 1598 1599 return true; 1600 } 1601 1602 bool parseParameterList(const char *&scursor, const char *const send, 1603 QMap<QString, QString> &result, bool isCRLF) 1604 { 1605 QByteArray charset; 1606 return parseParameterListWithCharset(scursor, send, result, charset, isCRLF); 1607 } 1608 1609 static const char stdDayNames[][4] = { 1610 "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" 1611 }; 1612 static const int stdDayNamesLen = sizeof stdDayNames / sizeof *stdDayNames; 1613 1614 static bool parseDayName(const char *&scursor, const char *const send) 1615 { 1616 // check bounds: 1617 if (send - scursor < 3) { 1618 return false; 1619 } 1620 1621 for (int i = 0 ; i < stdDayNamesLen ; ++i) { 1622 if (qstrnicmp(scursor, stdDayNames[i], 3) == 0) { 1623 scursor += 3; 1624 // qCDebug(KMIME_LOG) << "found" << stdDayNames[i]; 1625 return true; 1626 } 1627 } 1628 1629 return false; 1630 } 1631 1632 static const char stdMonthNames[][4] = { 1633 "Jan", "Feb", "Mar", "Apr", "May", "Jun", 1634 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" 1635 }; 1636 static const int stdMonthNamesLen = 1637 sizeof stdMonthNames / sizeof *stdMonthNames; 1638 1639 static bool parseMonthName(const char *&scursor, const char *const send, 1640 int &result) 1641 { 1642 // check bounds: 1643 if (send - scursor < 3) { 1644 return false; 1645 } 1646 1647 for (result = 0 ; result < stdMonthNamesLen ; ++result) { 1648 if (qstrnicmp(scursor, stdMonthNames[result], 3) == 0) { 1649 scursor += 3; 1650 return true; 1651 } 1652 } 1653 1654 // not found: 1655 return false; 1656 } 1657 1658 static const struct { 1659 const char tzName[5]; 1660 long int secsEastOfGMT; 1661 } timeZones[] = { 1662 // rfc 822 timezones: 1663 { "GMT", 0 }, 1664 { "UT", 0 }, 1665 { "EDT", -4 * 3600 }, 1666 { "EST", -5 * 3600 }, 1667 { "MST", -5 * 3600 }, 1668 { "CST", -6 * 3600 }, 1669 { "MDT", -6 * 3600 }, 1670 { "MST", -7 * 3600 }, 1671 { "PDT", -7 * 3600 }, 1672 { "PST", -8 * 3600 }, 1673 // common, non-rfc-822 zones: 1674 { "CET", 1 * 3600 }, 1675 { "MET", 1 * 3600 }, 1676 { "UTC", 0 }, 1677 { "CEST", 2 * 3600 }, 1678 { "BST", 1 * 3600 }, 1679 // rfc 822 military timezones: 1680 { "Z", 0 }, 1681 { "A", -1 * 3600 }, 1682 { "B", -2 * 3600 }, 1683 { "C", -3 * 3600 }, 1684 { "D", -4 * 3600 }, 1685 { "E", -5 * 3600 }, 1686 { "F", -6 * 3600 }, 1687 { "G", -7 * 3600 }, 1688 { "H", -8 * 3600 }, 1689 { "I", -9 * 3600 }, 1690 // J is not used! 1691 { "K", -10 * 3600 }, 1692 { "L", -11 * 3600 }, 1693 { "M", -12 * 3600 }, 1694 { "N", 1 * 3600 }, 1695 { "O", 2 * 3600 }, 1696 { "P", 3 * 3600 }, 1697 { "Q", 4 * 3600 }, 1698 { "R", 5 * 3600 }, 1699 { "S", 6 * 3600 }, 1700 { "T", 7 * 3600 }, 1701 { "U", 8 * 3600 }, 1702 { "V", 9 * 3600 }, 1703 { "W", 10 * 3600 }, 1704 { "X", 11 * 3600 }, 1705 { "Y", 12 * 3600 }, 1706 }; 1707 static const int timeZonesLen = sizeof timeZones / sizeof *timeZones; 1708 1709 static bool parseAlphaNumericTimeZone(const char *&scursor, 1710 const char *const send, 1711 long int &secsEastOfGMT, 1712 bool &timeZoneKnown) 1713 { 1714 // allow the timezone to be wrapped in quotes; bug 260761 1715 if (scursor < send && *scursor == '"') { 1716 scursor++; 1717 1718 if (scursor == send) { 1719 return false; 1720 } 1721 } 1722 1723 QPair<const char *, int> maybeTimeZone(nullptr, 0); 1724 if (!parseToken(scursor, send, maybeTimeZone, ParseTokenNoFlag)) { 1725 return false; 1726 } 1727 for (int i = 0 ; i < timeZonesLen ; ++i) { 1728 if (qstrnicmp(timeZones[i].tzName, 1729 maybeTimeZone.first, maybeTimeZone.second) == 0) { 1730 scursor += maybeTimeZone.second; 1731 secsEastOfGMT = timeZones[i].secsEastOfGMT; 1732 timeZoneKnown = true; 1733 1734 if (scursor < send && *scursor == '"') { 1735 scursor++; 1736 } 1737 1738 return true; 1739 } 1740 } 1741 1742 // don't choke just because we don't happen to know the time zone 1743 KMIME_WARN_UNKNOWN(time zone, 1744 QByteArray(maybeTimeZone.first, maybeTimeZone.second)); 1745 secsEastOfGMT = 0; 1746 timeZoneKnown = false; 1747 return true; 1748 } 1749 1750 // parse a number and return the number of digits parsed: 1751 int parseDigits(const char *&scursor, const char *const send, int &result) 1752 { 1753 result = 0; 1754 int digits = 0; 1755 for (; scursor != send && isdigit(*scursor) ; scursor++, digits++) { 1756 result *= 10; 1757 result += int(*scursor - '0'); 1758 } 1759 return digits; 1760 } 1761 1762 static bool parseTimeOfDay(const char *&scursor, const char *const send, 1763 int &hour, int &min, int &sec, bool isCRLF = false) 1764 { 1765 // time-of-day := 2DIGIT [CFWS] ":" [CFWS] 2DIGIT [ [CFWS] ":" 2DIGIT ] 1766 1767 // 1768 // 2DIGIT representing "hour": 1769 // 1770 if (!parseDigits(scursor, send, hour)) { 1771 return false; 1772 } 1773 1774 eatCFWS(scursor, send, isCRLF); 1775 if (scursor == send || *scursor != ':') { 1776 return false; 1777 } 1778 scursor++; // eat ':' 1779 1780 eatCFWS(scursor, send, isCRLF); 1781 if (scursor == send) { 1782 return false; 1783 } 1784 1785 // 1786 // 2DIGIT representing "minute": 1787 // 1788 if (!parseDigits(scursor, send, min)) { 1789 return false; 1790 } 1791 1792 eatCFWS(scursor, send, isCRLF); 1793 if (scursor == send) { 1794 return true; // seconds are optional 1795 } 1796 1797 // 1798 // let's see if we have a 2DIGIT representing "second": 1799 // 1800 if (*scursor == ':') { 1801 // yepp, there are seconds: 1802 scursor++; // eat ':' 1803 eatCFWS(scursor, send, isCRLF); 1804 if (scursor == send) { 1805 return false; 1806 } 1807 1808 if (!parseDigits(scursor, send, sec)) { 1809 return false; 1810 } 1811 } else { 1812 sec = 0; 1813 } 1814 1815 return true; 1816 } 1817 1818 bool parseTime(const char *&scursor, const char *send, 1819 int &hour, int &min, int &sec, long int &secsEastOfGMT, 1820 bool &timeZoneKnown, bool isCRLF) 1821 { 1822 // time := time-of-day CFWS ( zone / obs-zone ) 1823 // 1824 // obs-zone := "UT" / "GMT" / 1825 // "EST" / "EDT" / ; -0500 / -0400 1826 // "CST" / "CDT" / ; -0600 / -0500 1827 // "MST" / "MDT" / ; -0700 / -0600 1828 // "PST" / "PDT" / ; -0800 / -0700 1829 // "A"-"I" / "a"-"i" / 1830 // "K"-"Z" / "k"-"z" 1831 1832 eatCFWS(scursor, send, isCRLF); 1833 if (scursor == send) { 1834 return false; 1835 } 1836 1837 if (!parseTimeOfDay(scursor, send, hour, min, sec, isCRLF)) { 1838 return false; 1839 } 1840 1841 eatCFWS(scursor, send, isCRLF); 1842 // there might be no timezone but a year following 1843 if ((scursor == send) || isdigit(*scursor)) { 1844 timeZoneKnown = false; 1845 secsEastOfGMT = 0; 1846 return true; // allow missing timezone 1847 } 1848 1849 timeZoneKnown = true; 1850 if (*scursor == '+' || *scursor == '-') { 1851 // remember and eat '-'/'+': 1852 const char sign = *scursor++; 1853 // numerical timezone: 1854 int maybeTimeZone; 1855 const int tzDigits = parseDigits(scursor, send, maybeTimeZone); 1856 if (tzDigits != 4) { 1857 // Allow timezones in 02:00 format 1858 if (tzDigits == 2 && scursor != send && *scursor == ':') { 1859 scursor++; 1860 int maybeTimeZone2; 1861 if (parseDigits(scursor, send, maybeTimeZone2) != 2) { 1862 return false; 1863 } 1864 maybeTimeZone = maybeTimeZone * 100 + maybeTimeZone2; 1865 } else { 1866 return false; 1867 } 1868 } 1869 secsEastOfGMT = 60 * (maybeTimeZone / 100 * 60 + maybeTimeZone % 100); 1870 if (sign == '-') { 1871 secsEastOfGMT *= -1; 1872 if (secsEastOfGMT == 0) { 1873 timeZoneKnown = false; // -0000 means indetermined tz 1874 } 1875 } 1876 } else { 1877 // maybe alphanumeric timezone: 1878 if (!parseAlphaNumericTimeZone(scursor, send, secsEastOfGMT, timeZoneKnown)) { 1879 return false; 1880 } 1881 } 1882 return true; 1883 } 1884 1885 bool parseQDateTime(const char *&scursor, const char *const send, 1886 QDateTime &result, bool isCRLF) 1887 { 1888 eatCFWS(scursor, send, isCRLF); 1889 if (scursor == send) { 1890 return false; 1891 } 1892 // In qt6 yy == 1900 ! => for sure we use 2000 here. 1893 result = QDateTime::fromString(QString::fromLatin1(scursor, 17), QStringLiteral("dd/MM/yy HH:mm:ss")); 1894 QDate resultDate = result.date(); 1895 resultDate.setDate(resultDate.year() + 100, resultDate.month(), resultDate.day()); 1896 result.setDate(resultDate); 1897 return result.isValid(); 1898 } 1899 1900 bool parseDateTime(const char *&scursor, const char *const send, 1901 QDateTime &result, bool isCRLF) 1902 { 1903 // Parsing date-time; strict mode: 1904 // 1905 // date-time := [ [CFWS] day-name [CFWS] "," ] ; wday 1906 // (expanded) [CFWS] 1*2DIGIT CFWS month-name CFWS 2*DIGIT [CFWS] ; date 1907 // time 1908 // 1909 // day-name := "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun" 1910 // month-name := "Jan" / "Feb" / "Mar" / "Apr" / "May" / "Jun" / 1911 // "Jul" / "Aug" / "Sep" / "Oct" / "Nov" / "Dec" 1912 1913 result = QDateTime(); 1914 1915 eatCFWS(scursor, send, isCRLF); 1916 if (scursor == send) { 1917 return false; 1918 } 1919 1920 // 1921 // let's see if there's a day-of-week: 1922 // 1923 if (parseDayName(scursor, send)) { 1924 eatCFWS(scursor, send, isCRLF); 1925 if (scursor == send) { 1926 return false; 1927 } 1928 // day-name should be followed by ',' but we treat it as optional: 1929 if (*scursor == ',') { 1930 scursor++; // eat ',' 1931 eatCFWS(scursor, send, isCRLF); 1932 } 1933 } 1934 1935 int maybeMonth = -1; 1936 bool asctimeFormat = false; 1937 1938 // ANSI-C asctime() format is: Wed Jun 30 21:49:08 1993 1939 if (!isdigit(*scursor) && parseMonthName(scursor, send, maybeMonth)) { 1940 asctimeFormat = true; 1941 eatCFWS(scursor, send, isCRLF); 1942 } 1943 1944 // 1945 // 1*2DIGIT representing "day" (of month): 1946 // 1947 int maybeDay; 1948 if (!parseDigits(scursor, send, maybeDay)) { 1949 return false; 1950 } 1951 1952 eatCFWS(scursor, send, isCRLF); 1953 if (scursor == send) { 1954 return false; 1955 } 1956 1957 // ignore ","; bug 54098 1958 if (*scursor == ',') { 1959 scursor++; 1960 } 1961 1962 // 1963 // month-name: 1964 // 1965 if (!asctimeFormat && !parseMonthName(scursor, send, maybeMonth)) { 1966 return false; 1967 } 1968 if (scursor == send) { 1969 return false; 1970 } 1971 assert(maybeMonth >= 0); assert(maybeMonth <= 11); 1972 ++maybeMonth; // 0-11 -> 1-12 1973 1974 eatCFWS(scursor, send, isCRLF); 1975 if (scursor == send) { 1976 return false; 1977 } 1978 1979 // check for "year HH:MM:SS" or only "HH:MM:SS" (or "H:MM:SS") 1980 bool timeAfterYear = true; 1981 if ((send - scursor > 3) && ((scursor[1] == ':') || (scursor[2] == ':'))) { 1982 timeAfterYear = false; // first read time, then year 1983 } 1984 1985 // 1986 // 2*DIGIT representing "year": 1987 // 1988 int maybeYear = 0; 1989 1990 if (timeAfterYear && !parseDigits(scursor, send, maybeYear)) { 1991 return false; 1992 } 1993 1994 eatCFWS(scursor, send, isCRLF); 1995 int maybeHour; 1996 int maybeMinute; 1997 int maybeSecond; 1998 long int secsEastOfGMT = 0; 1999 QDate maybeDate; 2000 QTime maybeTime; 2001 if (scursor != send) { 2002 // 2003 // time 2004 // 2005 bool timeZoneKnown = true; 2006 2007 if (!parseTime(scursor, send, 2008 maybeHour, maybeMinute, maybeSecond, 2009 secsEastOfGMT, timeZoneKnown, isCRLF)) { 2010 return false; 2011 } 2012 2013 // in asctime() the year follows the time 2014 if (!timeAfterYear) { 2015 eatCFWS(scursor, send, isCRLF); 2016 if (scursor == send) { 2017 return false; 2018 } 2019 2020 if (!parseDigits(scursor, send, maybeYear)) { 2021 return false; 2022 } 2023 } 2024 2025 // RFC 2822 4.3 processing: 2026 if (maybeYear < 50) { 2027 maybeYear += 2000; 2028 } else if (maybeYear < 1000) { 2029 maybeYear += 1900; 2030 } 2031 // else keep as is 2032 if (maybeYear < 1900) { 2033 return false; // rfc2822, 3.3 2034 } 2035 2036 maybeDate = QDate(maybeYear, maybeMonth, maybeDay); 2037 maybeTime = QTime(maybeHour, maybeMinute, maybeSecond); 2038 2039 if (!maybeDate.isValid() || !maybeTime.isValid()) { 2040 return false; 2041 } 2042 } else { 2043 maybeDate = QDate(maybeYear, maybeMonth, maybeDay); 2044 maybeTime = QTime(0, 0, 0); 2045 } 2046 2047 result = QDateTime(maybeDate, maybeTime, QTimeZone::fromSecondsAheadOfUtc(secsEastOfGMT)); 2048 if (!result.isValid()) { 2049 return false; 2050 } 2051 return true; 2052 } 2053 2054 namespace { 2055 2056 Headers::Base *extractHeader(QByteArrayView head, const int headerStart, int &endOfFieldBody) 2057 { 2058 Headers::Base *header = {}; 2059 2060 int startOfFieldBody = head.indexOf(':', headerStart); 2061 if (startOfFieldBody < 0) { 2062 return nullptr; 2063 } 2064 2065 const char *rawType = head.constData() + headerStart; 2066 const size_t rawTypeLen = startOfFieldBody - headerStart; 2067 2068 startOfFieldBody++; //skip the ':' 2069 if (startOfFieldBody < head.size() - 1 && head[startOfFieldBody] == ' ') { // skip the space after the ':', if there's any 2070 startOfFieldBody++; 2071 } 2072 2073 bool folded = false; 2074 endOfFieldBody = findHeaderLineEnd(head, startOfFieldBody, &folded); 2075 2076 // We might get an invalid mail without a field name, don't crash on that. 2077 if (rawTypeLen > 0) { 2078 header = HeaderFactory::createHeader(rawType, rawTypeLen); 2079 } 2080 if (!header) { 2081 //qCWarning(KMIME_LOG)() << "Returning Generic header of type" << rawType; 2082 header = new Headers::Generic(rawType, rawTypeLen); 2083 } 2084 if (folded) { 2085 const auto unfoldedBody = unfoldHeader(head.constData() + startOfFieldBody, endOfFieldBody - startOfFieldBody); 2086 header->from7BitString(unfoldedBody); 2087 } else { 2088 header->from7BitString(head.constData() + startOfFieldBody, endOfFieldBody - startOfFieldBody); 2089 } 2090 2091 return header; 2092 } 2093 2094 } 2095 2096 std::unique_ptr<KMime::Headers::Base> parseNextHeader(QByteArrayView &head) 2097 { 2098 int endOfFieldBody = 0; 2099 std::unique_ptr<KMime::Headers::Base> header(extractHeader(head, 0, endOfFieldBody)); 2100 if (header) { 2101 head = head.mid(endOfFieldBody + 1); 2102 } else { 2103 head = {}; 2104 } 2105 2106 return header; 2107 } 2108 2109 void extractHeaderAndBody(const QByteArray &content, QByteArray &header, QByteArray &body) 2110 { 2111 header.clear(); 2112 body.clear(); 2113 2114 // empty header 2115 if (content.startsWith('\n')) { 2116 body = content.right(content.length() - 1); 2117 return; 2118 } 2119 2120 int pos = content.indexOf("\n\n", 0); 2121 if (pos > -1) { 2122 header = content.left(++pos); //header *must* end with "\n" !! 2123 body = content.mid(pos + 1); 2124 if (body.startsWith("\n")) { 2125 body = "\n" + body; 2126 } 2127 } else { 2128 header = content; 2129 } 2130 } 2131 2132 QList<Headers::Base *> parseHeaders(const QByteArray &head) { 2133 QList<Headers::Base *> ret; 2134 2135 int cursor = 0; 2136 while (cursor < head.size()) { 2137 const int headerStart = cursor; 2138 int endOfFieldBody; 2139 if (auto header = extractHeader(head, headerStart, endOfFieldBody)) { 2140 ret << header; 2141 cursor = endOfFieldBody + 1; 2142 } else { 2143 break; 2144 } 2145 } 2146 2147 return ret; 2148 } 2149 2150 } // namespace HeaderParsing 2151 2152 } // namespace KMime