File indexing completed on 2024-03-24 16:53:06
0001 /* -*- c++ -*- 0002 kmime_header_parsing.cpp 0003 0004 KMime, the KDE Internet mail/usenet news message library. 0005 SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org> 0006 0007 SPDX-License-Identifier: LGPL-2.0-or-later 0008 */ 0009 0010 #include "kmime_header_parsing.h" 0011 0012 #include "kmime_headerfactory_p.h" 0013 #include "kmime_headers.h" 0014 #include "kmime_util.h" 0015 #include "kmime_util_p.h" 0016 #include "kmime_codecs_p.h" 0017 #include "kmime_dateformatter.h" 0018 #include "kmime_debug.h" 0019 #include "kmime_warning_p.h" 0020 0021 #include <KCodecs> 0022 0023 #include <QMap> 0024 #include <QStringDecoder> 0025 #include <QTimeZone> 0026 0027 #include <cassert> 0028 #include <cctype> // for isdigit 0029 0030 using namespace KMime; 0031 using namespace KMime::Types; 0032 0033 namespace KMime 0034 { 0035 0036 namespace Types 0037 { 0038 // Optimization to avoid allocating QStrings when the value isn't encoded 0039 struct KMIME_EXPORT QStringOrQPair { 0040 QStringOrQPair() : qstring(), qpair(nullptr, 0) {} 0041 QString qstring; 0042 QPair<const char *, int> qpair; 0043 }; 0044 } // namespace Types 0045 0046 namespace HeaderParsing 0047 { 0048 0049 // parse the encoded-word (scursor points to after the initial '=') 0050 bool parseEncodedWord(const char *&scursor, const char *const send, 0051 QString &result, QByteArray &language, 0052 QByteArray &usedCS, const QByteArray &defaultCS) 0053 { 0054 // make sure the caller already did a bit of the work. 0055 assert(*(scursor - 1) == '='); 0056 0057 // 0058 // STEP 1: 0059 // scan for the charset/language portion of the encoded-word 0060 // 0061 0062 char ch = *scursor++; 0063 0064 if (ch != '?') { 0065 // qCDebug(KMIME_LOG) << "first"; 0066 //KMIME_WARN_PREMATURE_END_OF( EncodedWord ); 0067 return false; 0068 } 0069 0070 // remember start of charset (ie. just after the initial "=?") and 0071 // language (just after the first '*') fields: 0072 const char *charsetStart = scursor; 0073 const char *languageStart = nullptr; 0074 0075 // find delimiting '?' (and the '*' separating charset and language 0076 // tags, if any): 0077 for (; scursor != send ; scursor++) { 0078 if (*scursor == '?') { 0079 break; 0080 } else if (*scursor == '*' && languageStart == nullptr) { 0081 languageStart = scursor + 1; 0082 } 0083 } 0084 0085 // not found? can't be an encoded-word! 0086 if (scursor == send || *scursor != '?') { 0087 // qCDebug(KMIME_LOG) << "second"; 0088 KMIME_WARN_PREMATURE_END_OF(EncodedWord); 0089 return false; 0090 } 0091 0092 // extract the language information, if any (if languageStart is 0, 0093 // language will be null, too): 0094 QByteArray maybeLanguage(languageStart, scursor - languageStart); 0095 // extract charset information (keep in mind: the size given to the 0096 // ctor is one off due to the \0 terminator): 0097 QByteArray maybeCharset(charsetStart, 0098 (languageStart ? languageStart - 1 : scursor) - charsetStart); 0099 0100 // 0101 // STEP 2: 0102 // scan for the encoding portion of the encoded-word 0103 // 0104 0105 // remember start of encoding (just _after_ the second '?'): 0106 scursor++; 0107 const char *encodingStart = scursor; 0108 0109 // find next '?' (ending the encoding tag): 0110 for (; scursor != send ; scursor++) { 0111 if (*scursor == '?') { 0112 break; 0113 } 0114 } 0115 0116 // not found? Can't be an encoded-word! 0117 if (scursor == send || *scursor != '?') { 0118 // qCDebug(KMIME_LOG) << "third"; 0119 KMIME_WARN_PREMATURE_END_OF(EncodedWord); 0120 return false; 0121 } 0122 0123 // extract the encoding information: 0124 QByteArray maybeEncoding(encodingStart, scursor - encodingStart); 0125 0126 // qCDebug(KMIME_LOG) << "parseEncodedWord: found charset == \"" << maybeCharset 0127 // << "\"; language == \"" << maybeLanguage 0128 // << "\"; encoding == \"" << maybeEncoding << "\""; 0129 0130 // 0131 // STEP 3: 0132 // scan for encoded-text portion of encoded-word 0133 // 0134 0135 // remember start of encoded-text (just after the third '?'): 0136 scursor++; 0137 const char *encodedTextStart = scursor; 0138 0139 // find the '?=' sequence (ending the encoded-text): 0140 for (; scursor != send ; scursor++) { 0141 if (*scursor == '?') { 0142 if (scursor + 1 != send) { 0143 if (*(scursor + 1) != '=') { // We expect a '=' after the '?', but we got something else; ignore 0144 KMIME_WARN << "Stray '?' in q-encoded word, ignoring this."; 0145 continue; 0146 } else { // yep, found a '?=' sequence 0147 scursor += 2; 0148 break; 0149 } 0150 } else { // The '?' is the last char, but we need a '=' after it! 0151 KMIME_WARN_PREMATURE_END_OF(EncodedWord); 0152 return false; 0153 } 0154 } 0155 } 0156 0157 if (*(scursor - 2) != '?' || *(scursor - 1) != '=' || 0158 scursor < encodedTextStart + 2) { 0159 KMIME_WARN_PREMATURE_END_OF(EncodedWord); 0160 return false; 0161 } 0162 0163 // set end sentinel for encoded-text: 0164 const char *const encodedTextEnd = scursor - 2; 0165 0166 // 0167 // STEP 4: 0168 // setup decoders for the transfer encoding and the charset 0169 // 0170 0171 // try if there's a codec for the encoding found: 0172 KCodecs::Codec *codec = KCodecs::Codec::codecForName(maybeEncoding); 0173 if (!codec) { 0174 KMIME_WARN_UNKNOWN(Encoding, maybeEncoding); 0175 return false; 0176 } 0177 0178 // get an instance of a corresponding decoder: 0179 KCodecs::Decoder *dec = codec->makeDecoder(); 0180 assert(dec); 0181 0182 // try if there's a (text)codec for the charset found: 0183 QStringDecoder textCodec; 0184 if (maybeCharset.isEmpty()) { 0185 textCodec = QStringDecoder(defaultCS.constData()); 0186 if (!textCodec.isValid()) { 0187 textCodec = QStringDecoder(QStringDecoder::Latin1); 0188 } 0189 usedCS = cachedCharset(defaultCS); 0190 } else { 0191 textCodec = QStringDecoder(maybeCharset.constData()); 0192 if (textCodec.isValid()) { //no suitable codec found => use default charset 0193 usedCS = cachedCharset(defaultCS); 0194 } else { 0195 textCodec = QStringDecoder(QStringDecoder::Latin1); 0196 usedCS = cachedCharset(maybeCharset); 0197 } 0198 } 0199 0200 if (!textCodec.isValid()) { 0201 KMIME_WARN_UNKNOWN(Charset, maybeCharset); 0202 delete dec; 0203 return false; 0204 }; 0205 0206 // qCDebug(KMIME_LOG) << "mimeName(): \"" << textCodec->name() << "\""; 0207 0208 // allocate a temporary buffer to store the 8bit text: 0209 int encodedTextLength = encodedTextEnd - encodedTextStart; 0210 QByteArray buffer; 0211 buffer.resize(codec->maxDecodedSizeFor(encodedTextLength)); 0212 char *bbegin = buffer.data(); 0213 char *bend = bbegin + buffer.length(); 0214 0215 // 0216 // STEP 5: 0217 // do the actual decoding 0218 // 0219 0220 if (!dec->decode(encodedTextStart, encodedTextEnd, bbegin, bend)) { 0221 KMIME_WARN << codec->name() << "codec lies about its maxDecodedSizeFor(" 0222 << encodedTextLength << ")\nresult may be truncated"; 0223 } 0224 0225 result = textCodec.decode(QByteArrayView(buffer.data(), bbegin - buffer.data())); 0226 0227 // qCDebug(KMIME_LOG) << "result now: \"" << result << "\""; 0228 // cleanup: 0229 delete dec; 0230 language = maybeLanguage; 0231 0232 return true; 0233 } 0234 0235 static inline void eatWhiteSpace(const char *&scursor, const char *const send) 0236 { 0237 while (scursor != send && 0238 (*scursor == ' ' || *scursor == '\n' || 0239 *scursor == '\t' || *scursor == '\r')) { 0240 scursor++; 0241 } 0242 } 0243 0244 bool parseAtom(const char*&scursor, const char *const send, 0245 QByteArray &result, bool allow8Bit) 0246 { 0247 QPair<const char *, int> maybeResult; 0248 0249 if (parseAtom(scursor, send, maybeResult, allow8Bit)) { 0250 result = QByteArray(maybeResult.first, maybeResult.second); 0251 return true; 0252 } 0253 0254 return false; 0255 } 0256 0257 bool parseAtom(const char*&scursor, const char *const send, 0258 QPair<const char *, int> &result, bool allow8Bit) 0259 { 0260 bool success = false; 0261 const char *start = scursor; 0262 0263 while (scursor != send) { 0264 signed char ch = *scursor++; 0265 if (ch > 0 && isAText(ch)) { 0266 // AText: OK 0267 success = true; 0268 } else if (allow8Bit && ch < 0) { 0269 // 8bit char: not OK, but be tolerant. 0270 KMIME_WARN_8BIT(ch); 0271 success = true; 0272 } else { 0273 // CTL or special - marking the end of the atom: 0274 // re-set sursor to point to the offending 0275 // char and return: 0276 scursor--; 0277 break; 0278 } 0279 } 0280 result.first = start; 0281 result.second = scursor - start; 0282 return success; 0283 } 0284 0285 bool parseToken(const char*&scursor, const char *const send, 0286 QByteArray &result, ParseTokenFlags flags) 0287 { 0288 QPair<const char *, int> maybeResult; 0289 0290 if (parseToken(scursor, send, maybeResult, flags)) { 0291 result = QByteArray(maybeResult.first, maybeResult.second); 0292 return true; 0293 } 0294 0295 return false; 0296 } 0297 0298 bool parseToken(const char*&scursor, const char *const send, 0299 QPair<const char *, int> &result, ParseTokenFlags flags) 0300 { 0301 bool success = false; 0302 const char *start = scursor; 0303 0304 while (scursor != send) { 0305 signed char ch = *scursor++; 0306 if (ch > 0 && isTText(ch)) { 0307 // TText: OK 0308 success = true; 0309 } else if ((flags & ParseTokenAllow8Bit) && ch < 0) { 0310 // 8bit char: not OK, but be tolerant. 0311 KMIME_WARN_8BIT(ch); 0312 success = true; 0313 } else if ((flags & ParseTokenRelaxedTText) && ch == '/') { 0314 success = true; 0315 } else { 0316 // CTL or tspecial - marking the end of the atom: 0317 // re-set sursor to point to the offending 0318 // char and return: 0319 scursor--; 0320 break; 0321 } 0322 } 0323 result.first = start; 0324 result.second = scursor - start; 0325 return success; 0326 } 0327 0328 #define READ_ch_OR_FAIL if ( scursor == send ) { \ 0329 KMIME_WARN_PREMATURE_END_OF( GenericQuotedString ); \ 0330 return false; \ 0331 } else { \ 0332 ch = *scursor++; \ 0333 } 0334 0335 // known issues: 0336 // 0337 // - doesn't handle quoted CRLF 0338 0339 bool parseGenericQuotedString(const char *&scursor, const char *const send, 0340 QString &result, bool isCRLF, 0341 const char openChar, const char closeChar) 0342 { 0343 // We are in a quoted-string or domain-literal or comment and the 0344 // cursor points to the first char after the openChar. 0345 // We will apply unfolding and quoted-pair removal. 0346 // We return when we either encounter the end or unescaped openChar 0347 // or closeChar. 0348 assert(*(scursor - 1) == openChar || *(scursor - 1) == closeChar); 0349 0350 while (scursor != send) { 0351 char ch = *scursor++; 0352 0353 if (ch == closeChar || ch == openChar) { 0354 // end of quoted-string or another opening char: 0355 // let caller decide what to do. 0356 return true; 0357 } 0358 0359 switch (ch) { 0360 case '\\': // quoted-pair 0361 // misses "\" CRLF LWSP-char handling, see rfc822, 3.4.5 0362 READ_ch_OR_FAIL; 0363 KMIME_WARN_IF_8BIT(ch); 0364 result += QLatin1Char(ch); 0365 break; 0366 case '\r': 0367 // ### 0368 // The case of lonely '\r' is easy to solve, as they're 0369 // not part of Unix Line-ending conventions. 0370 // But I see a problem if we are given Unix-native 0371 // line-ending-mails, where we cannot determine anymore 0372 // whether a given '\n' was part of a CRLF or was occurring 0373 // on it's own. 0374 READ_ch_OR_FAIL; 0375 if (ch != '\n') { 0376 // CR on it's own... 0377 KMIME_WARN_LONE(CR); 0378 result += QLatin1Char('\r'); 0379 scursor--; // points to after the '\r' again 0380 } else { 0381 // CRLF encountered. 0382 // lookahead: check for folding 0383 READ_ch_OR_FAIL; 0384 if (ch == ' ' || ch == '\t') { 0385 // correct folding; 0386 // position cursor behind the CRLF WSP (unfolding) 0387 // and add the WSP to the result 0388 result += QLatin1Char(ch); 0389 } else { 0390 // this is the "shouldn't happen"-case. There is a CRLF 0391 // inside a quoted-string without it being part of FWS. 0392 // We take it verbatim. 0393 KMIME_WARN_NON_FOLDING(CRLF); 0394 result += QLatin1StringView("\r\n"); 0395 // the cursor is decremented again, so's we need not 0396 // duplicate the whole switch here. "ch" could've been 0397 // everything (incl. openChar or closeChar). 0398 scursor--; 0399 } 0400 } 0401 break; 0402 case '\n': 0403 // Note: CRLF has been handled above already! 0404 // ### LF needs special treatment, depending on whether isCRLF 0405 // is true (we can be sure a lonely '\n' was meant this way) or 0406 // false ('\n' alone could have meant LF or CRLF in the original 0407 // message. This parser assumes CRLF iff the LF is followed by 0408 // either WSP (folding) or NULL (premature end of quoted-string; 0409 // Should be fixed, since NULL is allowed as per rfc822). 0410 READ_ch_OR_FAIL; 0411 if (!isCRLF && (ch == ' ' || ch == '\t')) { 0412 // folding 0413 // correct folding 0414 result += QLatin1Char(ch); 0415 } else { 0416 // non-folding 0417 KMIME_WARN_LONE(LF); 0418 result += QLatin1Char('\n'); 0419 // pos is decremented, so's we need not duplicate the whole 0420 // switch here. ch could've been everything (incl. <">, "\"). 0421 scursor--; 0422 } 0423 break; 0424 case '=': { 0425 // ### Work around broken clients that send encoded words in quoted-strings 0426 // For example, older KMail versions. 0427 if (scursor == send) { 0428 break; 0429 } 0430 0431 const char *oldscursor = scursor; 0432 QString tmp; 0433 QByteArray lang; 0434 QByteArray charset; 0435 if (*scursor++ == '?') { 0436 --scursor; 0437 if (parseEncodedWord(scursor, send, tmp, lang, charset)) { 0438 result += tmp; 0439 //qDebug() << " tmp " << tmp; 0440 if (scursor == send) { 0441 break; 0442 } else if (*scursor++ == ' ') { //Workaround Bug 362650 thunderbird add space for each new line 0443 if (scursor == send) { 0444 --scursor; 0445 break; 0446 } else if (*scursor++ == '=') { 0447 if (scursor == send) { 0448 --scursor; 0449 --scursor; 0450 break; 0451 } else if (*scursor++ == '?') { 0452 --scursor; 0453 --scursor; 0454 break; 0455 } 0456 } else { 0457 --scursor; 0458 --scursor; 0459 } 0460 } else { 0461 --scursor; 0462 } 0463 0464 break; 0465 } else { 0466 scursor = oldscursor; 0467 } 0468 } else { 0469 scursor = oldscursor; 0470 } 0471 // fall through 0472 [[fallthrough]]; 0473 } 0474 default: 0475 KMIME_WARN_IF_8BIT(ch); 0476 result += QLatin1Char(ch); 0477 } 0478 } 0479 0480 return false; 0481 } 0482 0483 // known issues: 0484 // 0485 // - doesn't handle encoded-word inside comments. 0486 0487 bool parseComment(const char *&scursor, const char *const send, 0488 QString &result, bool isCRLF, bool reallySave) 0489 { 0490 int commentNestingDepth = 1; 0491 const char *afterLastClosingParenPos = nullptr; 0492 QString maybeCmnt; 0493 const char *oldscursor = scursor; 0494 0495 assert(*(scursor - 1) == '('); 0496 0497 while (commentNestingDepth) { 0498 QString cmntPart; 0499 if (parseGenericQuotedString(scursor, send, cmntPart, isCRLF, '(', ')')) { 0500 assert(*(scursor - 1) == ')' || *(scursor - 1) == '('); 0501 // see the kdoc for above function for the possible conditions 0502 // we have to check: 0503 switch (*(scursor - 1)) { 0504 case ')': 0505 if (reallySave) { 0506 // add the chunk that's now surely inside the comment. 0507 result += maybeCmnt; 0508 result += cmntPart; 0509 if (commentNestingDepth > 1) { 0510 // don't add the outermost ')'... 0511 result += QLatin1Char(')'); 0512 } 0513 maybeCmnt.clear(); 0514 } 0515 afterLastClosingParenPos = scursor; 0516 --commentNestingDepth; 0517 break; 0518 case '(': 0519 if (reallySave) { 0520 // don't add to "result" yet, because we might find that we 0521 // are already outside the (broken) comment... 0522 maybeCmnt += cmntPart; 0523 maybeCmnt += QLatin1Char('('); 0524 } 0525 ++commentNestingDepth; 0526 break; 0527 default: assert(0); 0528 } // switch 0529 } else { 0530 // !parseGenericQuotedString, ie. premature end 0531 if (afterLastClosingParenPos) { 0532 scursor = afterLastClosingParenPos; 0533 } else { 0534 scursor = oldscursor; 0535 } 0536 return false; 0537 } 0538 } // while 0539 0540 return true; 0541 } 0542 0543 // known issues: none. 0544 0545 bool parsePhrase(const char *&scursor, const char *const send, 0546 QString &result, bool isCRLF) 0547 { 0548 enum { 0549 None, Phrase, Atom, EncodedWord, QuotedString 0550 } found = None; 0551 0552 QString tmp; 0553 QByteArray lang; 0554 QByteArray charset; 0555 QPair<const char *, int> tmpAtom; 0556 const char *successfullyParsed = nullptr; 0557 // only used by the encoded-word branch 0558 const char *oldscursor; 0559 // used to suppress whitespace between adjacent encoded-words 0560 // (rfc2047, 6.2): 0561 bool lastWasEncodedWord = false; 0562 0563 while (scursor != send) { 0564 char ch = *scursor++; 0565 switch (ch) { 0566 case '.': // broken, but allow for intorop's sake 0567 if (found == None) { 0568 --scursor; 0569 return false; 0570 } else { 0571 if (scursor != send && (*scursor == ' ' || *scursor == '\t')) { 0572 result += QLatin1StringView(". "); 0573 } else { 0574 result += QLatin1Char('.'); 0575 } 0576 successfullyParsed = scursor; 0577 } 0578 break; 0579 case '"': // quoted-string 0580 tmp.clear(); 0581 if (parseGenericQuotedString(scursor, send, tmp, isCRLF, '"', '"')) { 0582 successfullyParsed = scursor; 0583 assert(*(scursor - 1) == '"'); 0584 switch (found) { 0585 case None: 0586 found = QuotedString; 0587 break; 0588 case Phrase: 0589 case Atom: 0590 case EncodedWord: 0591 case QuotedString: 0592 found = Phrase; 0593 result += QLatin1Char(' '); // rfc822, 3.4.4 0594 break; 0595 default: 0596 assert(0); 0597 } 0598 lastWasEncodedWord = false; 0599 result += tmp; 0600 } else { 0601 // premature end of quoted string. 0602 // What to do? Return leading '"' as special? Return as quoted-string? 0603 // We do the latter if we already found something, else signal failure. 0604 if (found == None) { 0605 return false; 0606 } else { 0607 result += QLatin1Char(' '); // rfc822, 3.4.4 0608 result += tmp; 0609 return true; 0610 } 0611 } 0612 break; 0613 case '(': // comment 0614 // parse it, but ignore content: 0615 tmp.clear(); 0616 if (parseComment(scursor, send, tmp, isCRLF, 0617 false /*don't bother with the content*/)) { 0618 successfullyParsed = scursor; 0619 lastWasEncodedWord = false; // strictly interpreting rfc2047, 6.2 0620 } else { 0621 if (found == None) { 0622 return false; 0623 } else { 0624 scursor = successfullyParsed; 0625 return true; 0626 } 0627 } 0628 break; 0629 case '=': // encoded-word 0630 tmp.clear(); 0631 oldscursor = scursor; 0632 lang.clear(); 0633 charset.clear(); 0634 if (parseEncodedWord(scursor, send, tmp, lang, charset)) { 0635 successfullyParsed = scursor; 0636 switch (found) { 0637 case None: 0638 found = EncodedWord; 0639 break; 0640 case Phrase: 0641 case EncodedWord: 0642 case Atom: 0643 case QuotedString: 0644 if (!lastWasEncodedWord) { 0645 result += QLatin1Char(' '); // rfc822, 3.4.4 0646 } 0647 found = Phrase; 0648 break; 0649 default: assert(0); 0650 } 0651 lastWasEncodedWord = true; 0652 result += tmp; 0653 break; 0654 } else { 0655 // parse as atom: 0656 scursor = oldscursor; 0657 } 0658 [[fallthrough]]; 0659 // fall though... 0660 0661 default: //atom 0662 scursor--; 0663 if (parseAtom(scursor, send, tmpAtom, true /* allow 8bit */)) { 0664 successfullyParsed = scursor; 0665 switch (found) { 0666 case None: 0667 found = Atom; 0668 break; 0669 case Phrase: 0670 case Atom: 0671 case EncodedWord: 0672 case QuotedString: 0673 found = Phrase; 0674 result += QLatin1Char(' '); // rfc822, 3.4.4 0675 break; 0676 default: 0677 assert(0); 0678 } 0679 lastWasEncodedWord = false; 0680 result += QLatin1StringView(tmpAtom.first, tmpAtom.second); 0681 } else { 0682 if (found == None) { 0683 return false; 0684 } else { 0685 scursor = successfullyParsed; 0686 return true; 0687 } 0688 } 0689 } 0690 eatWhiteSpace(scursor, send); 0691 } 0692 0693 return found != None; 0694 } 0695 0696 bool parseDotAtom(const char *&scursor, const char *const send, 0697 QByteArray &result, bool isCRLF) 0698 { 0699 eatCFWS(scursor, send, isCRLF); 0700 0701 // always points to just after the last atom parsed: 0702 const char *successfullyParsed; 0703 0704 QByteArray tmp; 0705 if (!parseAtom(scursor, send, tmp, false /* no 8bit */)) { 0706 return false; 0707 } 0708 result += tmp; 0709 successfullyParsed = scursor; 0710 0711 while (scursor != send) { 0712 0713 // end of header or no '.' -> return 0714 if (scursor == send || *scursor != '.') { 0715 return true; 0716 } 0717 scursor++; // eat '.' 0718 0719 if (scursor == send || !isAText(*scursor)) { 0720 // end of header or no AText, but this time following a '.'!: 0721 // reset cursor to just after last successfully parsed char and 0722 // return: 0723 scursor = successfullyParsed; 0724 return true; 0725 } 0726 0727 // try to parse the next atom: 0728 QByteArray maybeAtom; 0729 if (!parseAtom(scursor, send, maybeAtom, false /*no 8bit*/)) { 0730 scursor = successfullyParsed; 0731 return true; 0732 } 0733 0734 result += '.'; 0735 result += maybeAtom; 0736 successfullyParsed = scursor; 0737 } 0738 0739 scursor = successfullyParsed; 0740 return true; 0741 } 0742 0743 void eatCFWS(const char *&scursor, const char *const send, bool isCRLF) 0744 { 0745 QString dummy; 0746 0747 while (scursor != send) { 0748 const char *oldscursor = scursor; 0749 0750 char ch = *scursor++; 0751 0752 switch (ch) { 0753 case ' ': 0754 case '\t': // whitespace 0755 case '\r': 0756 case '\n': // folding 0757 continue; 0758 0759 case '(': // comment 0760 if (parseComment(scursor, send, dummy, isCRLF, false /*don't save*/)) { 0761 continue; 0762 } 0763 scursor = oldscursor; 0764 return; 0765 0766 default: 0767 scursor = oldscursor; 0768 return; 0769 } 0770 } 0771 } 0772 0773 bool parseDomain(const char *&scursor, const char *const send, 0774 QString &result, bool isCRLF) 0775 { 0776 eatCFWS(scursor, send, isCRLF); 0777 if (scursor == send) { 0778 return false; 0779 } 0780 0781 // domain := dot-atom / domain-literal / atom *("." atom) 0782 // 0783 // equivalent to: 0784 // domain = dot-atom / domain-literal, 0785 // since parseDotAtom does allow CFWS between atoms and dots 0786 0787 if (*scursor == '[') { 0788 // domain-literal: 0789 QString maybeDomainLiteral; 0790 // eat '[': 0791 scursor++; 0792 while (parseGenericQuotedString(scursor, send, maybeDomainLiteral, 0793 isCRLF, '[', ']')) { 0794 if (scursor == send) { 0795 // end of header: check for closing ']': 0796 if (*(scursor - 1) == ']') { 0797 // OK, last char was ']': 0798 result = maybeDomainLiteral; 0799 return true; 0800 } else { 0801 // not OK, domain-literal wasn't closed: 0802 return false; 0803 } 0804 } 0805 // we hit openChar in parseGenericQuotedString. 0806 // include it in maybeDomainLiteral and keep on parsing: 0807 if (*(scursor - 1) == '[') { 0808 maybeDomainLiteral += QLatin1Char('['); 0809 continue; 0810 } 0811 // OK, real end of domain-literal: 0812 result = maybeDomainLiteral; 0813 return true; 0814 } 0815 } else { 0816 // dot-atom: 0817 QByteArray maybeDotAtom; 0818 if (parseDotAtom(scursor, send, maybeDotAtom, isCRLF)) { 0819 // Domain may end with '.', if so preserve it' 0820 if (scursor != send && *scursor == '.') { 0821 maybeDotAtom += '.'; 0822 scursor++; 0823 } 0824 result = QString::fromLatin1(maybeDotAtom); 0825 return true; 0826 } 0827 } 0828 return false; 0829 } 0830 0831 bool parseObsRoute(const char *&scursor, const char *const send, 0832 QStringList &result, bool isCRLF, bool save) 0833 { 0834 while (scursor != send) { 0835 eatCFWS(scursor, send, isCRLF); 0836 if (scursor == send) { 0837 return false; 0838 } 0839 0840 // empty entry: 0841 if (*scursor == ',') { 0842 scursor++; 0843 if (save) { 0844 result.append(QString()); 0845 } 0846 continue; 0847 } 0848 0849 // empty entry ending the list: 0850 if (*scursor == ':') { 0851 scursor++; 0852 if (save) { 0853 result.append(QString()); 0854 } 0855 return true; 0856 } 0857 0858 // each non-empty entry must begin with '@': 0859 if (*scursor != '@') { 0860 return false; 0861 } else { 0862 scursor++; 0863 } 0864 0865 QString maybeDomain; 0866 if (!parseDomain(scursor, send, maybeDomain, isCRLF)) { 0867 return false; 0868 } 0869 if (save) { 0870 result.append(maybeDomain); 0871 } 0872 0873 // eat the following (optional) comma: 0874 eatCFWS(scursor, send, isCRLF); 0875 if (scursor == send) { 0876 return false; 0877 } 0878 if (*scursor == ':') { 0879 scursor++; 0880 return true; 0881 } 0882 if (*scursor == ',') { 0883 scursor++; 0884 } 0885 } 0886 0887 return false; 0888 } 0889 0890 bool parseAddrSpec(const char *&scursor, const char *const send, 0891 AddrSpec &result, bool isCRLF) 0892 { 0893 // 0894 // STEP 1: 0895 // local-part := dot-atom / quoted-string / word *("." word) 0896 // 0897 // this is equivalent to: 0898 // local-part := word *("." word) 0899 0900 QString maybeLocalPart; 0901 QString tmp; 0902 QPair<const char *, int> tmpAtom; 0903 0904 while (scursor != send) { 0905 // first, eat any whitespace 0906 eatCFWS(scursor, send, isCRLF); 0907 0908 char ch = *scursor++; 0909 switch (ch) { 0910 case '.': // dot 0911 maybeLocalPart += QLatin1Char('.'); 0912 break; 0913 0914 case '@': 0915 goto SAW_AT_SIGN; 0916 break; 0917 0918 case '"': // quoted-string 0919 tmp.clear(); 0920 if (parseGenericQuotedString(scursor, send, tmp, isCRLF, '"', '"')) { 0921 maybeLocalPart += tmp; 0922 } else { 0923 return false; 0924 } 0925 break; 0926 0927 default: // atom 0928 scursor--; // re-set scursor to point to ch again 0929 if (parseAtom(scursor, send, tmpAtom, false /* no 8bit */)) { 0930 maybeLocalPart += 0931 QLatin1StringView(tmpAtom.first, tmpAtom.second); 0932 } else { 0933 return false; // parseAtom can only fail if the first char is non-atext. 0934 } 0935 break; 0936 } 0937 } 0938 0939 return false; 0940 0941 // 0942 // STEP 2: 0943 // domain 0944 // 0945 0946 SAW_AT_SIGN: 0947 0948 assert(*(scursor - 1) == '@'); 0949 0950 QString maybeDomain; 0951 if (!parseDomain(scursor, send, maybeDomain, isCRLF)) { 0952 return false; 0953 } 0954 0955 result.localPart = maybeLocalPart; 0956 result.domain = maybeDomain; 0957 0958 return true; 0959 } 0960 0961 bool parseAngleAddr(const char *&scursor, const char *const send, 0962 AddrSpec &result, bool isCRLF) 0963 { 0964 // first, we need an opening angle bracket: 0965 eatCFWS(scursor, send, isCRLF); 0966 if (scursor == send || *scursor != '<') { 0967 return false; 0968 } 0969 scursor++; // eat '<' 0970 0971 eatCFWS(scursor, send, isCRLF); 0972 if (scursor == send) { 0973 return false; 0974 } 0975 0976 if (*scursor == '@' || *scursor == ',') { 0977 // obs-route: parse, but ignore: 0978 KMIME_WARN << "obsolete source route found! ignoring."; 0979 QStringList dummy; 0980 if (!parseObsRoute(scursor, send, dummy, 0981 isCRLF, false /* don't save */)) { 0982 return false; 0983 } 0984 // angle-addr isn't complete until after the '>': 0985 if (scursor == send) { 0986 return false; 0987 } 0988 } 0989 0990 // parse addr-spec: 0991 AddrSpec maybeAddrSpec; 0992 if (!parseAddrSpec(scursor, send, maybeAddrSpec, isCRLF)) { 0993 return false; 0994 } 0995 0996 eatCFWS(scursor, send, isCRLF); 0997 if (scursor == send || *scursor != '>') { 0998 return false; 0999 } 1000 scursor++; 1001 1002 result = maybeAddrSpec; 1003 return true; 1004 1005 } 1006 1007 static QString stripQuotes(const QString &input) 1008 { 1009 const QLatin1Char quotes('"'); 1010 if (input.startsWith(quotes) && input.endsWith(quotes)) { 1011 QString stripped(input.mid(1, input.size() - 2)); 1012 return stripped; 1013 } else { 1014 return input; 1015 } 1016 } 1017 1018 bool parseMailbox(const char *&scursor, const char *const send, 1019 Mailbox &result, bool isCRLF) 1020 { 1021 eatCFWS(scursor, send, isCRLF); 1022 if (scursor == send) { 1023 return false; 1024 } 1025 1026 AddrSpec maybeAddrSpec; 1027 QString maybeDisplayName; 1028 1029 // first, try if it's a vanilla addr-spec: 1030 const char *oldscursor = scursor; 1031 if (parseAddrSpec(scursor, send, maybeAddrSpec, isCRLF)) { 1032 result.setAddress(maybeAddrSpec); 1033 // check for the obsolete form of display-name (as comment): 1034 eatWhiteSpace(scursor, send); 1035 if (scursor != send && *scursor == '(') { 1036 scursor++; 1037 if (!parseComment(scursor, send, maybeDisplayName, isCRLF, true /*keep*/)) { 1038 return false; 1039 } 1040 } 1041 result.setName(stripQuotes(maybeDisplayName)); 1042 return true; 1043 } 1044 scursor = oldscursor; 1045 1046 // second, see if there's a display-name: 1047 if (!parsePhrase(scursor, send, maybeDisplayName, isCRLF)) { 1048 // failed: reset cursor, note absent display-name 1049 maybeDisplayName.clear(); 1050 scursor = oldscursor; 1051 } else { 1052 // succeeded: eat CFWS 1053 eatCFWS(scursor, send, isCRLF); 1054 if (scursor == send) { 1055 return false; 1056 } 1057 } 1058 1059 // third, parse the angle-addr: 1060 if (!parseAngleAddr(scursor, send, maybeAddrSpec, isCRLF)) { 1061 return false; 1062 } 1063 1064 if (maybeDisplayName.isNull()) { 1065 // check for the obsolete form of display-name (as comment): 1066 eatWhiteSpace(scursor, send); 1067 if (scursor != send && *scursor == '(') { 1068 scursor++; 1069 if (!parseComment(scursor, send, maybeDisplayName, isCRLF, true /*keep*/)) { 1070 return false; 1071 } 1072 } 1073 } 1074 1075 result.setName(stripQuotes(maybeDisplayName)); 1076 result.setAddress(maybeAddrSpec); 1077 return true; 1078 } 1079 1080 bool parseGroup(const char *&scursor, const char *const send, 1081 Address &result, bool isCRLF) 1082 { 1083 // group := display-name ":" [ mailbox-list / CFWS ] ";" [CFWS] 1084 // 1085 // equivalent to: 1086 // group := display-name ":" [ obs-mbox-list ] ";" 1087 1088 eatCFWS(scursor, send, isCRLF); 1089 if (scursor == send) { 1090 return false; 1091 } 1092 1093 // get display-name: 1094 QString maybeDisplayName; 1095 if (!parsePhrase(scursor, send, maybeDisplayName, isCRLF)) { 1096 return false; 1097 } 1098 1099 // get ":": 1100 eatCFWS(scursor, send, isCRLF); 1101 if (scursor == send || *scursor != ':') { 1102 return false; 1103 } 1104 1105 // KDE5 TODO: Don't expose displayName as public, but rather add setter for it that 1106 // automatically calls removeBidiControlChars 1107 result.displayName = removeBidiControlChars(maybeDisplayName); 1108 1109 // get obs-mbox-list (may contain empty entries): 1110 scursor++; 1111 while (scursor != send) { 1112 eatCFWS(scursor, send, isCRLF); 1113 if (scursor == send) { 1114 return false; 1115 } 1116 1117 // empty entry: 1118 if (*scursor == ',') { 1119 scursor++; 1120 continue; 1121 } 1122 1123 // empty entry ending the list: 1124 if (*scursor == ';') { 1125 scursor++; 1126 return true; 1127 } 1128 1129 Mailbox maybeMailbox; 1130 if (!parseMailbox(scursor, send, maybeMailbox, isCRLF)) { 1131 return false; 1132 } 1133 result.mailboxList.append(maybeMailbox); 1134 1135 eatCFWS(scursor, send, isCRLF); 1136 // premature end: 1137 if (scursor == send) { 1138 return false; 1139 } 1140 // regular end of the list: 1141 if (*scursor == ';') { 1142 scursor++; 1143 return true; 1144 } 1145 // eat regular list entry separator: 1146 if (*scursor == ',') { 1147 scursor++; 1148 } 1149 } 1150 return false; 1151 } 1152 1153 bool parseAddress(const char *&scursor, const char *const send, 1154 Address &result, bool isCRLF) 1155 { 1156 // address := mailbox / group 1157 1158 eatCFWS(scursor, send, isCRLF); 1159 if (scursor == send) { 1160 return false; 1161 } 1162 1163 // first try if it's a single mailbox: 1164 Mailbox maybeMailbox; 1165 const char *oldscursor = scursor; 1166 if (parseMailbox(scursor, send, maybeMailbox, isCRLF)) { 1167 // yes, it is: 1168 result.displayName.clear(); 1169 result.mailboxList.append(maybeMailbox); 1170 return true; 1171 } 1172 scursor = oldscursor; 1173 1174 Address maybeAddress; 1175 1176 // no, it's not a single mailbox. Try if it's a group: 1177 if (!parseGroup(scursor, send, maybeAddress, isCRLF)) { 1178 return false; 1179 } 1180 1181 result = maybeAddress; 1182 return true; 1183 } 1184 1185 bool parseAddressList(const char *&scursor, const char *const send, 1186 AddressList &result, bool isCRLF) 1187 { 1188 while (scursor != send) { 1189 eatCFWS(scursor, send, isCRLF); 1190 // end of header: this is OK. 1191 if (scursor == send) { 1192 return true; 1193 } 1194 // empty entry: ignore: 1195 if (*scursor == ',') { 1196 scursor++; 1197 continue; 1198 } 1199 // broken clients might use ';' as list delimiter, accept that as well 1200 if (*scursor == ';') { 1201 scursor++; 1202 continue; 1203 } 1204 1205 // parse one entry 1206 Address maybeAddress; 1207 if (!parseAddress(scursor, send, maybeAddress, isCRLF)) { 1208 return false; 1209 } 1210 result.append(maybeAddress); 1211 1212 eatCFWS(scursor, send, isCRLF); 1213 // end of header: this is OK. 1214 if (scursor == send) { 1215 return true; 1216 } 1217 // comma separating entries: eat it. 1218 if (*scursor == ',') { 1219 scursor++; 1220 } 1221 } 1222 return true; 1223 } 1224 1225 static bool parseParameter(const char *&scursor, const char *const send, 1226 QPair<QString, QStringOrQPair> &result, bool isCRLF) 1227 { 1228 // parameter = regular-parameter / extended-parameter 1229 // regular-parameter = regular-parameter-name "=" value 1230 // extended-parameter = 1231 // value = token / quoted-string 1232 // 1233 // note that rfc2231 handling is out of the scope of this function. 1234 // Therefore we return the attribute as QByteArray and the value as 1235 // (start,length) tuple if we see that the value is encoded 1236 // (trailing asterisk), for parseParameterList to decode... 1237 1238 eatCFWS(scursor, send, isCRLF); 1239 if (scursor == send) { 1240 return false; 1241 } 1242 1243 // 1244 // parse the parameter name: 1245 // 1246 QByteArray tmpAttr; 1247 if (!parseToken(scursor, send, tmpAttr, ParseTokenNoFlag)) { 1248 return false; 1249 } 1250 // FIXME: we could use QMap<QByteArray, ...> in the API for parameters 1251 QString maybeAttribute = QString::fromLatin1(tmpAttr); 1252 1253 eatCFWS(scursor, send, isCRLF); 1254 // premature end: not OK (haven't seen '=' yet). 1255 if (scursor == send || *scursor != '=') { 1256 return false; 1257 } 1258 scursor++; // eat '=' 1259 1260 eatCFWS(scursor, send, isCRLF); 1261 if (scursor == send) { 1262 // don't choke on attribute=, meaning the value was omitted: 1263 if (maybeAttribute.endsWith(QLatin1Char('*'))) { 1264 KMIME_WARN << "attribute ends with \"*\", but value is empty!" 1265 "Chopping away \"*\"."; 1266 maybeAttribute.chop(1); 1267 } 1268 result = qMakePair(maybeAttribute.toLower(), QStringOrQPair()); 1269 return true; 1270 } 1271 1272 const char *oldscursor = scursor; 1273 1274 // 1275 // parse the parameter value: 1276 // 1277 QStringOrQPair maybeValue; 1278 if (*scursor == '"') { 1279 // value is a quoted-string: 1280 scursor++; 1281 if (maybeAttribute.endsWith(QLatin1Char('*'))) { 1282 // attributes ending with "*" designate extended-parameters, 1283 // which cannot have quoted-strings as values. So we remove the 1284 // trailing "*" to not confuse upper layers. 1285 KMIME_WARN << "attribute ends with \"*\", but value is a quoted-string!" 1286 "Chopping away \"*\"."; 1287 maybeAttribute.chop(1); 1288 } 1289 1290 if (!parseGenericQuotedString(scursor, send, maybeValue.qstring, isCRLF)) { 1291 scursor = oldscursor; 1292 result = qMakePair(maybeAttribute.toLower(), QStringOrQPair()); 1293 return false; // this case needs further processing by upper layers!! 1294 } 1295 } else { 1296 // value is a token: 1297 if (!parseToken(scursor, send, maybeValue.qpair, ParseTokenRelaxedTText)) { 1298 scursor = oldscursor; 1299 result = qMakePair(maybeAttribute.toLower(), QStringOrQPair()); 1300 return false; // this case needs further processing by upper layers!! 1301 } 1302 } 1303 1304 result = qMakePair(maybeAttribute.toLower(), maybeValue); 1305 return true; 1306 } 1307 1308 static bool parseRawParameterList(const char *&scursor, const char *const send, 1309 QMap<QString, QStringOrQPair> &result, 1310 bool isCRLF) 1311 { 1312 // we use parseParameter() consecutively to obtain a map of raw 1313 // attributes to raw values. "Raw" here means that we don't do 1314 // rfc2231 decoding and concatenation. This is left to 1315 // parseParameterList(), which will call this function. 1316 // 1317 // The main reason for making this chunk of code a separate 1318 // (private) method is that we can deal with broken parameters 1319 // _here_ and leave the rfc2231 handling solely to 1320 // parseParameterList(), which will still be enough work. 1321 while (scursor != send) { 1322 eatCFWS(scursor, send, isCRLF); 1323 // empty entry ending the list: OK. 1324 if (scursor == send) { 1325 return true; 1326 } 1327 // empty list entry: ignore. 1328 if (*scursor == ';') { 1329 scursor++; 1330 continue; 1331 } 1332 QPair<QString, QStringOrQPair> maybeParameter; 1333 if (!parseParameter(scursor, send, maybeParameter, isCRLF)) { 1334 // we need to do a bit of work if the attribute is not 1335 // NULL. These are the cases marked with "needs further 1336 // processing" in parseParameter(). Specifically, parsing of the 1337 // token or the quoted-string, which should represent the value, 1338 // failed. We take the easy way out and simply search for the 1339 // next ';' to start parsing again. (Another option would be to 1340 // take the text between '=' and ';' as value) 1341 if (maybeParameter.first.isNull()) { 1342 return false; 1343 } 1344 while (scursor != send) { 1345 if (*scursor++ == ';') { 1346 goto IS_SEMICOLON; 1347 } 1348 } 1349 // scursor == send case: end of list. 1350 return true; 1351 IS_SEMICOLON: 1352 // *scursor == ';' case: parse next entry. 1353 continue; 1354 } 1355 // successful parsing brings us here: 1356 result.insert(maybeParameter.first, maybeParameter.second); 1357 1358 eatCFWS(scursor, send, isCRLF); 1359 // end of header: ends list. 1360 if (scursor == send) { 1361 return true; 1362 } 1363 // regular separator: eat it. 1364 if (*scursor == ';') { 1365 scursor++; 1366 } 1367 } 1368 return true; 1369 } 1370 1371 static void decodeRFC2231Value(KCodecs::Codec *&rfc2231Codec, 1372 QStringDecoder &textcodec, 1373 bool isContinuation, QString &value, 1374 QPair<const char *, int> &source, QByteArray &charset) 1375 { 1376 // 1377 // parse the raw value into (charset,language,text): 1378 // 1379 1380 const char *decBegin = source.first; 1381 const char *decCursor = decBegin; 1382 const char *decEnd = decCursor + source.second; 1383 1384 if (!isContinuation) { 1385 // find the first single quote 1386 while (decCursor != decEnd) { 1387 if (*decCursor == '\'') { 1388 break; 1389 } else { 1390 decCursor++; 1391 } 1392 } 1393 1394 if (decCursor == decEnd) { 1395 // there wasn't a single single quote at all! 1396 // take the whole value to be in latin-1: 1397 KMIME_WARN << "No charset in extended-initial-value." 1398 "Assuming \"iso-8859-1\"."; 1399 value += QString::fromLatin1(decBegin, source.second); 1400 return; 1401 } 1402 1403 charset = QByteArray(decBegin, decCursor - decBegin); 1404 1405 const char *oldDecCursor = ++decCursor; 1406 // find the second single quote (we ignore the language tag): 1407 while (decCursor != decEnd) { 1408 if (*decCursor == '\'') { 1409 break; 1410 } else { 1411 decCursor++; 1412 } 1413 } 1414 if (decCursor == decEnd) { 1415 KMIME_WARN << "No language in extended-initial-value." 1416 "Trying to recover."; 1417 decCursor = oldDecCursor; 1418 } else { 1419 decCursor++; 1420 } 1421 1422 // decCursor now points to the start of the 1423 // "extended-other-values": 1424 1425 // 1426 // get the decoders: 1427 // 1428 1429 textcodec = QStringDecoder(charset.constData()); 1430 if (!textcodec.isValid()) { 1431 KMIME_WARN_UNKNOWN(Charset, charset); 1432 } 1433 } 1434 1435 if (!rfc2231Codec) { 1436 rfc2231Codec = KCodecs::Codec::codecForName("x-kmime-rfc2231"); 1437 assert(rfc2231Codec); 1438 } 1439 1440 if (!textcodec.isValid()) { 1441 value += QString::fromLatin1(decCursor, decEnd - decCursor); 1442 return; 1443 } 1444 1445 KCodecs::Decoder *dec = rfc2231Codec->makeDecoder(); 1446 assert(dec); 1447 1448 // 1449 // do the decoding: 1450 // 1451 1452 QByteArray buffer; 1453 buffer.resize(rfc2231Codec->maxDecodedSizeFor(decEnd - decCursor)); 1454 QByteArray::Iterator bit = buffer.begin(); 1455 QByteArray::ConstIterator bend = buffer.end(); 1456 1457 if (!dec->decode(decCursor, decEnd, bit, bend)) { 1458 KMIME_WARN << rfc2231Codec->name() 1459 << "codec lies about its maxDecodedSizeFor()" 1460 << Qt::endl 1461 << "result may be truncated"; 1462 } 1463 1464 value += textcodec.decode(QByteArrayView(buffer.begin(), bit - buffer.begin())); 1465 1466 // qCDebug(KMIME_LOG) << "value now: \"" << value << "\""; 1467 // cleanup: 1468 delete dec; 1469 } 1470 1471 // known issues: 1472 // - permutes rfc2231 continuations when the total number of parts 1473 // exceeds 10 (other-sections then becomes *xy, ie. two digits) 1474 1475 bool parseParameterListWithCharset(const char *&scursor, 1476 const char *const send, 1477 QMap<QString, QString> &result, 1478 QByteArray &charset, bool isCRLF) 1479 { 1480 // parse the list into raw attribute-value pairs: 1481 QMap<QString, QStringOrQPair> rawParameterList; 1482 if (!parseRawParameterList(scursor, send, rawParameterList, isCRLF)) { 1483 return false; 1484 } 1485 1486 if (rawParameterList.isEmpty()) { 1487 return true; 1488 } 1489 1490 // decode rfc 2231 continuations and alternate charset encoding: 1491 1492 // NOTE: this code assumes that what QMapIterator delivers is sorted 1493 // by the key! 1494 1495 KCodecs::Codec *rfc2231Codec = nullptr; 1496 QStringDecoder textcodec; 1497 QString attribute; 1498 QString value; 1499 enum Mode { 1500 NoMode = 0x0, Continued = 0x1, Encoded = 0x2 1501 }; 1502 1503 enum EncodingMode { 1504 NoEncoding, 1505 RFC2047, 1506 RFC2231 1507 }; 1508 1509 QMap<QString, QStringOrQPair>::Iterator it; 1510 QMap<QString, QStringOrQPair>::Iterator end = rawParameterList.end(); 1511 1512 for (it = rawParameterList.begin() ; it != end ; ++it) { 1513 if (attribute.isNull() || !it.key().startsWith(attribute)) { 1514 // 1515 // new attribute: 1516 // 1517 1518 // store the last attribute/value pair in the result map now: 1519 if (!attribute.isNull()) { 1520 result.insert(attribute, value); 1521 } 1522 // and extract the information from the new raw attribute: 1523 value.clear(); 1524 attribute = it.key(); 1525 int mode = NoMode; 1526 EncodingMode encodingMode = NoEncoding; 1527 1528 // is the value rfc2331-encoded? 1529 if (attribute.endsWith(QLatin1Char('*'))) { 1530 attribute.chop(1); 1531 mode |= Encoded; 1532 encodingMode = RFC2231; 1533 } 1534 // is the value rfc2047-encoded? 1535 if (!(*it).qstring.isNull() && 1536 (*it).qstring.contains(QLatin1StringView("=?"))) { 1537 mode |= Encoded; 1538 encodingMode = RFC2047; 1539 } 1540 // is the value continued? 1541 if (attribute.endsWith(QLatin1StringView("*0"))) { 1542 attribute.chop(2); 1543 mode |= Continued; 1544 } 1545 // 1546 // decode if necessary: 1547 // 1548 if (mode & Encoded) { 1549 if (encodingMode == RFC2231) { 1550 decodeRFC2231Value(rfc2231Codec, textcodec, 1551 false, /* isn't continuation */ 1552 value, (*it).qpair, charset); 1553 } else if (encodingMode == RFC2047) { 1554 value += KCodecs::decodeRFC2047String((*it).qstring.toLatin1(), &charset); 1555 } 1556 } else { 1557 // not encoded. 1558 if ((*it).qpair.first) { 1559 value += QString::fromLatin1((*it).qpair.first, (*it).qpair.second); 1560 } else { 1561 value += (*it).qstring; 1562 } 1563 } 1564 1565 // 1566 // shortcut-processing when the value isn't encoded: 1567 // 1568 1569 if (!(mode & Continued)) { 1570 // save result already: 1571 result.insert(attribute, value); 1572 // force begin of a new attribute: 1573 attribute.clear(); 1574 } 1575 } else { // it.key().startsWith( attribute ) 1576 // 1577 // continuation 1578 // 1579 1580 // ignore the section and trust QMap to have sorted the keys: 1581 if (it.key().endsWith(QLatin1Char('*'))) { 1582 // encoded 1583 decodeRFC2231Value(rfc2231Codec, textcodec, 1584 true, /* is continuation */ 1585 value, (*it).qpair, charset); 1586 } else { 1587 // not encoded 1588 if ((*it).qpair.first) { 1589 value += QString::fromLatin1((*it).qpair.first, (*it).qpair.second); 1590 } else { 1591 value += (*it).qstring; 1592 } 1593 } 1594 } 1595 } 1596 // write last attr/value pair: 1597 if (!attribute.isNull()) { 1598 result.insert(attribute, value); 1599 } 1600 1601 return true; 1602 } 1603 1604 bool parseParameterList(const char *&scursor, const char *const send, 1605 QMap<QString, QString> &result, bool isCRLF) 1606 { 1607 QByteArray charset; 1608 return parseParameterListWithCharset(scursor, send, result, charset, isCRLF); 1609 } 1610 1611 static const char stdDayNames[][4] = { 1612 "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" 1613 }; 1614 static const int stdDayNamesLen = sizeof stdDayNames / sizeof *stdDayNames; 1615 1616 static bool parseDayName(const char *&scursor, const char *const send) 1617 { 1618 // check bounds: 1619 if (send - scursor < 3) { 1620 return false; 1621 } 1622 1623 for (int i = 0 ; i < stdDayNamesLen ; ++i) { 1624 if (qstrnicmp(scursor, stdDayNames[i], 3) == 0) { 1625 scursor += 3; 1626 // qCDebug(KMIME_LOG) << "found" << stdDayNames[i]; 1627 return true; 1628 } 1629 } 1630 1631 return false; 1632 } 1633 1634 static const char stdMonthNames[][4] = { 1635 "Jan", "Feb", "Mar", "Apr", "May", "Jun", 1636 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" 1637 }; 1638 static const int stdMonthNamesLen = 1639 sizeof stdMonthNames / sizeof *stdMonthNames; 1640 1641 static bool parseMonthName(const char *&scursor, const char *const send, 1642 int &result) 1643 { 1644 // check bounds: 1645 if (send - scursor < 3) { 1646 return false; 1647 } 1648 1649 for (result = 0 ; result < stdMonthNamesLen ; ++result) { 1650 if (qstrnicmp(scursor, stdMonthNames[result], 3) == 0) { 1651 scursor += 3; 1652 return true; 1653 } 1654 } 1655 1656 // not found: 1657 return false; 1658 } 1659 1660 static const struct { 1661 const char tzName[5]; 1662 long int secsEastOfGMT; 1663 } timeZones[] = { 1664 // rfc 822 timezones: 1665 { "GMT", 0 }, 1666 { "UT", 0 }, 1667 { "EDT", -4 * 3600 }, 1668 { "EST", -5 * 3600 }, 1669 { "MST", -5 * 3600 }, 1670 { "CST", -6 * 3600 }, 1671 { "MDT", -6 * 3600 }, 1672 { "MST", -7 * 3600 }, 1673 { "PDT", -7 * 3600 }, 1674 { "PST", -8 * 3600 }, 1675 // common, non-rfc-822 zones: 1676 { "CET", 1 * 3600 }, 1677 { "MET", 1 * 3600 }, 1678 { "UTC", 0 }, 1679 { "CEST", 2 * 3600 }, 1680 { "BST", 1 * 3600 }, 1681 // rfc 822 military timezones: 1682 { "Z", 0 }, 1683 { "A", -1 * 3600 }, 1684 { "B", -2 * 3600 }, 1685 { "C", -3 * 3600 }, 1686 { "D", -4 * 3600 }, 1687 { "E", -5 * 3600 }, 1688 { "F", -6 * 3600 }, 1689 { "G", -7 * 3600 }, 1690 { "H", -8 * 3600 }, 1691 { "I", -9 * 3600 }, 1692 // J is not used! 1693 { "K", -10 * 3600 }, 1694 { "L", -11 * 3600 }, 1695 { "M", -12 * 3600 }, 1696 { "N", 1 * 3600 }, 1697 { "O", 2 * 3600 }, 1698 { "P", 3 * 3600 }, 1699 { "Q", 4 * 3600 }, 1700 { "R", 5 * 3600 }, 1701 { "S", 6 * 3600 }, 1702 { "T", 7 * 3600 }, 1703 { "U", 8 * 3600 }, 1704 { "V", 9 * 3600 }, 1705 { "W", 10 * 3600 }, 1706 { "X", 11 * 3600 }, 1707 { "Y", 12 * 3600 }, 1708 }; 1709 static const int timeZonesLen = sizeof timeZones / sizeof *timeZones; 1710 1711 static bool parseAlphaNumericTimeZone(const char *&scursor, 1712 const char *const send, 1713 long int &secsEastOfGMT, 1714 bool &timeZoneKnown) 1715 { 1716 // allow the timezone to be wrapped in quotes; bug 260761 1717 if (scursor < send && *scursor == '"') { 1718 scursor++; 1719 1720 if (scursor == send) { 1721 return false; 1722 } 1723 } 1724 1725 QPair<const char *, int> maybeTimeZone(nullptr, 0); 1726 if (!parseToken(scursor, send, maybeTimeZone, ParseTokenNoFlag)) { 1727 return false; 1728 } 1729 for (int i = 0 ; i < timeZonesLen ; ++i) { 1730 if (qstrnicmp(timeZones[i].tzName, 1731 maybeTimeZone.first, maybeTimeZone.second) == 0) { 1732 scursor += maybeTimeZone.second; 1733 secsEastOfGMT = timeZones[i].secsEastOfGMT; 1734 timeZoneKnown = true; 1735 1736 if (scursor < send && *scursor == '"') { 1737 scursor++; 1738 } 1739 1740 return true; 1741 } 1742 } 1743 1744 // don't choke just because we don't happen to know the time zone 1745 KMIME_WARN_UNKNOWN(time zone, 1746 QByteArray(maybeTimeZone.first, maybeTimeZone.second)); 1747 secsEastOfGMT = 0; 1748 timeZoneKnown = false; 1749 return true; 1750 } 1751 1752 // parse a number and return the number of digits parsed: 1753 int parseDigits(const char *&scursor, const char *const send, int &result) 1754 { 1755 result = 0; 1756 int digits = 0; 1757 for (; scursor != send && isdigit(*scursor) ; scursor++, digits++) { 1758 result *= 10; 1759 result += int(*scursor - '0'); 1760 } 1761 return digits; 1762 } 1763 1764 static bool parseTimeOfDay(const char *&scursor, const char *const send, 1765 int &hour, int &min, int &sec, bool isCRLF = false) 1766 { 1767 // time-of-day := 2DIGIT [CFWS] ":" [CFWS] 2DIGIT [ [CFWS] ":" 2DIGIT ] 1768 1769 // 1770 // 2DIGIT representing "hour": 1771 // 1772 if (!parseDigits(scursor, send, hour)) { 1773 return false; 1774 } 1775 1776 eatCFWS(scursor, send, isCRLF); 1777 if (scursor == send || *scursor != ':') { 1778 return false; 1779 } 1780 scursor++; // eat ':' 1781 1782 eatCFWS(scursor, send, isCRLF); 1783 if (scursor == send) { 1784 return false; 1785 } 1786 1787 // 1788 // 2DIGIT representing "minute": 1789 // 1790 if (!parseDigits(scursor, send, min)) { 1791 return false; 1792 } 1793 1794 eatCFWS(scursor, send, isCRLF); 1795 if (scursor == send) { 1796 return true; // seconds are optional 1797 } 1798 1799 // 1800 // let's see if we have a 2DIGIT representing "second": 1801 // 1802 if (*scursor == ':') { 1803 // yepp, there are seconds: 1804 scursor++; // eat ':' 1805 eatCFWS(scursor, send, isCRLF); 1806 if (scursor == send) { 1807 return false; 1808 } 1809 1810 if (!parseDigits(scursor, send, sec)) { 1811 return false; 1812 } 1813 } else { 1814 sec = 0; 1815 } 1816 1817 return true; 1818 } 1819 1820 bool parseTime(const char *&scursor, const char *send, 1821 int &hour, int &min, int &sec, long int &secsEastOfGMT, 1822 bool &timeZoneKnown, bool isCRLF) 1823 { 1824 // time := time-of-day CFWS ( zone / obs-zone ) 1825 // 1826 // obs-zone := "UT" / "GMT" / 1827 // "EST" / "EDT" / ; -0500 / -0400 1828 // "CST" / "CDT" / ; -0600 / -0500 1829 // "MST" / "MDT" / ; -0700 / -0600 1830 // "PST" / "PDT" / ; -0800 / -0700 1831 // "A"-"I" / "a"-"i" / 1832 // "K"-"Z" / "k"-"z" 1833 1834 eatCFWS(scursor, send, isCRLF); 1835 if (scursor == send) { 1836 return false; 1837 } 1838 1839 if (!parseTimeOfDay(scursor, send, hour, min, sec, isCRLF)) { 1840 return false; 1841 } 1842 1843 eatCFWS(scursor, send, isCRLF); 1844 // there might be no timezone but a year following 1845 if ((scursor == send) || isdigit(*scursor)) { 1846 timeZoneKnown = false; 1847 secsEastOfGMT = 0; 1848 return true; // allow missing timezone 1849 } 1850 1851 timeZoneKnown = true; 1852 if (*scursor == '+' || *scursor == '-') { 1853 // remember and eat '-'/'+': 1854 const char sign = *scursor++; 1855 // numerical timezone: 1856 int maybeTimeZone; 1857 const int tzDigits = parseDigits(scursor, send, maybeTimeZone); 1858 if (tzDigits != 4) { 1859 // Allow timezones in 02:00 format 1860 if (tzDigits == 2 && scursor != send && *scursor == ':') { 1861 scursor++; 1862 int maybeTimeZone2; 1863 if (parseDigits(scursor, send, maybeTimeZone2) != 2) { 1864 return false; 1865 } 1866 maybeTimeZone = maybeTimeZone * 100 + maybeTimeZone2; 1867 } else { 1868 return false; 1869 } 1870 } 1871 secsEastOfGMT = 60 * (maybeTimeZone / 100 * 60 + maybeTimeZone % 100); 1872 if (sign == '-') { 1873 secsEastOfGMT *= -1; 1874 if (secsEastOfGMT == 0) { 1875 timeZoneKnown = false; // -0000 means indetermined tz 1876 } 1877 } 1878 } else { 1879 // maybe alphanumeric timezone: 1880 if (!parseAlphaNumericTimeZone(scursor, send, secsEastOfGMT, timeZoneKnown)) { 1881 return false; 1882 } 1883 } 1884 return true; 1885 } 1886 1887 bool parseQDateTime(const char *&scursor, const char *const send, 1888 QDateTime &result, bool isCRLF) 1889 { 1890 eatCFWS(scursor, send, isCRLF); 1891 if (scursor == send) { 1892 return false; 1893 } 1894 // In qt6 yy == 1900 ! => for sure we use 2000 here. 1895 result = QDateTime::fromString(QString::fromLatin1(scursor, 17), QStringLiteral("dd/MM/yy HH:mm:ss")); 1896 QDate resultDate = result.date(); 1897 resultDate.setDate(resultDate.year() + 100, resultDate.month(), resultDate.day()); 1898 result.setDate(resultDate); 1899 return result.isValid(); 1900 } 1901 1902 bool parseDateTime(const char *&scursor, const char *const send, 1903 QDateTime &result, bool isCRLF) 1904 { 1905 // Parsing date-time; strict mode: 1906 // 1907 // date-time := [ [CFWS] day-name [CFWS] "," ] ; wday 1908 // (expanded) [CFWS] 1*2DIGIT CFWS month-name CFWS 2*DIGIT [CFWS] ; date 1909 // time 1910 // 1911 // day-name := "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun" 1912 // month-name := "Jan" / "Feb" / "Mar" / "Apr" / "May" / "Jun" / 1913 // "Jul" / "Aug" / "Sep" / "Oct" / "Nov" / "Dec" 1914 1915 result = QDateTime(); 1916 1917 eatCFWS(scursor, send, isCRLF); 1918 if (scursor == send) { 1919 return false; 1920 } 1921 1922 // 1923 // let's see if there's a day-of-week: 1924 // 1925 if (parseDayName(scursor, send)) { 1926 eatCFWS(scursor, send, isCRLF); 1927 if (scursor == send) { 1928 return false; 1929 } 1930 // day-name should be followed by ',' but we treat it as optional: 1931 if (*scursor == ',') { 1932 scursor++; // eat ',' 1933 eatCFWS(scursor, send, isCRLF); 1934 } 1935 } 1936 1937 int maybeMonth = -1; 1938 bool asctimeFormat = false; 1939 1940 // ANSI-C asctime() format is: Wed Jun 30 21:49:08 1993 1941 if (!isdigit(*scursor) && parseMonthName(scursor, send, maybeMonth)) { 1942 asctimeFormat = true; 1943 eatCFWS(scursor, send, isCRLF); 1944 } 1945 1946 // 1947 // 1*2DIGIT representing "day" (of month): 1948 // 1949 int maybeDay; 1950 if (!parseDigits(scursor, send, maybeDay)) { 1951 return false; 1952 } 1953 1954 eatCFWS(scursor, send, isCRLF); 1955 if (scursor == send) { 1956 return false; 1957 } 1958 1959 // ignore ","; bug 54098 1960 if (*scursor == ',') { 1961 scursor++; 1962 } 1963 1964 // 1965 // month-name: 1966 // 1967 if (!asctimeFormat && !parseMonthName(scursor, send, maybeMonth)) { 1968 return false; 1969 } 1970 if (scursor == send) { 1971 return false; 1972 } 1973 assert(maybeMonth >= 0); assert(maybeMonth <= 11); 1974 ++maybeMonth; // 0-11 -> 1-12 1975 1976 eatCFWS(scursor, send, isCRLF); 1977 if (scursor == send) { 1978 return false; 1979 } 1980 1981 // check for "year HH:MM:SS" or only "HH:MM:SS" (or "H:MM:SS") 1982 bool timeAfterYear = true; 1983 if ((send - scursor > 3) && ((scursor[1] == ':') || (scursor[2] == ':'))) { 1984 timeAfterYear = false; // first read time, then year 1985 } 1986 1987 // 1988 // 2*DIGIT representing "year": 1989 // 1990 int maybeYear = 0; 1991 1992 if (timeAfterYear && !parseDigits(scursor, send, maybeYear)) { 1993 return false; 1994 } 1995 1996 eatCFWS(scursor, send, isCRLF); 1997 int maybeHour; 1998 int maybeMinute; 1999 int maybeSecond; 2000 long int secsEastOfGMT = 0; 2001 QDate maybeDate; 2002 QTime maybeTime; 2003 if (scursor != send) { 2004 // 2005 // time 2006 // 2007 bool timeZoneKnown = true; 2008 2009 if (!parseTime(scursor, send, 2010 maybeHour, maybeMinute, maybeSecond, 2011 secsEastOfGMT, timeZoneKnown, isCRLF)) { 2012 return false; 2013 } 2014 2015 // in asctime() the year follows the time 2016 if (!timeAfterYear) { 2017 eatCFWS(scursor, send, isCRLF); 2018 if (scursor == send) { 2019 return false; 2020 } 2021 2022 if (!parseDigits(scursor, send, maybeYear)) { 2023 return false; 2024 } 2025 } 2026 2027 // RFC 2822 4.3 processing: 2028 if (maybeYear < 50) { 2029 maybeYear += 2000; 2030 } else if (maybeYear < 1000) { 2031 maybeYear += 1900; 2032 } 2033 // else keep as is 2034 if (maybeYear < 1900) { 2035 return false; // rfc2822, 3.3 2036 } 2037 2038 maybeDate = QDate(maybeYear, maybeMonth, maybeDay); 2039 maybeTime = QTime(maybeHour, maybeMinute, maybeSecond); 2040 2041 if (!maybeDate.isValid() || !maybeTime.isValid()) { 2042 return false; 2043 } 2044 } else { 2045 maybeDate = QDate(maybeYear, maybeMonth, maybeDay); 2046 maybeTime = QTime(0, 0, 0); 2047 } 2048 2049 result = QDateTime(maybeDate, maybeTime, QTimeZone::fromSecondsAheadOfUtc(secsEastOfGMT)); 2050 if (!result.isValid()) { 2051 return false; 2052 } 2053 return true; 2054 } 2055 2056 namespace { 2057 2058 Headers::Base *extractHeader(QByteArrayView head, const int headerStart, int &endOfFieldBody) 2059 { 2060 Headers::Base *header = {}; 2061 2062 int startOfFieldBody = head.indexOf(':', headerStart); 2063 if (startOfFieldBody < 0) { 2064 return nullptr; 2065 } 2066 2067 const char *rawType = head.constData() + headerStart; 2068 const size_t rawTypeLen = startOfFieldBody - headerStart; 2069 2070 startOfFieldBody++; //skip the ':' 2071 if (startOfFieldBody < head.size() - 1 && head[startOfFieldBody] == ' ') { // skip the space after the ':', if there's any 2072 startOfFieldBody++; 2073 } 2074 2075 bool folded = false; 2076 endOfFieldBody = findHeaderLineEnd(head, startOfFieldBody, &folded); 2077 2078 // We might get an invalid mail without a field name, don't crash on that. 2079 if (rawTypeLen > 0) { 2080 header = HeaderFactory::createHeader(rawType, rawTypeLen); 2081 } 2082 if (!header) { 2083 //qCWarning(KMIME_LOG)() << "Returning Generic header of type" << rawType; 2084 header = new Headers::Generic(rawType, rawTypeLen); 2085 } 2086 if (folded) { 2087 const auto unfoldedBody = unfoldHeader(head.constData() + startOfFieldBody, endOfFieldBody - startOfFieldBody); 2088 header->from7BitString(unfoldedBody); 2089 } else { 2090 header->from7BitString(head.constData() + startOfFieldBody, endOfFieldBody - startOfFieldBody); 2091 } 2092 2093 return header; 2094 } 2095 2096 } 2097 2098 std::unique_ptr<KMime::Headers::Base> parseNextHeader(QByteArrayView &head) 2099 { 2100 int endOfFieldBody = 0; 2101 std::unique_ptr<KMime::Headers::Base> header(extractHeader(head, 0, endOfFieldBody)); 2102 if (header) { 2103 head = head.mid(endOfFieldBody + 1); 2104 } else { 2105 head = {}; 2106 } 2107 2108 return header; 2109 } 2110 2111 void extractHeaderAndBody(const QByteArray &content, QByteArray &header, QByteArray &body) 2112 { 2113 header.clear(); 2114 body.clear(); 2115 2116 // empty header 2117 if (content.startsWith('\n')) { 2118 body = content.right(content.length() - 1); 2119 return; 2120 } 2121 2122 int pos = content.indexOf("\n\n", 0); 2123 if (pos > -1) { 2124 header = content.left(++pos); //header *must* end with "\n" !! 2125 body = content.mid(pos + 1); 2126 if (body.startsWith("\n")) { 2127 body = "\n" + body; 2128 } 2129 } else { 2130 header = content; 2131 } 2132 } 2133 2134 QList<Headers::Base *> parseHeaders(const QByteArray &head) { 2135 QList<Headers::Base *> ret; 2136 2137 int cursor = 0; 2138 while (cursor < head.size()) { 2139 const int headerStart = cursor; 2140 int endOfFieldBody; 2141 if (auto header = extractHeader(head, headerStart, endOfFieldBody)) { 2142 ret << header; 2143 cursor = endOfFieldBody + 1; 2144 } else { 2145 break; 2146 } 2147 } 2148 2149 return ret; 2150 } 2151 2152 } // namespace HeaderParsing 2153 2154 } // namespace KMime