File indexing completed on 2024-05-12 05:28:19
0001 // This file is part of KMail, the KDE mail client. 0002 // SPDX-FileCopyrightText: 2003 Marc Mutz <mutz@kde.org> 0003 // SPDX-FileCopyrightText: 2002-2004 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.net 0004 // SPDX-FileCopyrightText: 2009 Andras Mantia <andras@kdab.net> 0005 // SPDX-FileCopyrightText: 2015 Sandro Knauß <sknauss@kde.org> 0006 // SPDX-FileCopyrightText: 2017 Christian Mollekopf <mollekopf@kolabsystems.com> 0007 // SPDX-License-Identifier: GPL-2.0-or-later 0008 0009 #include "objecttreeparser.h" 0010 0011 #include "bodypartformatterbasefactory.h" 0012 #include "messagepart.h" 0013 0014 #include "mimetreeparser_debug.h" 0015 0016 #include "bodypartformatter.h" 0017 #include "utils.h" 0018 0019 #include <KMime/Message> 0020 0021 #include <KCharsets> 0022 #include <QByteArray> 0023 #include <QMimeDatabase> 0024 #include <QRegularExpression> 0025 #include <QTextCodec> 0026 #include <QTextStream> 0027 #include <QUrl> 0028 0029 using namespace MimeTreeParser; 0030 0031 /* 0032 * Collect message parts bottom up. 0033 * Filter to avoid evaluating a subtree. 0034 * Select parts to include it in the result set. Selecting a part in a branch will keep any parent parts from being selected. 0035 */ 0036 static QVector<MessagePart::Ptr> 0037 collect(MessagePart::Ptr start, const std::function<bool(const MessagePartPtr &)> &evaluateSubtree, const std::function<bool(const MessagePartPtr &)> &select) 0038 { 0039 MessagePartPtr ptr = start.dynamicCast<MessagePart>(); 0040 Q_ASSERT(ptr); 0041 QVector<MessagePart::Ptr> list; 0042 if (evaluateSubtree(ptr)) { 0043 for (const auto &p : ptr->subParts()) { 0044 list << ::collect(p, evaluateSubtree, select); 0045 } 0046 } 0047 0048 // Don't consider this part if we already selected a subpart 0049 if (list.isEmpty()) { 0050 if (select(ptr)) { 0051 list << start; 0052 } 0053 } 0054 return list; 0055 } 0056 0057 QString ObjectTreeParser::plainTextContent() 0058 { 0059 QString content; 0060 if (mParsedPart) { 0061 auto plainParts = ::collect( 0062 mParsedPart, 0063 [](const MessagePartPtr &) { 0064 return true; 0065 }, 0066 [](const MessagePartPtr &part) { 0067 if (part->isAttachment()) { 0068 return false; 0069 } 0070 if (dynamic_cast<MimeTreeParser::TextMessagePart *>(part.data())) { 0071 return true; 0072 } 0073 if (dynamic_cast<MimeTreeParser::AlternativeMessagePart *>(part.data())) { 0074 return true; 0075 } 0076 return false; 0077 }); 0078 for (const auto &part : plainParts) { 0079 content += part->text(); 0080 } 0081 } 0082 return content; 0083 } 0084 0085 QString ObjectTreeParser::htmlContent() 0086 { 0087 QString content; 0088 if (mParsedPart) { 0089 QVector<MessagePart::Ptr> contentParts = ::collect( 0090 mParsedPart, 0091 [](const MessagePartPtr &) { 0092 return true; 0093 }, 0094 [](const MessagePartPtr &part) { 0095 if (dynamic_cast<MimeTreeParser::HtmlMessagePart *>(part.data())) { 0096 return true; 0097 } 0098 if (dynamic_cast<MimeTreeParser::AlternativeMessagePart *>(part.data())) { 0099 return true; 0100 } 0101 return false; 0102 }); 0103 for (const auto &part : contentParts) { 0104 if (auto p = dynamic_cast<MimeTreeParser::AlternativeMessagePart *>(part.data())) { 0105 content += p->htmlContent(); 0106 } else { 0107 content += part->text(); 0108 } 0109 } 0110 } 0111 return content; 0112 } 0113 0114 static void print(QTextStream &stream, KMime::Content *node, const QString prefix = {}) 0115 { 0116 QByteArray mediaType("text"); 0117 QByteArray subType("plain"); 0118 if (node->contentType(false) && !node->contentType()->mediaType().isEmpty() && !node->contentType()->subType().isEmpty()) { 0119 mediaType = node->contentType()->mediaType(); 0120 subType = node->contentType()->subType(); 0121 } 0122 stream << prefix << "! " << mediaType << subType << " isAttachment: " << KMime::isAttachment(node) << "\n"; 0123 for (const auto nodeContent : node->contents()) { 0124 print(stream, nodeContent, prefix + QLatin1String(" ")); 0125 } 0126 } 0127 0128 static void print(QTextStream &stream, const MessagePart &messagePart, const QByteArray pre = {}) 0129 { 0130 stream << pre << "# " << messagePart.metaObject()->className() << " isAttachment: " << messagePart.isAttachment() << "\n"; 0131 for (const auto &subPart : messagePart.subParts()) { 0132 print(stream, *subPart, pre + " "); 0133 } 0134 } 0135 0136 QString ObjectTreeParser::structureAsString() const 0137 { 0138 QString string; 0139 QTextStream stream{&string}; 0140 0141 if (mTopLevelContent) { 0142 ::print(stream, mTopLevelContent); 0143 } 0144 if (mParsedPart) { 0145 ::print(stream, *mParsedPart); 0146 } 0147 return string; 0148 } 0149 0150 void ObjectTreeParser::print() 0151 { 0152 qInfo().noquote() << structureAsString(); 0153 } 0154 0155 static KMime::Content *find(KMime::Content *node, const std::function<bool(KMime::Content *)> &select) 0156 { 0157 QByteArray mediaType("text"); 0158 QByteArray subType("plain"); 0159 if (node->contentType(false) && !node->contentType()->mediaType().isEmpty() && !node->contentType()->subType().isEmpty()) { 0160 mediaType = node->contentType()->mediaType(); 0161 subType = node->contentType()->subType(); 0162 } 0163 if (select(node)) { 0164 return node; 0165 } 0166 for (const auto nodeContent : node->contents()) { 0167 if (const auto content = find(nodeContent, select)) { 0168 return content; 0169 } 0170 } 0171 return nullptr; 0172 } 0173 0174 KMime::Content *ObjectTreeParser::find(const std::function<bool(KMime::Content *)> &select) 0175 { 0176 return ::find(mTopLevelContent, select); 0177 } 0178 0179 QVector<MessagePartPtr> ObjectTreeParser::collectContentParts() 0180 { 0181 return collectContentParts(mParsedPart); 0182 } 0183 0184 QVector<MessagePart::Ptr> ObjectTreeParser::collectContentParts(MessagePart::Ptr start) 0185 { 0186 return ::collect( 0187 start, 0188 [start](const MessagePartPtr &part) { 0189 // Ignore the top-level 0190 if (start.data() == part.data()) { 0191 return true; 0192 } 0193 if (auto encapsulatedPart = part.dynamicCast<MimeTreeParser::EncapsulatedRfc822MessagePart>()) { 0194 return false; 0195 } 0196 return true; 0197 }, 0198 [start](const MessagePartPtr &part) { 0199 if (const auto attachment = dynamic_cast<MimeTreeParser::AttachmentMessagePart *>(part.data())) { 0200 return attachment->mimeType() == "text/calendar"; 0201 } else if (const auto text = dynamic_cast<MimeTreeParser::TextMessagePart *>(part.data())) { 0202 auto enc = dynamic_cast<MimeTreeParser::EncryptedMessagePart *>(text->parentPart()); 0203 if (enc && enc->error()) { 0204 return false; 0205 } 0206 return true; 0207 } else if (dynamic_cast<MimeTreeParser::AlternativeMessagePart *>(part.data())) { 0208 return true; 0209 } else if (dynamic_cast<MimeTreeParser::HtmlMessagePart *>(part.data())) { 0210 // Don't if we have an alternative part as parent 0211 return true; 0212 } else if (dynamic_cast<MimeTreeParser::EncapsulatedRfc822MessagePart *>(part.data())) { 0213 if (start.data() == part.data()) { 0214 return false; 0215 } 0216 return true; 0217 } else if (const auto enc = dynamic_cast<MimeTreeParser::EncryptedMessagePart *>(part.data())) { 0218 if (enc->error()) { 0219 return true; 0220 } 0221 // If we have a textpart with encrypted and unencrypted subparts we want to return the textpart 0222 if (dynamic_cast<MimeTreeParser::TextMessagePart *>(enc->parentPart())) { 0223 return false; 0224 } 0225 } else if (const auto sig = dynamic_cast<MimeTreeParser::SignedMessagePart *>(part.data())) { 0226 // Signatures without subparts already contain the text 0227 return !sig->hasSubParts(); 0228 } 0229 return false; 0230 }); 0231 } 0232 0233 QVector<MessagePart::Ptr> ObjectTreeParser::collectAttachmentParts() 0234 { 0235 QVector<MessagePart::Ptr> contentParts = ::collect( 0236 mParsedPart, 0237 [](const MessagePartPtr &) { 0238 return true; 0239 }, 0240 [](const MessagePartPtr &part) { 0241 return part->isAttachment(); 0242 }); 0243 return contentParts; 0244 } 0245 0246 void ObjectTreeParser::decryptParts() 0247 { 0248 decryptAndVerify(); 0249 } 0250 0251 /* 0252 * This naive implementation assumes that there is an encrypted part wrapping a signature. 0253 * For other cases we would have to process both recursively (I think?) 0254 */ 0255 void ObjectTreeParser::decryptAndVerify() 0256 { 0257 // We first decrypt 0258 ::collect( 0259 mParsedPart, 0260 [](const MessagePartPtr &) { 0261 return true; 0262 }, 0263 [](const MessagePartPtr &part) { 0264 if (const auto enc = dynamic_cast<MimeTreeParser::EncryptedMessagePart *>(part.data())) { 0265 enc->startDecryption(); 0266 } 0267 return false; 0268 }); 0269 // And then verify the available signatures 0270 ::collect( 0271 mParsedPart, 0272 [](const MessagePartPtr &) { 0273 return true; 0274 }, 0275 [](const MessagePartPtr &part) { 0276 if (const auto enc = dynamic_cast<MimeTreeParser::SignedMessagePart *>(part.data())) { 0277 enc->startVerification(); 0278 } 0279 return false; 0280 }); 0281 } 0282 0283 void ObjectTreeParser::importCertificates() 0284 { 0285 QVector<MessagePart::Ptr> contentParts = ::collect( 0286 mParsedPart, 0287 [](const MessagePartPtr &) { 0288 return true; 0289 }, 0290 [](const MessagePartPtr &part) { 0291 if (const auto cert = dynamic_cast<MimeTreeParser::CertMessagePart *>(part.data())) { 0292 cert->import(); 0293 } 0294 return false; 0295 }); 0296 } 0297 0298 QString ObjectTreeParser::resolveCidLinks(const QString &html) 0299 { 0300 auto text = html; 0301 static const auto regex = QRegularExpression(QLatin1String("(src)\\s*=\\s*(\"|')(cid:[^\"']+)\\2")); 0302 auto it = regex.globalMatch(text); 0303 while (it.hasNext()) { 0304 const auto match = it.next(); 0305 const auto link = QUrl(match.captured(3)); 0306 auto cid = link.path(); 0307 auto mailMime = const_cast<KMime::Content *>(find([=](KMime::Content *content) { 0308 if (!content || !content->contentID(false)) { 0309 return false; 0310 } 0311 return QString::fromLatin1(content->contentID(false)->identifier()) == cid; 0312 })); 0313 if (mailMime) { 0314 const auto contentType = mailMime->contentType(false); 0315 if (!contentType) { 0316 qWarning() << "No content type, skipping"; 0317 continue; 0318 } 0319 QMimeDatabase mimeDb; 0320 const auto mimetype = mimeDb.mimeTypeForName(QString::fromLatin1(contentType->mimeType())).name(); 0321 if (mimetype.startsWith(QLatin1String("image/"))) { 0322 // We reencode to base64 below. 0323 const auto data = mailMime->decodedContent(); 0324 if (data.isEmpty()) { 0325 qWarning() << "Attachment is empty."; 0326 continue; 0327 } 0328 text.replace(match.captured(0), QString::fromLatin1("src=\"data:%1;base64,%2\"").arg(mimetype, QString::fromLatin1(data.toBase64()))); 0329 } 0330 } else { 0331 qWarning() << "Failed to find referenced attachment: " << cid; 0332 } 0333 } 0334 return text; 0335 } 0336 0337 //----------------------------------------------------------------------------- 0338 0339 void ObjectTreeParser::parseObjectTree(const QByteArray &mimeMessage) 0340 { 0341 const auto mailData = KMime::CRLFtoLF(mimeMessage); 0342 mMsg = KMime::Message::Ptr(new KMime::Message); 0343 mMsg->setContent(mailData); 0344 mMsg->parse(); 0345 // We avoid using mMsg->contentType()->charset(), because that will just return kmime's defaultCharset(), ISO-8859-1 0346 const auto charset = mMsg->contentType()->parameter(QStringLiteral("charset")).toLatin1(); 0347 if (charset.isEmpty()) { 0348 mMsg->contentType()->setCharset("us-ascii"); 0349 } 0350 parseObjectTree(mMsg.data()); 0351 } 0352 0353 void ObjectTreeParser::parseObjectTree(KMime::Content *node) 0354 { 0355 mTopLevelContent = node; 0356 mParsedPart = parseObjectTreeInternal(node, false); 0357 } 0358 0359 MessagePartPtr ObjectTreeParser::parsedPart() const 0360 { 0361 return mParsedPart; 0362 } 0363 0364 /* 0365 * This will lookup suitable formatters based on the type, 0366 * and let them generate a list of parts. 0367 * If the formatter generated a list of parts, then those are taken, otherwise we move on to the next match. 0368 */ 0369 QVector<MessagePartPtr> ObjectTreeParser::processType(KMime::Content *node, const QByteArray &mediaType, const QByteArray &subType) 0370 { 0371 static MimeTreeParser::BodyPartFormatterBaseFactory factory; 0372 const auto sub = factory.subtypeRegistry(mediaType.constData()); 0373 const auto range = sub.equal_range(subType.constData()); 0374 for (auto it = range.first; it != range.second; ++it) { 0375 const auto formatter = it->second; 0376 if (!formatter) { 0377 continue; 0378 } 0379 const auto list = formatter->processList(this, node); 0380 if (!list.isEmpty()) { 0381 return list; 0382 } 0383 } 0384 return {}; 0385 } 0386 0387 MessagePart::Ptr ObjectTreeParser::parseObjectTreeInternal(KMime::Content *node, bool onlyOneMimePart) 0388 { 0389 if (!node) { 0390 return MessagePart::Ptr(); 0391 } 0392 0393 auto parsedPart = MessagePart::Ptr(new MessagePartList(this, node)); 0394 parsedPart->setIsRoot(node->isTopLevel()); 0395 const auto contents = node->parent() ? node->parent()->contents() : KMime::Content::List{node}; 0396 for (int i = contents.indexOf(node); i < contents.size(); ++i) { 0397 node = contents.at(i); 0398 0399 QByteArray mediaType("text"); 0400 QByteArray subType("plain"); 0401 if (node->contentType(false) && !node->contentType()->mediaType().isEmpty() && !node->contentType()->subType().isEmpty()) { 0402 mediaType = node->contentType()->mediaType(); 0403 subType = node->contentType()->subType(); 0404 } 0405 0406 auto messageParts = [&] { 0407 // Try the specific type handler 0408 { 0409 auto list = processType(node, mediaType, subType); 0410 if (!list.isEmpty()) { 0411 return list; 0412 } 0413 } 0414 // Fallback to the generic handler 0415 { 0416 auto list = processType(node, mediaType, "*"); 0417 if (!list.isEmpty()) { 0418 return list; 0419 } 0420 } 0421 // Fallback to the default handler 0422 return defaultHandling(node); 0423 }(); 0424 0425 for (const auto &part : messageParts) { 0426 parsedPart->appendSubPart(part); 0427 } 0428 0429 if (onlyOneMimePart) { 0430 break; 0431 } 0432 } 0433 0434 return parsedPart; 0435 } 0436 0437 QVector<MessagePart::Ptr> ObjectTreeParser::defaultHandling(KMime::Content *node) 0438 { 0439 if (node->contentType()->mimeType() == QByteArrayLiteral("application/octet-stream") 0440 && (node->contentType()->name().endsWith(QLatin1String("p7m")) || node->contentType()->name().endsWith(QLatin1String("p7s")) 0441 || node->contentType()->name().endsWith(QLatin1String("p7c")))) { 0442 auto list = processType(node, "application", "pkcs7-mime"); 0443 if (!list.isEmpty()) { 0444 return list; 0445 } 0446 } 0447 0448 return {AttachmentMessagePart::Ptr(new AttachmentMessagePart(this, node))}; 0449 } 0450 0451 static QTextCodec *getLocalCodec() 0452 { 0453 auto codec = QTextCodec::codecForLocale(); 0454 0455 // In the case of Japan. Japanese locale name is "eucjp" but 0456 // The Japanese mail systems normally used "iso-2022-jp" of locale name. 0457 // We want to change locale name from eucjp to iso-2022-jp at KMail only. 0458 0459 // (Introduction to i18n, 6.6 Limit of Locale technology): 0460 // EUC-JP is the de-facto standard for UNIX systems, ISO 2022-JP 0461 // is the standard for Internet, and Shift-JIS is the encoding 0462 // for Windows and Macintosh. 0463 if (codec) { 0464 const QByteArray codecNameLower = codec->name().toLower(); 0465 if (codecNameLower == "eucjp" 0466 #if defined Q_OS_WIN || defined Q_OS_MACX 0467 || codecNameLower == "shift-jis" // OK? 0468 #endif 0469 ) { 0470 codec = QTextCodec::codecForName("jis7"); 0471 // QTextCodec *cdc = QTextCodec::codecForName("jis7"); 0472 // QTextCodec::setCodecForLocale(cdc); 0473 // KLocale::global()->setEncoding(cdc->mibEnum()); 0474 } 0475 } 0476 return codec; 0477 } 0478 0479 const QTextCodec *ObjectTreeParser::codecFor(KMime::Content *node) const 0480 { 0481 static auto localCodec = getLocalCodec(); 0482 if (!node) { 0483 return localCodec; 0484 } 0485 0486 QByteArray charset = node->contentType()->charset().toLower(); 0487 0488 // utf-8 is a superset of us-ascii, so we don't lose anything if we use it instead 0489 // utf-8 is used so widely nowadays that it is a good idea to use it to fix issues with broken clients. 0490 if (charset == "us-ascii") { 0491 charset = "utf-8"; 0492 } 0493 if (!charset.isEmpty()) { 0494 if (auto c = KCharsets::charsets()->codecForName(QLatin1String(charset))) { 0495 return c; 0496 }; 0497 } 0498 // no charset means us-ascii (RFC 2045), so using local encoding should 0499 // be okay 0500 return localCodec; 0501 }