File indexing completed on 2024-05-12 05:26:00

0001 /*
0002  *   Copyright (C) 2015 Christian Mollekopf <chrigi_1@fastmail.fm>
0003  *
0004  *   This program is free software; you can redistribute it and/or modify
0005  *   it under the terms of the GNU General Public License as published by
0006  *   the Free Software Foundation; either version 2 of the License, or
0007  *   (at your option) any later version.
0008  *
0009  *   This program is distributed in the hope that it will be useful,
0010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
0011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0012  *   GNU General Public License for more details.
0013  *
0014  *   You should have received a copy of the GNU General Public License
0015  *   along with this program; if not, write to the
0016  *   Free Software Foundation, Inc.,
0017  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
0018  */
0019 
0020 #include "mailpreprocessor.h"
0021 
0022 #include <QFile>
0023 #include <QDir>
0024 #include <QTextDocument>
0025 #include <QGuiApplication>
0026 #include <QUuid>
0027 #include <KMime/KMimeMessage>
0028 #include <KMime/Headers>
0029 #include <mime/mimetreeparser/objecttreeparser.h>
0030 
0031 #include "pipeline.h"
0032 #include "definitions.h"
0033 #include "applicationdomaintype.h"
0034 
0035 using namespace Sink;
0036 
0037 static QString getString(const KMime::Headers::Base *header, const QString &defaultValue = {})
0038 {
0039     if (!header) {
0040         return defaultValue;
0041     }
0042     return header->asUnicodeString() ;
0043 }
0044 
0045 static QDateTime getDate(const KMime::Headers::Base *header)
0046 {
0047     if (!header) {
0048         return QDateTime::currentDateTimeUtc();
0049     }
0050     return static_cast<const KMime::Headers::Date*>(header)->dateTime();
0051 }
0052 
0053 static Sink::ApplicationDomain::Mail::Contact fromMailbox(const KMime::Types::Mailbox &mb)
0054 {
0055     return Sink::ApplicationDomain::Mail::Contact{mb.name(), mb.address()};
0056 }
0057 
0058 static Sink::ApplicationDomain::Mail::Contact getContact(const KMime::Headers::Base *h)
0059 {
0060     if (!h) {
0061         return {};
0062     }
0063     const auto header = static_cast<const KMime::Headers::Generics::MailboxList*>(h);
0064     const auto mb = header->mailboxes().isEmpty() ? KMime::Types::Mailbox{} : header->mailboxes().first();
0065     return fromMailbox(mb);
0066 }
0067 
0068 static QList<Sink::ApplicationDomain::Mail::Contact> getContactList(const KMime::Headers::Base *h)
0069 {
0070     if (!h) {
0071         return {};
0072     }
0073     const auto header = static_cast<const KMime::Headers::Generics::AddressList*>(h);
0074     QList<Sink::ApplicationDomain::Mail::Contact> list;
0075     for (const auto &mb : header->mailboxes()) {
0076         list << fromMailbox(mb);
0077     }
0078     return list;
0079 }
0080 
0081 static QVector<QByteArray> getIdentifiers(const KMime::Headers::Base *h)
0082 {
0083     if (!h) {
0084         return {};
0085     }
0086     return static_cast<const KMime::Headers::Generics::Ident*>(h)->identifiers();
0087 }
0088 
0089 static QByteArray getIdentifier(const KMime::Headers::Base *h)
0090 {
0091     if (!h) {
0092         return {};
0093     }
0094     return static_cast<const KMime::Headers::Generics::SingleIdent*>(h)->identifier();
0095 }
0096 
0097 static QByteArray normalizeMessageId(const QByteArray &id)
0098 {
0099     return id;
0100 }
0101 
0102 static QString toPlain(const QString &html)
0103 {
0104     //QTextDocument has an implicit runtime dependency on QGuiApplication via the color palette.
0105     //If the QGuiApplication is not available we will crash (if the html contains colors).
0106     Q_ASSERT(QGuiApplication::instance());
0107     // Only get HTML content, if no plain text content
0108     QTextDocument doc;
0109     doc.setHtml(html);
0110     return doc.toPlainText();
0111 }
0112 
0113 void MailPropertyExtractor::updatedIndexedProperties(Sink::ApplicationDomain::Mail &mail, const QByteArray &data)
0114 {
0115     if (data.isEmpty()) {
0116         //Always set a dummy subject and date, so we can find the message
0117         //In test we sometimes pre-set the extracted date though, so we check that first.
0118         if (mail.getSubject().isEmpty()) {
0119             mail.setExtractedSubject("Error: Empty message");
0120         }
0121         if (!mail.getDate().isValid()) {
0122             mail.setExtractedDate(QDateTime::currentDateTimeUtc());
0123         }
0124         return;
0125     }
0126     MimeTreeParser::ObjectTreeParser otp;
0127     otp.parseObjectTree(data);
0128     otp.decryptAndVerify();
0129 
0130     const auto partList = otp.collectContentParts();
0131     const auto part = [&] () -> MimeTreeParser::MessagePartPtr {
0132         if (!partList.isEmpty()) {
0133             return partList[0];
0134         }
0135         //Extract headers also if there are only attachment parts.
0136         return  otp.parsedPart();
0137     }();
0138     Q_ASSERT(part);
0139 
0140     mail.setExtractedSubject(getString(part->header(KMime::Headers::Subject::staticType()), "Error: No subject"));
0141     mail.setExtractedSender(getContact(part->header(KMime::Headers::From::staticType())));
0142     mail.setExtractedTo(getContactList(part->header(KMime::Headers::To::staticType())));
0143     mail.setExtractedCc(getContactList(part->header(KMime::Headers::Cc::staticType())));
0144     mail.setExtractedBcc(getContactList(part->header(KMime::Headers::Bcc::staticType())));
0145     mail.setExtractedDate(getDate(part->header(KMime::Headers::Date::staticType())));
0146 
0147     const auto parentMessageIds = [&] {
0148         //The last is the parent
0149         const auto references = getIdentifiers(part->header(KMime::Headers::References::staticType()));
0150 
0151         if (!references.isEmpty()) {
0152             QByteArrayList list;
0153             std::transform(references.constBegin(), references.constEnd(), std::back_inserter(list), [] (const QByteArray &id) { return normalizeMessageId(id); });
0154             return list;
0155         } else {
0156             const auto inReplyTo = getIdentifiers(part->header(KMime::Headers::InReplyTo::staticType()));
0157             if (!inReplyTo.isEmpty()) {
0158                 //According to RFC5256 we should ignore all but the first
0159                 return QByteArrayList{normalizeMessageId(inReplyTo.first())};
0160             }
0161         }
0162         return QByteArrayList{};
0163     }();
0164 
0165     //The rest should never change, unless we didn't have the headers available initially.
0166     auto messageId = normalizeMessageId(getIdentifier(part->header(KMime::Headers::MessageID::staticType())));
0167     if (messageId.isEmpty()) {
0168         //reuse an existing messageid (on modification)
0169         const auto existing = mail.getMessageId();
0170         if (existing.isEmpty()) {
0171             auto tmp = KMime::Message::Ptr::create();
0172             //Genereate a globally unique messageid that doesn't leak the local hostname
0173             messageId = QString{"<" + QUuid::createUuid().toString().mid(1, 36).remove('-') + "@sink>"}.toLatin1();
0174             tmp->messageID(true)->fromUnicodeString(messageId, "utf-8");
0175             SinkWarning() << "Message id is empty, generating one: " << messageId;
0176         } else {
0177             messageId = existing;
0178         }
0179     }
0180 
0181     mail.setExtractedMessageId(messageId);
0182     if (!parentMessageIds.isEmpty()) {
0183         mail.setExtractedParentMessageIds(parentMessageIds);
0184     }
0185     QList<QPair<QString, QString>> contentToIndex;
0186     const auto subject = getString(part->header(KMime::Headers::Subject::staticType()));
0187     contentToIndex.append({{"subject"}, subject});
0188 
0189     const auto plainTextContent = otp.plainTextContent();
0190     if (plainTextContent.isEmpty()) {
0191         contentToIndex.append({{}, toPlain(otp.htmlContent())});
0192     } else {
0193         contentToIndex.append({{}, plainTextContent});
0194     }
0195 
0196     const auto sender = mail.getSender();
0197     contentToIndex.append({{"sender"}, sender.name});
0198     contentToIndex.append({{"sender"}, sender.emailAddress});
0199     for (const auto &c : mail.getTo()) {
0200         contentToIndex.append({{"recipients"}, c.name});
0201         contentToIndex.append({{"recipients"}, c.emailAddress});
0202     }
0203     for (const auto &c : mail.getCc()) {
0204         contentToIndex.append({{"recipients"}, c.name});
0205         contentToIndex.append({{"recipients"}, c.emailAddress});
0206     }
0207     for (const auto &c : mail.getBcc()) {
0208         contentToIndex.append({{"recipients"}, c.name});
0209         contentToIndex.append({{"recipients"}, c.emailAddress});
0210     }
0211 
0212     //Prepare content for indexing;
0213     mail.setProperty("index", QVariant::fromValue(contentToIndex));
0214     mail.setProperty("indexDate", QVariant::fromValue(mail.getDate()));
0215 }
0216 
0217 void MailPropertyExtractor::newEntity(Sink::ApplicationDomain::Mail &mail)
0218 {
0219     updatedIndexedProperties(mail, mail.getMimeMessage());
0220 }
0221 
0222 void MailPropertyExtractor::modifiedEntity(const Sink::ApplicationDomain::Mail &oldMail, Sink::ApplicationDomain::Mail &newMail)
0223 {
0224     updatedIndexedProperties(newMail, newMail.getMimeMessage());
0225 }