Warning, file /pim/akonadi-search/agent/emailindexer.cpp was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).

0001 /*
0002  * This file is part of the KDE Akonadi Search Project
0003  * SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in>
0004  *
0005  * SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL
0006  *
0007  */
0008 
0009 #include "emailindexer.h"
0010 #include "akonadi_indexer_agent_email_debug.h"
0011 
0012 #include <Akonadi/Collection>
0013 #include <Akonadi/MessageFlags>
0014 #include <KEmailAddress>
0015 
0016 #include <QProcess>
0017 
0018 EmailIndexer::EmailIndexer(const QString &path, const QString &contactDbPath)
0019     : AbstractIndexer()
0020 {
0021     try {
0022         m_db = new Xapian::WritableDatabase(path.toStdString(), Xapian::DB_CREATE_OR_OPEN);
0023     } catch (const Xapian::DatabaseCorruptError &err) {
0024         qCWarning(AKONADI_INDEXER_AGENT_EMAIL_LOG) << "Database Corrupted - What did you do?";
0025         qCWarning(AKONADI_INDEXER_AGENT_EMAIL_LOG) << err.get_error_string();
0026         m_db = nullptr;
0027     } catch (const Xapian::Error &e) {
0028         qCWarning(AKONADI_INDEXER_AGENT_EMAIL_LOG) << QString::fromStdString(e.get_type()) << QString::fromStdString(e.get_description());
0029         m_db = nullptr;
0030     }
0031 
0032     try {
0033         m_contactDb = new Xapian::WritableDatabase(contactDbPath.toStdString(), Xapian::DB_CREATE_OR_OPEN);
0034     } catch (const Xapian::DatabaseCorruptError &err) {
0035         qCWarning(AKONADI_INDEXER_AGENT_EMAIL_LOG) << "Database Corrupted - What did you do?";
0036         qCWarning(AKONADI_INDEXER_AGENT_EMAIL_LOG) << err.get_error_string();
0037         m_contactDb = nullptr;
0038     } catch (const Xapian::Error &e) {
0039         qCWarning(AKONADI_INDEXER_AGENT_EMAIL_LOG) << QString::fromStdString(e.get_type()) << QString::fromStdString(e.get_description());
0040         m_contactDb = nullptr;
0041     }
0042 }
0043 
0044 EmailIndexer::~EmailIndexer()
0045 {
0046     commit();
0047     delete m_db;
0048     delete m_contactDb;
0049 }
0050 
0051 QStringList EmailIndexer::mimeTypes() const
0052 {
0053     return QStringList() << KMime::Message::mimeType();
0054 }
0055 
0056 void EmailIndexer::index(const Akonadi::Item &item)
0057 {
0058     qCDebug(AKONADI_INDEXER_AGENT_EMAIL_LOG) << "Indexing item" << item.id();
0059     if (!m_db) {
0060         return;
0061     }
0062     Akonadi::MessageStatus status;
0063     status.setStatusFromFlags(item.flags());
0064     if (status.isSpam()) {
0065         return;
0066     }
0067 
0068     KMime::Message::Ptr msg;
0069     try {
0070         msg = item.payload<KMime::Message::Ptr>();
0071     } catch (const Akonadi::PayloadException &) {
0072         return;
0073     }
0074 
0075     m_doc = new Xapian::Document();
0076     m_termGen = new Xapian::TermGenerator();
0077     m_termGen->set_document(*m_doc);
0078     m_termGen->set_database(*m_db);
0079 
0080     processMessageStatus(status);
0081     process(msg);
0082 
0083     // Size
0084     m_doc->add_value(1, QString::number(item.size()).toStdString());
0085 
0086     // Parent collection
0087     Q_ASSERT_X(item.parentCollection().isValid(), "Akonadi::Search::EmailIndexer::index", "Item does not have a valid parent collection");
0088 
0089     const Akonadi::Collection::Id colId = item.parentCollection().id();
0090     const QByteArray term = 'C' + QByteArray::number(colId);
0091     m_doc->add_boolean_term(term.data());
0092 
0093     m_db->replace_document(item.id(), *m_doc);
0094 
0095     delete m_doc;
0096     delete m_termGen;
0097 
0098     m_doc = nullptr;
0099     m_termGen = nullptr;
0100     qCDebug(AKONADI_INDEXER_AGENT_EMAIL_LOG) << "DONE Indexing item" << item.id();
0101 }
0102 
0103 void EmailIndexer::insert(const QByteArray &key, KMime::Headers::Base *unstructured)
0104 {
0105     if (unstructured) {
0106         m_termGen->index_text_without_positions(unstructured->asUnicodeString().toStdString(), 1, key.data());
0107     }
0108 }
0109 
0110 void EmailIndexer::insert(const QByteArray &key, KMime::Headers::Generics::MailboxList *mlist)
0111 {
0112     if (mlist) {
0113         insert(key, mlist->mailboxes());
0114     }
0115 }
0116 
0117 void EmailIndexer::insert(const QByteArray &key, KMime::Headers::Generics::AddressList *alist)
0118 {
0119     if (alist) {
0120         insert(key, alist->mailboxes());
0121     }
0122 }
0123 
0124 namespace
0125 {
0126 // Does some extra stuff such as lower casing the email, removing all quotes
0127 // and removing extra spaces
0128 // TODO: Move this into KMime?
0129 // TODO: If name is all upper/lower then try to captialize it?
0130 QString prettyAddress(const KMime::Types::Mailbox &mbox)
0131 {
0132     const QString name = mbox.name().simplified();
0133     const QByteArray email = mbox.address().simplified().toLower();
0134     return KEmailAddress::normalizedAddress(name, QString::fromUtf8(email));
0135 }
0136 }
0137 
0138 // Add once with a prefix and once without
0139 void EmailIndexer::insert(const QByteArray &key, const KMime::Types::Mailbox::List &list)
0140 {
0141     if (!m_contactDb) {
0142         return;
0143     }
0144     for (const KMime::Types::Mailbox &mbox : list) {
0145         const auto name(mbox.name().toStdString());
0146         m_termGen->index_text_without_positions(name, 1, key.data());
0147         m_termGen->index_text_without_positions(name, 1);
0148         m_termGen->index_text_without_positions(mbox.address().data(), 1, key.data());
0149         m_termGen->index_text_without_positions(mbox.address().data(), 1);
0150 
0151         m_doc->add_term(QByteArray(key + mbox.address()).data());
0152         m_doc->add_term(mbox.address().data());
0153 
0154         //
0155         // Add emails for email auto-completion
0156         //
0157         const auto pa = prettyAddress(mbox);
0158         const auto id = qHash(pa);
0159         try {
0160             const auto doc = m_contactDb->get_document(id);
0161             Q_UNUSED(doc);
0162             continue;
0163         } catch (const Xapian::DocNotFoundError &) {
0164             Xapian::Document doc;
0165             const auto pretty(pa.toStdString());
0166             doc.set_data(pretty);
0167 
0168             Xapian::TermGenerator termGen;
0169             termGen.set_document(doc);
0170             termGen.index_text(pretty);
0171 
0172             doc.add_term(mbox.address().data());
0173             m_contactDb->replace_document(id, doc);
0174         }
0175     }
0176 }
0177 
0178 // FIXME: Only index properties that are actually searched!
0179 void EmailIndexer::process(const KMime::Message::Ptr &msg)
0180 {
0181     //
0182     // Process Headers
0183     // (Give the subject a higher priority)
0184     KMime::Headers::Subject *subject = msg->subject(false);
0185     if (subject) {
0186         const std::string str{normalizeString(subject->asUnicodeString()).toStdString()};
0187         qCDebug(AKONADI_INDEXER_AGENT_EMAIL_LOG) << "Indexing" << str.c_str();
0188         m_termGen->index_text_without_positions(str, 1, "SU");
0189         m_termGen->index_text_without_positions(str, 100);
0190         m_doc->set_data(str);
0191     }
0192 
0193     KMime::Headers::Date *date = msg->date(false);
0194     if (date) {
0195         const QString str = QString::number(date->dateTime().toSecsSinceEpoch());
0196         m_doc->add_value(0, str.toStdString());
0197         const QString julianDay = QString::number(date->dateTime().date().toJulianDay());
0198         m_doc->add_value(2, julianDay.toStdString());
0199     }
0200 
0201     insert("F", msg->from(false));
0202     insert("T", msg->to(false));
0203     insert("CC", msg->cc(false));
0204     insert("BC", msg->bcc(false));
0205     insert("O", msg->organization(false));
0206     insert("RT", msg->replyTo(false));
0207     insert("RF", msg->headerByType("Resent-From"));
0208     insert("LI", msg->headerByType("List-Id"));
0209     insert("XL", msg->headerByType("X-Loop"));
0210     insert("XML", msg->headerByType("X-Mailing-List"));
0211     insert("XSF", msg->headerByType("X-Spam-Flag"));
0212 
0213     //
0214     // Process Plain Text Content
0215     //
0216 
0217     // Index all headers
0218     m_termGen->index_text_without_positions(std::string(msg->head().constData()), 1, "HE");
0219 
0220     KMime::Content *mainBody = msg->mainBodyPart("text/plain");
0221     if (mainBody) {
0222         const std::string text(normalizeString(mainBody->decodedText()).toStdString());
0223         m_termGen->index_text_without_positions(text);
0224         m_termGen->index_text_without_positions(text, 1, "BO");
0225     } else {
0226         processPart(msg.data(), nullptr);
0227     }
0228 }
0229 
0230 void EmailIndexer::processPart(KMime::Content *content, KMime::Content *mainContent)
0231 {
0232     if (content == mainContent) {
0233         return;
0234     }
0235 
0236     KMime::Headers::ContentType *type = content->contentType(false);
0237     if (type) {
0238         if (type->isMultipart()) {
0239             if (type->isSubtype("encrypted")) {
0240                 return;
0241             }
0242 
0243             for (KMime::Content *c : content->contents()) {
0244                 processPart(c, mainContent);
0245             }
0246         }
0247 
0248         // Only get HTML content, if no plain text content
0249         if (!mainContent && type->isHTMLText()) {
0250             QProcess converter;
0251             converter.start(QStringLiteral("akonadi_html_to_text"));
0252             if (!converter.waitForStarted()) {
0253                 return;
0254             }
0255 
0256             converter.write(content->decodedText().toUtf8());
0257             converter.closeWriteChannel();
0258 
0259             if (!converter.waitForFinished()) {
0260                 return;
0261             }
0262 
0263             const auto text = converter.readAll().toStdString();
0264 
0265             m_termGen->index_text_without_positions(text);
0266         }
0267     }
0268 
0269     // FIXME: Handle attachments?
0270 }
0271 
0272 void EmailIndexer::processMessageStatus(Akonadi::MessageStatus status)
0273 {
0274     insertBool('R', status.isRead());
0275     insertBool('A', status.hasAttachment());
0276     insertBool('I', status.isImportant());
0277     insertBool('W', status.isWatched());
0278     insertBool('T', status.isToAct());
0279     insertBool('D', status.isDeleted());
0280     insertBool('S', status.isSpam());
0281     insertBool('E', status.isReplied());
0282     insertBool('G', status.isIgnored());
0283     insertBool('F', status.isForwarded());
0284     insertBool('N', status.isSent());
0285     insertBool('Q', status.isQueued());
0286     insertBool('H', status.isHam());
0287     insertBool('C', status.isEncrypted());
0288     insertBool('V', status.hasInvitation());
0289 }
0290 
0291 void EmailIndexer::insertBool(char key, bool value)
0292 {
0293     QByteArray term("B");
0294     if (value) {
0295         term.append(key);
0296     } else {
0297         term.append('N');
0298         term.append(key);
0299     }
0300 
0301     m_doc->add_boolean_term(term.data());
0302 }
0303 
0304 void EmailIndexer::toggleFlag(Xapian::Document &doc, const char *remove, const char *add)
0305 {
0306     try {
0307         doc.remove_term(remove);
0308     } catch (const Xapian::InvalidArgumentError &e) {
0309         // The previous flag state was not indexed, continue
0310     }
0311 
0312     doc.add_term(add);
0313 }
0314 
0315 void EmailIndexer::updateFlags(const Akonadi::Item &item, const QSet<QByteArray> &added, const QSet<QByteArray> &removed)
0316 {
0317     if (!m_db) {
0318         return;
0319     }
0320     Xapian::Document doc;
0321     try {
0322         doc = m_db->get_document(item.id());
0323     } catch (const Xapian::DocNotFoundError &) {
0324         return;
0325     }
0326 
0327     for (const QByteArray &flag : removed) {
0328         if (flag == Akonadi::MessageFlags::Seen) {
0329             toggleFlag(doc, "BR", "BNR");
0330         } else if (flag == Akonadi::MessageFlags::Flagged) {
0331             toggleFlag(doc, "BI", "BNI");
0332         } else if (flag == Akonadi::MessageFlags::Watched) {
0333             toggleFlag(doc, "BW", "BNW");
0334         }
0335     }
0336 
0337     for (const QByteArray &flag : added) {
0338         if (flag == Akonadi::MessageFlags::Seen) {
0339             toggleFlag(doc, "BNR", "BR");
0340         } else if (flag == Akonadi::MessageFlags::Flagged) {
0341             toggleFlag(doc, "BNI", "BI");
0342         } else if (flag == Akonadi::MessageFlags::Watched) {
0343             toggleFlag(doc, "BNW", "BW");
0344         }
0345     }
0346 
0347     m_db->replace_document(doc.get_docid(), doc);
0348 }
0349 
0350 void EmailIndexer::remove(const Akonadi::Item &item)
0351 {
0352     if (!m_db) {
0353         return;
0354     }
0355     try {
0356         m_db->delete_document(item.id());
0357         // TODO remove contacts from contact db?
0358     } catch (const Xapian::DocNotFoundError &) {
0359         return;
0360     }
0361 }
0362 
0363 void EmailIndexer::remove(const Akonadi::Collection &collection)
0364 {
0365     if (!m_db) {
0366         return;
0367     }
0368     try {
0369         Xapian::Query query('C' + QString::number(collection.id()).toStdString());
0370         Xapian::Enquire enquire(*m_db);
0371         enquire.set_query(query);
0372 
0373         Xapian::MSet mset = enquire.get_mset(0, m_db->get_doccount());
0374         Xapian::MSetIterator end = mset.end();
0375         for (Xapian::MSetIterator it = mset.begin(); it != end; ++it) {
0376             const qint64 id = *it;
0377             remove(Akonadi::Item(id));
0378         }
0379     } catch (const Xapian::DocNotFoundError &) {
0380         return;
0381     }
0382 }
0383 
0384 void EmailIndexer::move(Akonadi::Item::Id itemId, Akonadi::Collection::Id from, Akonadi::Collection::Id to)
0385 {
0386     if (!m_db) {
0387         return;
0388     }
0389     Xapian::Document doc;
0390     try {
0391         doc = m_db->get_document(itemId);
0392     } catch (const Xapian::DocNotFoundError &) {
0393         return;
0394     }
0395 
0396     const QByteArray ft = 'C' + QByteArray::number(from);
0397     const QByteArray tt = 'C' + QByteArray::number(to);
0398 
0399     doc.remove_term(ft.data());
0400     doc.add_boolean_term(tt.data());
0401     m_db->replace_document(doc.get_docid(), doc);
0402 }
0403 
0404 void EmailIndexer::commit()
0405 {
0406     if (m_db) {
0407         try {
0408             m_db->commit();
0409         } catch (const Xapian::Error &err) {
0410             qCWarning(AKONADI_INDEXER_AGENT_EMAIL_LOG) << err.get_error_string();
0411         }
0412         qCDebug(AKONADI_INDEXER_AGENT_EMAIL_LOG) << "Xapian Committed";
0413     }
0414 
0415     if (m_contactDb) {
0416         try {
0417             m_contactDb->commit();
0418         } catch (const Xapian::Error &err) {
0419             qCWarning(AKONADI_INDEXER_AGENT_EMAIL_LOG) << err.get_error_string();
0420         }
0421         qCDebug(AKONADI_INDEXER_AGENT_EMAIL_LOG) << "Xapian Committed";
0422     }
0423 }