File indexing completed on 2024-04-28 15:17:32

0001 /*
0002     This file is part of the KDE Baloo project.
0003     SPDX-FileCopyrightText: 2015 Vishesh Handa <vhanda@kde.org>
0004 
0005     SPDX-License-Identifier: LGPL-2.1-or-later
0006 */
0007 
0008 #include "transaction.h"
0009 #include "documentdb.h"
0010 #include "documenturldb.h"
0011 #include "documentiddb.h"
0012 #include "positiondb.h"
0013 #include "documentdatadb.h"
0014 
0015 #include "document.h"
0016 #include "enginequery.h"
0017 
0018 #include "andpostingiterator.h"
0019 #include "orpostingiterator.h"
0020 #include "phraseanditerator.h"
0021 
0022 #include "idutils.h"
0023 #include "database.h"
0024 #include "databasesize.h"
0025 
0026 #include "enginedebug.h"
0027 
0028 #include <QFile>
0029 #include <QFileInfo>
0030 
0031 #include <iostream>
0032 
0033 using namespace Baloo;
0034 
0035 Transaction::Transaction(const Database& db, Transaction::TransactionType type)
0036     : m_dbis(db.m_dbis)
0037     , m_env(db.m_env)
0038 {
0039     init(type);
0040 }
0041 
0042 void Transaction::reset(TransactionType type)
0043 {
0044     if (m_txn) {
0045         qWarning(ENGINE) << "Resetting a Transaction without calling abort/commit";
0046         abort();
0047     }
0048     init(type);
0049 }
0050 
0051 void Transaction::init(TransactionType type)
0052 {
0053     uint flags = type == ReadOnly ? MDB_RDONLY : 0;
0054     int rc = mdb_txn_begin(m_env, nullptr, flags, &m_txn);
0055     if (rc) {
0056         qCDebug(ENGINE) << "Transaction" << mdb_strerror(rc);
0057         return;
0058     }
0059 
0060     if (type == ReadWrite) {
0061         m_writeTrans = std::make_unique<WriteTransaction>(m_dbis, m_txn);
0062     }
0063 }
0064 
0065 Transaction::Transaction(Database* db, Transaction::TransactionType type)
0066     : Transaction(*db, type)
0067 {
0068 }
0069 
0070 Transaction::~Transaction()
0071 {
0072     if (m_writeTrans) {
0073         qWarning(ENGINE) << "Closing an active WriteTransaction without calling abort/commit";
0074     }
0075 
0076     if (m_txn) {
0077         abort();
0078     }
0079 }
0080 
0081 bool Transaction::hasDocument(quint64 id) const
0082 {
0083     Q_ASSERT(id > 0);
0084 
0085     DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
0086     return docUrlDb.contains(id);
0087 }
0088 
0089 bool Transaction::inPhaseOne(quint64 id) const
0090 {
0091     Q_ASSERT(id > 0);
0092     DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
0093     return contentIndexingDb.contains(id);
0094 }
0095 
0096 bool Transaction::hasFailed(quint64 id) const
0097 {
0098     Q_ASSERT(id > 0);
0099     DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn);
0100     return failedIdDb.contains(id);
0101 }
0102 
0103 QVector<quint64> Transaction::failedIds(quint64 limit) const
0104 {
0105     DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn);
0106     return failedIdDb.fetchItems(limit);
0107 }
0108 
0109 QByteArray Transaction::documentUrl(quint64 id) const
0110 {
0111     Q_ASSERT(m_txn);
0112     Q_ASSERT(id > 0);
0113 
0114     DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
0115     return docUrlDb.get(id);
0116 }
0117 
0118 quint64 Transaction::documentId(const QByteArray& path) const
0119 {
0120     Q_ASSERT(m_txn);
0121     Q_ASSERT(!path.isEmpty());
0122 
0123     DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
0124     QList<QByteArray> li = path.split('/');
0125 
0126     quint64 parentId = 0;
0127     for (const QByteArray& fileName : li) {
0128         if (fileName.isEmpty()) {
0129             continue;
0130         }
0131 
0132         parentId = docUrlDb.getId(parentId, fileName);
0133         if (!parentId) {
0134             return 0;
0135         }
0136     }
0137 
0138     return parentId;
0139 }
0140 
0141 DocumentTimeDB::TimeInfo Transaction::documentTimeInfo(quint64 id) const
0142 {
0143     Q_ASSERT(m_txn);
0144 
0145     DocumentTimeDB docTimeDb(m_dbis.docTimeDbi, m_txn);
0146     return docTimeDb.get(id);
0147 }
0148 
0149 QByteArray Transaction::documentData(quint64 id) const
0150 {
0151     Q_ASSERT(m_txn);
0152     Q_ASSERT(id > 0);
0153 
0154     DocumentDataDB docDataDb(m_dbis.docDataDbi, m_txn);
0155     return docDataDb.get(id);
0156 }
0157 
0158 QVector<quint64> Transaction::fetchPhaseOneIds(int size) const
0159 {
0160     Q_ASSERT(m_txn);
0161     Q_ASSERT(size > 0);
0162 
0163     DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
0164     return contentIndexingDb.fetchItems(size);
0165 }
0166 
0167 QVector<QByteArray> Transaction::fetchTermsStartingWith(const QByteArray& term) const
0168 {
0169     Q_ASSERT(term.size() > 0);
0170 
0171     PostingDB postingDb(m_dbis.postingDbi, m_txn);
0172     return postingDb.fetchTermsStartingWith(term);
0173 }
0174 
0175 uint Transaction::phaseOneSize() const
0176 {
0177     Q_ASSERT(m_txn);
0178 
0179     DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
0180     return contentIndexingDb.size();
0181 }
0182 
0183 uint Transaction::size() const
0184 {
0185     Q_ASSERT(m_txn);
0186 
0187     DocumentDB docTermsDb(m_dbis.docTermsDbi, m_txn);
0188     return docTermsDb.size();
0189 }
0190 
0191 //
0192 // Write Operations
0193 //
0194 void Transaction::setPhaseOne(quint64 id)
0195 {
0196     Q_ASSERT(m_txn);
0197     Q_ASSERT(id > 0);
0198     Q_ASSERT(m_writeTrans);
0199 
0200     DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
0201     contentIndexingDb.put(id);
0202 }
0203 
0204 void Transaction::removePhaseOne(quint64 id)
0205 {
0206     Q_ASSERT(m_txn);
0207     Q_ASSERT(id > 0);
0208     Q_ASSERT(m_writeTrans);
0209 
0210     DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn);
0211     contentIndexingDb.del(id);
0212 }
0213 
0214 void Transaction::addFailed(quint64 id)
0215 {
0216     Q_ASSERT(m_txn);
0217     Q_ASSERT(id > 0);
0218     Q_ASSERT(m_writeTrans);
0219 
0220     DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn);
0221     failedIdDb.put(id);
0222 }
0223 
0224 void Transaction::addDocument(const Document& doc)
0225 {
0226     Q_ASSERT(m_txn);
0227     Q_ASSERT(doc.id() > 0);
0228     if (!m_writeTrans) {
0229         qCWarning(ENGINE) << "m_writeTrans is null";
0230         return;
0231     }
0232 
0233     m_writeTrans->addDocument(doc);
0234 }
0235 
0236 void Transaction::removeDocument(quint64 id)
0237 {
0238     Q_ASSERT(m_txn);
0239     Q_ASSERT(id > 0);
0240     if (!m_writeTrans) {
0241         qCWarning(ENGINE) << "m_writeTrans is null";
0242         return;
0243     }
0244 
0245     m_writeTrans->removeDocument(id);
0246 }
0247 
0248 void Transaction::removeRecursively(quint64 id)
0249 {
0250     Q_ASSERT(m_txn);
0251     Q_ASSERT(id > 0);
0252     if (!m_writeTrans) {
0253         qCWarning(ENGINE) << "m_writeTrans is null";
0254         return;
0255     }
0256 
0257     m_writeTrans->removeRecursively(id);
0258 }
0259 
0260 void Transaction::replaceDocument(const Document& doc, DocumentOperations operations)
0261 {
0262     Q_ASSERT(m_txn);
0263     Q_ASSERT(doc.id() > 0);
0264     Q_ASSERT(m_writeTrans);
0265     if (!hasDocument(doc.id())) {
0266         qCDebug(ENGINE) << "Transaction::replaceDocument" << "Document does not exist";
0267     }
0268 
0269     if (!m_writeTrans) {
0270         qCWarning(ENGINE) << "m_writeTrans is null";
0271         return;
0272     }
0273 
0274     m_writeTrans->replaceDocument(doc, operations);
0275 }
0276 
0277 bool Transaction::commit()
0278 {
0279     Q_ASSERT(m_txn);
0280     if (!m_writeTrans) {
0281         qCWarning(ENGINE) << "m_writeTrans is null";
0282         return false;
0283     }
0284 
0285     m_writeTrans->commit();
0286     m_writeTrans.reset();
0287 
0288     int rc = mdb_txn_commit(m_txn);
0289     m_txn = nullptr;
0290 
0291     if (rc) {
0292         qCWarning(ENGINE) << "Transaction::commit" << mdb_strerror(rc);
0293         return false;
0294     }
0295 
0296     return true;
0297 }
0298 
0299 void Transaction::abort()
0300 {
0301     Q_ASSERT(m_txn);
0302 
0303     mdb_txn_abort(m_txn);
0304     m_txn = nullptr;
0305 
0306     m_writeTrans.reset();
0307 }
0308 
0309 //
0310 // Queries
0311 //
0312 
0313 PostingIterator* Transaction::postingIterator(const EngineQuery& query) const
0314 {
0315     PostingDB postingDb(m_dbis.postingDbi, m_txn);
0316     PositionDB positionDb(m_dbis.positionDBi, m_txn);
0317 
0318     if (query.leaf()) {
0319         if (query.op() == EngineQuery::Equal) {
0320             return postingDb.iter(query.term());
0321         } else if (query.op() == EngineQuery::StartsWith) {
0322             return postingDb.prefixIter(query.term());
0323         } else {
0324             Q_ASSERT(0);
0325         }
0326     }
0327 
0328     const auto subQueries = query.subQueries();
0329     if (subQueries.isEmpty()) {
0330         return nullptr;
0331     }
0332 
0333     Q_ASSERT(query.op() == EngineQuery::Phrase);
0334     if (query.op() == EngineQuery::Phrase) {
0335         if (subQueries.size() == 1) {
0336             qCDebug(ENGINE) << "Degenerated Phrase with 1 Term:" <<  query;
0337             return postingIterator(subQueries[0]);
0338         }
0339         QVector<VectorPositionInfoIterator*> vec;
0340         vec.reserve(subQueries.size());
0341         for (const EngineQuery& q : subQueries) {
0342             if (!q.leaf()) {
0343                 qCDebug(ENGINE) << "Transaction::toPostingIterator" << "Phrase subqueries must be leafs";
0344                 continue;
0345             }
0346             auto termMatch = positionDb.iter(q.term());
0347             if (!termMatch) {
0348                 return nullptr;
0349             }
0350             vec << termMatch;
0351         }
0352 
0353         return new PhraseAndIterator(vec);
0354     }
0355 
0356     return nullptr;
0357 }
0358 
0359 PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, qlonglong value, PostingDB::Comparator com) const
0360 {
0361     PostingDB postingDb(m_dbis.postingDbi, m_txn);
0362     return postingDb.compIter(prefix, value, com);
0363 }
0364 
0365 PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, double value, PostingDB::Comparator com) const
0366 {
0367     PostingDB postingDb(m_dbis.postingDbi, m_txn);
0368     return postingDb.compIter(prefix, value, com);
0369 }
0370 
0371 PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, const QByteArray& value, PostingDB::Comparator com) const
0372 {
0373     PostingDB postingDb(m_dbis.postingDbi, m_txn);
0374     return postingDb.compIter(prefix, value, com);
0375 }
0376 
0377 PostingIterator* Transaction::mTimeRangeIter(quint32 beginTime, quint32 endTime) const
0378 {
0379     MTimeDB mTimeDb(m_dbis.mtimeDbi, m_txn);
0380     return mTimeDb.iterRange(beginTime, endTime);
0381 }
0382 
0383 PostingIterator* Transaction::docUrlIter(quint64 id) const
0384 {
0385     DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
0386     return docUrlDb.iter(id);
0387 }
0388 
0389 //
0390 // Introspection
0391 //
0392 
0393 QVector<QByteArray> Transaction::documentTerms(quint64 docId) const
0394 {
0395     Q_ASSERT(docId);
0396 
0397     DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn);
0398     return documentTermsDB.get(docId);
0399 }
0400 
0401 QVector<QByteArray> Transaction::documentFileNameTerms(quint64 docId) const
0402 {
0403     Q_ASSERT(docId);
0404 
0405     DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn);
0406     return documentFileNameTermsDB.get(docId);
0407 }
0408 
0409 QVector<QByteArray> Transaction::documentXattrTerms(quint64 docId) const
0410 {
0411     Q_ASSERT(docId);
0412 
0413     DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn);
0414     return documentXattrTermsDB.get(docId);
0415 }
0416 
0417 //
0418 // File Size
0419 //
0420 static size_t dbiSize(MDB_txn* txn, MDB_dbi dbi)
0421 {
0422     MDB_stat stat;
0423     mdb_stat(txn, dbi, &stat);
0424 
0425     return (stat.ms_branch_pages + stat.ms_leaf_pages + stat.ms_overflow_pages) * stat.ms_psize;
0426 }
0427 
0428 DatabaseSize Transaction::dbSize()
0429 {
0430     DatabaseSize dbSize;
0431     dbSize.postingDb = dbiSize(m_txn, m_dbis.postingDbi);
0432     dbSize.positionDb = dbiSize(m_txn, m_dbis.positionDBi);
0433     dbSize.docTerms = dbiSize(m_txn, m_dbis.docTermsDbi);
0434     dbSize.docFilenameTerms = dbiSize(m_txn, m_dbis.docFilenameTermsDbi);
0435     dbSize.docXattrTerms = dbiSize(m_txn, m_dbis.docXattrTermsDbi);
0436 
0437     dbSize.idTree = dbiSize(m_txn, m_dbis.idTreeDbi);
0438     dbSize.idFilename = dbiSize(m_txn, m_dbis.idFilenameDbi);
0439 
0440     dbSize.docTime = dbiSize(m_txn, m_dbis.docTimeDbi);
0441     dbSize.docData = dbiSize(m_txn, m_dbis.docDataDbi);
0442 
0443     dbSize.contentIndexingIds = dbiSize(m_txn, m_dbis.contentIndexingDbi);
0444     dbSize.failedIds = dbiSize(m_txn, m_dbis.failedIdDbi);
0445 
0446     dbSize.mtimeDb = dbiSize(m_txn, m_dbis.mtimeDbi);
0447 
0448     dbSize.expectedSize = dbSize.postingDb + dbSize.positionDb + dbSize.docTerms + dbSize.docFilenameTerms
0449                   + dbSize.docXattrTerms + dbSize.idTree + dbSize.idFilename + dbSize.docTime
0450                   + dbSize.docData + dbSize.contentIndexingIds + dbSize.failedIds + dbSize.mtimeDb;
0451 
0452     MDB_envinfo info;
0453     mdb_env_info(m_env, &info);
0454     dbSize.actualSize = info.me_last_pgno * 4096; // TODO: separate page size
0455 
0456     return dbSize;
0457 }
0458 
0459 //
0460 // Debugging
0461 //
0462 void Transaction::checkFsTree()
0463 {
0464     DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn);
0465     DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn);
0466     DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn);
0467     DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn);
0468     PostingDB postingDb(m_dbis.postingDbi, m_txn);
0469 
0470     const auto map = postingDb.toTestMap();
0471 
0472     QSet<quint64> allIds;
0473     for (const auto& list : map) {
0474         for (quint64 id : list) {
0475             allIds << id;
0476         }
0477     }
0478 
0479     std::cout << "Total Document IDs: " << allIds.size() << std::endl;
0480 
0481     int count = 0;
0482     for (quint64 id: std::as_const(allIds)) {
0483         QByteArray url = docUrlDb.get(id);
0484         if (url.isEmpty()) {
0485             auto terms = documentTermsDB.get(id);
0486             auto fileNameTerms = documentFileNameTermsDB.get(id);
0487             auto xAttrTerms = documentXattrTermsDB.get(id);
0488 
0489             // Lets reverse engineer the terms
0490             QList<QByteArray> newTerms;
0491             QMapIterator<QByteArray, PostingList> it(map);
0492             while (it.hasNext()) {
0493                 it.next();
0494                 if (it.value().contains(id)) {
0495                     newTerms << it.key();
0496                 }
0497             }
0498 
0499             std::cout << "Missing filePath for " << id << std::endl;
0500             std::cout << "\tPostingDB Terms: ";
0501             for (const QByteArray& term : std::as_const(newTerms)) {
0502                 std::cout << qPrintable(QString::fromUtf8(term)) << " ";
0503             }
0504             std::cout << std::endl;
0505 
0506             std::cout << "\tDocumentTermsDB: ";
0507             for (const QByteArray& term : terms) {
0508                 std::cout << qPrintable(QString::fromUtf8(term)) << " ";
0509             }
0510             std::cout << std::endl;
0511 
0512             std::cout << "\tFileNameTermsDB: ";
0513             for (const QByteArray& term : fileNameTerms) {
0514                 std::cout << qPrintable(QString::fromUtf8(term)) << " ";
0515             }
0516             std::cout << std::endl;
0517 
0518             std::cout << "\tXAttrTermsDB: ";
0519             for (const QByteArray& term : xAttrTerms) {
0520                 std::cout << qPrintable(QString::fromUtf8(term)) << " ";
0521             }
0522             std::cout << std::endl;
0523 
0524             count++;
0525         } else if (!QFileInfo::exists(QString::fromUtf8(url))) {
0526             std::cout << "FilePath " << qPrintable(QString::fromUtf8(url)) << " for " << id << " does not exist"<< std::endl;
0527             count++;
0528         }
0529     }
0530 
0531     std::cout << "Invalid Entries: " << count << " (" << count * 100.0 / allIds.size() << "%)" << std::endl;
0532 }
0533 
0534 void Transaction::checkTermsDbinPostingDb()
0535 {
0536     DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn);
0537     DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn);
0538     DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn);
0539     PostingDB postingDb(m_dbis.postingDbi, m_txn);
0540 
0541     // Iterate over each document, and fetch all terms
0542     // check if each term maps to its own id in the posting db
0543 
0544     const auto map = postingDb.toTestMap();
0545 
0546     QSet<quint64> allIds;
0547     for (const auto& list : map) {
0548         for (quint64 id : list) {
0549             allIds << id;
0550         }
0551     }
0552 
0553     std::cout << "PostingDB check .." << std::endl;
0554     for (quint64 id : std::as_const(allIds)) {
0555         QVector<QByteArray> terms = documentTermsDB.get(id);
0556         terms += documentXattrTermsDB.get(id);
0557         terms += documentFileNameTermsDB.get(id);
0558 
0559         for (const QByteArray& term : std::as_const(terms)) {
0560             PostingList plist = postingDb.get(term);
0561             if (!plist.contains(id)) {
0562                 std::cout << id << " is missing term " << qPrintable(QString::fromUtf8(term)) << std::endl;
0563             }
0564         }
0565     }
0566 }
0567 
0568 void Transaction::checkPostingDbinTermsDb()
0569 {
0570     DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn);
0571     DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn);
0572     DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn);
0573     PostingDB postingDb(m_dbis.postingDbi, m_txn);
0574 
0575     QMap<QByteArray, PostingList> map = postingDb.toTestMap();
0576     QMapIterator<QByteArray, PostingList> it(map);
0577 
0578     std::cout << "DocumentTermsDB check .." << std::endl;
0579     while (it.hasNext()) {
0580         it.next();
0581 
0582         const QByteArray& term = it.key();
0583         const PostingList& list = it.value();
0584         for (quint64 id : list) {
0585             if (documentTermsDB.get(id).contains(term)) {
0586                 continue;
0587             }
0588             if (documentFileNameTermsDB.get(id).contains(term)) {
0589                 continue;
0590             }
0591             if (documentXattrTermsDB.get(id).contains(term)) {
0592                 continue;
0593             }
0594             std::cout << id << " is missing " << qPrintable(QString::fromUtf8(term)) << " from document terms db" << std::endl;
0595         }
0596     }
0597 }