// File indexing completed on 2024-03-24 03:54:30
0001 /* 0002 This file is part of the KDE Baloo project. 0003 SPDX-FileCopyrightText: 2015 Vishesh Handa <vhanda@kde.org> 0004 0005 SPDX-License-Identifier: LGPL-2.1-or-later 0006 */ 0007 0008 #include "transaction.h" 0009 #include "documentdb.h" 0010 #include "documenturldb.h" 0011 #include "documentiddb.h" 0012 #include "positiondb.h" 0013 #include "documentdatadb.h" 0014 0015 #include "document.h" 0016 #include "enginequery.h" 0017 0018 #include "andpostingiterator.h" 0019 #include "orpostingiterator.h" 0020 #include "phraseanditerator.h" 0021 0022 #include "idutils.h" 0023 #include "database.h" 0024 #include "databasesize.h" 0025 0026 #include "enginedebug.h" 0027 0028 #include <QFile> 0029 #include <QFileInfo> 0030 0031 #include <iostream> 0032 0033 using namespace Baloo; 0034 0035 Transaction::Transaction(const Database& db, Transaction::TransactionType type) 0036 : m_dbis(db.m_dbis) 0037 , m_env(db.m_env) 0038 { 0039 init(type); 0040 } 0041 0042 void Transaction::reset(TransactionType type) 0043 { 0044 if (m_txn) { 0045 qWarning(ENGINE) << "Resetting a Transaction without calling abort/commit"; 0046 abort(); 0047 } 0048 init(type); 0049 } 0050 0051 void Transaction::init(TransactionType type) 0052 { 0053 uint flags = type == ReadOnly ? 
MDB_RDONLY : 0; 0054 int rc = mdb_txn_begin(m_env, nullptr, flags, &m_txn); 0055 if (rc) { 0056 qCDebug(ENGINE) << "Transaction" << mdb_strerror(rc); 0057 return; 0058 } 0059 0060 if (type == ReadWrite) { 0061 m_writeTrans = std::make_unique<WriteTransaction>(m_dbis, m_txn); 0062 } 0063 } 0064 0065 Transaction::Transaction(Database* db, Transaction::TransactionType type) 0066 : Transaction(*db, type) 0067 { 0068 } 0069 0070 Transaction::~Transaction() 0071 { 0072 if (m_writeTrans) { 0073 qWarning(ENGINE) << "Closing an active WriteTransaction without calling abort/commit"; 0074 } 0075 0076 if (m_txn) { 0077 abort(); 0078 } 0079 } 0080 0081 bool Transaction::hasDocument(quint64 id) const 0082 { 0083 Q_ASSERT(id > 0); 0084 0085 DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); 0086 return docUrlDb.contains(id); 0087 } 0088 0089 bool Transaction::inPhaseOne(quint64 id) const 0090 { 0091 Q_ASSERT(id > 0); 0092 DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); 0093 return contentIndexingDb.contains(id); 0094 } 0095 0096 bool Transaction::hasFailed(quint64 id) const 0097 { 0098 Q_ASSERT(id > 0); 0099 DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn); 0100 return failedIdDb.contains(id); 0101 } 0102 0103 QVector<quint64> Transaction::failedIds(quint64 limit) const 0104 { 0105 DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn); 0106 return failedIdDb.fetchItems(limit); 0107 } 0108 0109 QByteArray Transaction::documentUrl(quint64 id) const 0110 { 0111 Q_ASSERT(m_txn); 0112 Q_ASSERT(id > 0); 0113 0114 DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); 0115 return docUrlDb.get(id); 0116 } 0117 0118 quint64 Transaction::documentId(const QByteArray& path) const 0119 { 0120 Q_ASSERT(m_txn); 0121 Q_ASSERT(!path.isEmpty()); 0122 0123 DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); 0124 QList<QByteArray> li = path.split('/'); 0125 0126 quint64 parentId = 0; 0127 for (const QByteArray& fileName : li) { 
0128 if (fileName.isEmpty()) { 0129 continue; 0130 } 0131 0132 parentId = docUrlDb.getId(parentId, fileName); 0133 if (!parentId) { 0134 return 0; 0135 } 0136 } 0137 0138 return parentId; 0139 } 0140 0141 DocumentTimeDB::TimeInfo Transaction::documentTimeInfo(quint64 id) const 0142 { 0143 Q_ASSERT(m_txn); 0144 0145 DocumentTimeDB docTimeDb(m_dbis.docTimeDbi, m_txn); 0146 return docTimeDb.get(id); 0147 } 0148 0149 QByteArray Transaction::documentData(quint64 id) const 0150 { 0151 Q_ASSERT(m_txn); 0152 Q_ASSERT(id > 0); 0153 0154 DocumentDataDB docDataDb(m_dbis.docDataDbi, m_txn); 0155 return docDataDb.get(id); 0156 } 0157 0158 QVector<quint64> Transaction::fetchPhaseOneIds(int size) const 0159 { 0160 Q_ASSERT(m_txn); 0161 Q_ASSERT(size > 0); 0162 0163 DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); 0164 return contentIndexingDb.fetchItems(size); 0165 } 0166 0167 QVector<QByteArray> Transaction::fetchTermsStartingWith(const QByteArray& term) const 0168 { 0169 Q_ASSERT(term.size() > 0); 0170 0171 PostingDB postingDb(m_dbis.postingDbi, m_txn); 0172 return postingDb.fetchTermsStartingWith(term); 0173 } 0174 0175 uint Transaction::phaseOneSize() const 0176 { 0177 Q_ASSERT(m_txn); 0178 0179 DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); 0180 return contentIndexingDb.size(); 0181 } 0182 0183 uint Transaction::size() const 0184 { 0185 Q_ASSERT(m_txn); 0186 0187 DocumentDB docTermsDb(m_dbis.docTermsDbi, m_txn); 0188 return docTermsDb.size(); 0189 } 0190 0191 // 0192 // Write Operations 0193 // 0194 void Transaction::setPhaseOne(quint64 id) 0195 { 0196 Q_ASSERT(m_txn); 0197 Q_ASSERT(id > 0); 0198 Q_ASSERT(m_writeTrans); 0199 0200 DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, m_txn); 0201 contentIndexingDb.put(id); 0202 } 0203 0204 void Transaction::removePhaseOne(quint64 id) 0205 { 0206 Q_ASSERT(m_txn); 0207 Q_ASSERT(id > 0); 0208 Q_ASSERT(m_writeTrans); 0209 0210 DocumentIdDB contentIndexingDb(m_dbis.contentIndexingDbi, 
m_txn); 0211 contentIndexingDb.del(id); 0212 } 0213 0214 void Transaction::addFailed(quint64 id) 0215 { 0216 Q_ASSERT(m_txn); 0217 Q_ASSERT(id > 0); 0218 Q_ASSERT(m_writeTrans); 0219 0220 DocumentIdDB failedIdDb(m_dbis.failedIdDbi, m_txn); 0221 failedIdDb.put(id); 0222 } 0223 0224 void Transaction::addDocument(const Document& doc) 0225 { 0226 Q_ASSERT(m_txn); 0227 Q_ASSERT(doc.id() > 0); 0228 if (!m_writeTrans) { 0229 qCWarning(ENGINE) << "m_writeTrans is null"; 0230 return; 0231 } 0232 0233 m_writeTrans->addDocument(doc); 0234 } 0235 0236 void Transaction::removeDocument(quint64 id) 0237 { 0238 Q_ASSERT(m_txn); 0239 Q_ASSERT(id > 0); 0240 if (!m_writeTrans) { 0241 qCWarning(ENGINE) << "m_writeTrans is null"; 0242 return; 0243 } 0244 0245 m_writeTrans->removeDocument(id); 0246 } 0247 0248 void Transaction::removeRecursively(quint64 id) 0249 { 0250 Q_ASSERT(m_txn); 0251 Q_ASSERT(id > 0); 0252 if (!m_writeTrans) { 0253 qCWarning(ENGINE) << "m_writeTrans is null"; 0254 return; 0255 } 0256 0257 m_writeTrans->removeRecursively(id); 0258 } 0259 0260 void Transaction::replaceDocument(const Document& doc, DocumentOperations operations) 0261 { 0262 Q_ASSERT(m_txn); 0263 Q_ASSERT(doc.id() > 0); 0264 Q_ASSERT(m_writeTrans); 0265 if (!hasDocument(doc.id())) { 0266 qCDebug(ENGINE) << "Transaction::replaceDocument" << "Document does not exist"; 0267 } 0268 0269 if (!m_writeTrans) { 0270 qCWarning(ENGINE) << "m_writeTrans is null"; 0271 return; 0272 } 0273 0274 m_writeTrans->replaceDocument(doc, operations); 0275 } 0276 0277 bool Transaction::commit() 0278 { 0279 Q_ASSERT(m_txn); 0280 if (!m_writeTrans) { 0281 qCWarning(ENGINE) << "m_writeTrans is null"; 0282 return false; 0283 } 0284 0285 m_writeTrans->commit(); 0286 m_writeTrans.reset(); 0287 0288 int rc = mdb_txn_commit(m_txn); 0289 m_txn = nullptr; 0290 0291 if (rc) { 0292 qCWarning(ENGINE) << "Transaction::commit" << mdb_strerror(rc); 0293 return false; 0294 } 0295 0296 return true; 0297 } 0298 0299 void Transaction::abort() 
0300 { 0301 Q_ASSERT(m_txn); 0302 0303 mdb_txn_abort(m_txn); 0304 m_txn = nullptr; 0305 0306 m_writeTrans.reset(); 0307 } 0308 0309 // 0310 // Queries 0311 // 0312 0313 PostingIterator* Transaction::postingIterator(const EngineQuery& query) const 0314 { 0315 PostingDB postingDb(m_dbis.postingDbi, m_txn); 0316 PositionDB positionDb(m_dbis.positionDBi, m_txn); 0317 0318 if (query.leaf()) { 0319 if (query.op() == EngineQuery::Equal) { 0320 return postingDb.iter(query.term()); 0321 } else if (query.op() == EngineQuery::StartsWith) { 0322 return postingDb.prefixIter(query.term()); 0323 } else { 0324 Q_ASSERT(0); 0325 } 0326 } 0327 0328 const auto subQueries = query.subQueries(); 0329 if (subQueries.isEmpty()) { 0330 return nullptr; 0331 } 0332 0333 Q_ASSERT(query.op() == EngineQuery::Phrase); 0334 if (query.op() == EngineQuery::Phrase) { 0335 if (subQueries.size() == 1) { 0336 qCDebug(ENGINE) << "Degenerated Phrase with 1 Term:" << query; 0337 return postingIterator(subQueries[0]); 0338 } 0339 QVector<VectorPositionInfoIterator*> vec; 0340 vec.reserve(subQueries.size()); 0341 for (const EngineQuery& q : subQueries) { 0342 if (!q.leaf()) { 0343 qCDebug(ENGINE) << "Transaction::toPostingIterator" << "Phrase subqueries must be leafs"; 0344 continue; 0345 } 0346 auto termMatch = positionDb.iter(q.term()); 0347 if (!termMatch) { 0348 return nullptr; 0349 } 0350 vec << termMatch; 0351 } 0352 0353 return new PhraseAndIterator(vec); 0354 } 0355 0356 return nullptr; 0357 } 0358 0359 PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, qlonglong value, PostingDB::Comparator com) const 0360 { 0361 PostingDB postingDb(m_dbis.postingDbi, m_txn); 0362 return postingDb.compIter(prefix, value, com); 0363 } 0364 0365 PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, double value, PostingDB::Comparator com) const 0366 { 0367 PostingDB postingDb(m_dbis.postingDbi, m_txn); 0368 return postingDb.compIter(prefix, value, com); 0369 } 0370 0371 
PostingIterator* Transaction::postingCompIterator(const QByteArray& prefix, const QByteArray& value, PostingDB::Comparator com) const 0372 { 0373 PostingDB postingDb(m_dbis.postingDbi, m_txn); 0374 return postingDb.compIter(prefix, value, com); 0375 } 0376 0377 PostingIterator* Transaction::mTimeRangeIter(quint32 beginTime, quint32 endTime) const 0378 { 0379 MTimeDB mTimeDb(m_dbis.mtimeDbi, m_txn); 0380 return mTimeDb.iterRange(beginTime, endTime); 0381 } 0382 0383 PostingIterator* Transaction::docUrlIter(quint64 id) const 0384 { 0385 DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); 0386 return docUrlDb.iter(id); 0387 } 0388 0389 // 0390 // Introspection 0391 // 0392 0393 QVector<QByteArray> Transaction::documentTerms(quint64 docId) const 0394 { 0395 Q_ASSERT(docId); 0396 0397 DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn); 0398 return documentTermsDB.get(docId); 0399 } 0400 0401 QVector<QByteArray> Transaction::documentFileNameTerms(quint64 docId) const 0402 { 0403 Q_ASSERT(docId); 0404 0405 DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn); 0406 return documentFileNameTermsDB.get(docId); 0407 } 0408 0409 QVector<QByteArray> Transaction::documentXattrTerms(quint64 docId) const 0410 { 0411 Q_ASSERT(docId); 0412 0413 DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn); 0414 return documentXattrTermsDB.get(docId); 0415 } 0416 0417 // 0418 // File Size 0419 // 0420 static size_t dbiSize(MDB_txn* txn, MDB_dbi dbi) 0421 { 0422 MDB_stat stat; 0423 mdb_stat(txn, dbi, &stat); 0424 0425 return (stat.ms_branch_pages + stat.ms_leaf_pages + stat.ms_overflow_pages) * stat.ms_psize; 0426 } 0427 0428 DatabaseSize Transaction::dbSize() 0429 { 0430 DatabaseSize dbSize; 0431 dbSize.postingDb = dbiSize(m_txn, m_dbis.postingDbi); 0432 dbSize.positionDb = dbiSize(m_txn, m_dbis.positionDBi); 0433 dbSize.docTerms = dbiSize(m_txn, m_dbis.docTermsDbi); 0434 dbSize.docFilenameTerms = dbiSize(m_txn, m_dbis.docFilenameTermsDbi); 
0435 dbSize.docXattrTerms = dbiSize(m_txn, m_dbis.docXattrTermsDbi); 0436 0437 dbSize.idTree = dbiSize(m_txn, m_dbis.idTreeDbi); 0438 dbSize.idFilename = dbiSize(m_txn, m_dbis.idFilenameDbi); 0439 0440 dbSize.docTime = dbiSize(m_txn, m_dbis.docTimeDbi); 0441 dbSize.docData = dbiSize(m_txn, m_dbis.docDataDbi); 0442 0443 dbSize.contentIndexingIds = dbiSize(m_txn, m_dbis.contentIndexingDbi); 0444 dbSize.failedIds = dbiSize(m_txn, m_dbis.failedIdDbi); 0445 0446 dbSize.mtimeDb = dbiSize(m_txn, m_dbis.mtimeDbi); 0447 0448 dbSize.expectedSize = dbSize.postingDb + dbSize.positionDb + dbSize.docTerms + dbSize.docFilenameTerms 0449 + dbSize.docXattrTerms + dbSize.idTree + dbSize.idFilename + dbSize.docTime 0450 + dbSize.docData + dbSize.contentIndexingIds + dbSize.failedIds + dbSize.mtimeDb; 0451 0452 MDB_envinfo info; 0453 mdb_env_info(m_env, &info); 0454 dbSize.actualSize = info.me_last_pgno * 4096; // TODO: separate page size 0455 0456 return dbSize; 0457 } 0458 0459 // 0460 // Debugging 0461 // 0462 void Transaction::checkFsTree() 0463 { 0464 DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn); 0465 DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn); 0466 DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn); 0467 DocumentUrlDB docUrlDb(m_dbis.idTreeDbi, m_dbis.idFilenameDbi, m_txn); 0468 PostingDB postingDb(m_dbis.postingDbi, m_txn); 0469 0470 const auto map = postingDb.toTestMap(); 0471 0472 QSet<quint64> allIds; 0473 for (const auto& list : map) { 0474 for (quint64 id : list) { 0475 allIds << id; 0476 } 0477 } 0478 0479 std::cout << "Total Document IDs: " << allIds.size() << std::endl; 0480 0481 int count = 0; 0482 for (quint64 id: std::as_const(allIds)) { 0483 QByteArray url = docUrlDb.get(id); 0484 if (url.isEmpty()) { 0485 auto terms = documentTermsDB.get(id); 0486 auto fileNameTerms = documentFileNameTermsDB.get(id); 0487 auto xAttrTerms = documentXattrTermsDB.get(id); 0488 0489 // Lets reverse engineer the terms 0490 
QList<QByteArray> newTerms; 0491 QMapIterator<QByteArray, PostingList> it(map); 0492 while (it.hasNext()) { 0493 it.next(); 0494 if (it.value().contains(id)) { 0495 newTerms << it.key(); 0496 } 0497 } 0498 0499 std::cout << "Missing filePath for " << id << std::endl; 0500 std::cout << "\tPostingDB Terms: "; 0501 for (const QByteArray& term : std::as_const(newTerms)) { 0502 std::cout << qPrintable(QString::fromUtf8(term)) << " "; 0503 } 0504 std::cout << std::endl; 0505 0506 std::cout << "\tDocumentTermsDB: "; 0507 for (const QByteArray& term : terms) { 0508 std::cout << qPrintable(QString::fromUtf8(term)) << " "; 0509 } 0510 std::cout << std::endl; 0511 0512 std::cout << "\tFileNameTermsDB: "; 0513 for (const QByteArray& term : fileNameTerms) { 0514 std::cout << qPrintable(QString::fromUtf8(term)) << " "; 0515 } 0516 std::cout << std::endl; 0517 0518 std::cout << "\tXAttrTermsDB: "; 0519 for (const QByteArray& term : xAttrTerms) { 0520 std::cout << qPrintable(QString::fromUtf8(term)) << " "; 0521 } 0522 std::cout << std::endl; 0523 0524 count++; 0525 } else if (!QFileInfo::exists(QString::fromUtf8(url))) { 0526 std::cout << "FilePath " << qPrintable(QString::fromUtf8(url)) << " for " << id << " does not exist"<< std::endl; 0527 count++; 0528 } 0529 } 0530 0531 std::cout << "Invalid Entries: " << count << " (" << count * 100.0 / allIds.size() << "%)" << std::endl; 0532 } 0533 0534 void Transaction::checkTermsDbinPostingDb() 0535 { 0536 DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn); 0537 DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn); 0538 DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn); 0539 PostingDB postingDb(m_dbis.postingDbi, m_txn); 0540 0541 // Iterate over each document, and fetch all terms 0542 // check if each term maps to its own id in the posting db 0543 0544 const auto map = postingDb.toTestMap(); 0545 0546 QSet<quint64> allIds; 0547 for (const auto& list : map) { 0548 for (quint64 id : list) { 0549 allIds 
<< id; 0550 } 0551 } 0552 0553 std::cout << "PostingDB check .." << std::endl; 0554 for (quint64 id : std::as_const(allIds)) { 0555 QVector<QByteArray> terms = documentTermsDB.get(id); 0556 terms += documentXattrTermsDB.get(id); 0557 terms += documentFileNameTermsDB.get(id); 0558 0559 for (const QByteArray& term : std::as_const(terms)) { 0560 PostingList plist = postingDb.get(term); 0561 if (!plist.contains(id)) { 0562 std::cout << id << " is missing term " << qPrintable(QString::fromUtf8(term)) << std::endl; 0563 } 0564 } 0565 } 0566 } 0567 0568 void Transaction::checkPostingDbinTermsDb() 0569 { 0570 DocumentDB documentTermsDB(m_dbis.docTermsDbi, m_txn); 0571 DocumentDB documentXattrTermsDB(m_dbis.docXattrTermsDbi, m_txn); 0572 DocumentDB documentFileNameTermsDB(m_dbis.docFilenameTermsDbi, m_txn); 0573 PostingDB postingDb(m_dbis.postingDbi, m_txn); 0574 0575 QMap<QByteArray, PostingList> map = postingDb.toTestMap(); 0576 QMapIterator<QByteArray, PostingList> it(map); 0577 0578 std::cout << "DocumentTermsDB check .." << std::endl; 0579 while (it.hasNext()) { 0580 it.next(); 0581 0582 const QByteArray& term = it.key(); 0583 const PostingList& list = it.value(); 0584 for (quint64 id : list) { 0585 if (documentTermsDB.get(id).contains(term)) { 0586 continue; 0587 } 0588 if (documentFileNameTermsDB.get(id).contains(term)) { 0589 continue; 0590 } 0591 if (documentXattrTermsDB.get(id).contains(term)) { 0592 continue; 0593 } 0594 std::cout << id << " is missing " << qPrintable(QString::fromUtf8(term)) << " from document terms db" << std::endl; 0595 } 0596 } 0597 }