File indexing completed on 2024-04-28 03:51:42

0001 /*
0002     This file is part of the KDE Baloo project.
0003     SPDX-FileCopyrightText: 2018 Michael Heidelbach <ottwolt@gmail.com>
0004 
0005     SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
0006 */
0007 
0008 #include "databasesanitizer.h"
0009 #include "documenturldb.h"
0010 #include "idutils.h"
0011 
0012 #include <sys/sysmacros.h>
0013 
0014 #include <KLocalizedString>
0015 #include <QFileInfo>
0016 #include <QStorageInfo>
0017 #include <QDebug>
0018 
0019 namespace Baloo
0020 {
0021 
0022 class DatabaseSanitizerImpl {
0023 public:
0024     DatabaseSanitizerImpl(const Database& db, Transaction::TransactionType type)
0025     : m_transaction(new Transaction(db, type))
0026     {
0027     }
0028 
0029 public:
0030 
0031     /**
0032     * \brief Basic info about database items
0033     */
0034     struct FileInfo {
0035         quint32 deviceId = 0;
0036         quint32 inode = 0;
0037         quint64 id = 0;
0038         bool isSymLink = false;
0039         bool accessible = true;
0040         QString url;
0041     };
0042 
0043     void printProgress(QTextStream& out, uint& cur, const uint max, const uint step) const
0044     {
0045         if (cur % step == 0) {
0046             out << QStringLiteral("%1%2\r").arg(100 * cur / max, 6).arg("%", -16);
0047             out.flush();
0048         }
0049         cur++;
0050     }
0051 
0052     /**
0053      * Summary of createList() actions
0054      */
0055     struct Summary {
0056         quint64 total = 0;  ///Count of all files
0057         quint64 ignored = 0;      ///Count of filtered out files
0058         quint64 accessible = 0;   ///Count of checked and accessible files
0059     };
0060     /**
0061     * Create a list of \a FileInfo items.
0062     *
0063     * \p deviceIDs filter by device ids. If the vector is empty no filtering is done
0064     * and every item is collected.
0065     * Positive numbers are including filters collecting only the mentioned device ids.
0066     * Negative numbers are excluding filters collecting everything but the mentioned device ids.
0067     *
0068     * \p accessFilter Flags to filter items by accessibility.
0069     *
0070     * \p urlFilter Filter result urls. Default is null = Collect everything.
0071     */
0072     QPair<QVector<FileInfo>, Summary> createList(
0073         const QVector<qint64>& deviceIds,
0074         const DatabaseSanitizer::ItemAccessFilters accessFilter,
0075         const QSharedPointer<QRegularExpression>& urlFilter
0076     ) const
0077     {
0078         Q_ASSERT(m_transaction);
0079 
0080         const auto docUrlDb = DocumentUrlDB(m_transaction->m_dbis.idTreeDbi,
0081                                             m_transaction->m_dbis.idFilenameDbi,
0082                                             m_transaction->m_txn);
0083         const auto map = docUrlDb.toTestMap();
0084         const auto keys = map.keys();
0085         QVector<FileInfo> result;
0086         uint max = map.size();
0087         uint i = 0;
0088         result.reserve(max);
0089         QVector<quint32> includeIds;
0090         QVector<quint32> excludeIds;
0091         for (qint64 deviceId : deviceIds) {
0092             if (deviceId > 0) {
0093                 includeIds.append(deviceId);
0094             } else if (deviceId < 0) {
0095                 excludeIds.append(-deviceId);
0096             }
0097         }
0098         Summary summary;
0099         summary.total = max;
0100         summary.ignored = max;
0101         QTextStream err(stderr);
0102 
0103         for (auto it = map.constBegin(), end = map.constEnd(); it != end; it++) {
0104             printProgress(err, i, max, 100);
0105             const quint64 id = it.key();
0106             const quint32 deviceId = idToDeviceId(id);
0107             if (!includeIds.isEmpty() && !includeIds.contains(deviceId)) {
0108                 continue;
0109             } else if (excludeIds.contains(deviceId)) {
0110                 continue;
0111             } else if (urlFilter && !urlFilter->match(it.value()).hasMatch()) {
0112                 continue;
0113             }
0114 
0115             FileInfo info;
0116             info.deviceId = deviceId;
0117             info.inode = idToInode(id);
0118             info.url = QFile::decodeName(it.value());
0119             info.id = id;
0120             QFileInfo fileInfo(info.url);
0121             info.accessible = !info.url.isEmpty() && fileInfo.exists();
0122 
0123             if (info.accessible && (accessFilter & DatabaseSanitizer::IgnoreAvailable)) {
0124                 continue;
0125             } else if (!info.accessible && (accessFilter & DatabaseSanitizer::IgnoreUnavailable)) {
0126                 continue;
0127             }
0128 
0129             info.isSymLink = fileInfo.isSymLink();
0130 
0131             result.append(info);
0132             summary.ignored--;
0133             if (info.accessible) {
0134                 summary.accessible++;
0135             }
0136         }
0137         return {result, summary};
0138     }
0139 
0140     QStorageInfo getStorageInfo(const quint32 id) {
0141         static QMap<quint32, QStorageInfo> storageInfos = []() {
0142             QMap<quint32, QStorageInfo> result;
0143             const auto volumes = QStorageInfo::mountedVolumes();
0144             for (const auto& vol : volumes) {
0145                 const QByteArray rootPath = QFile::encodeName(vol.rootPath());
0146                 const auto id = filePathToId(rootPath);
0147                 const quint32 deviceId = idToDeviceId(id);
0148                 // qDebug() << vol;
0149                 result[deviceId] = vol;
0150             }
0151             return result;
0152         }();
0153 
0154         QStorageInfo info = storageInfos.value(id);
0155         return info;
0156     }
0157 
0158     QMap<quint32, bool> deviceFilters(QVector<FileInfo>& infos, const DatabaseSanitizer::ItemAccessFilters accessFilter)
0159     {
0160         QMap<quint32, bool> result;
0161         for (const auto& info : infos) {
0162             result[info.deviceId] = false;
0163         }
0164 
0165         for (auto it = result.begin(), end = result.end(); it != end; it++) {
0166             const auto storageInfo = getStorageInfo(it.key());
0167             it.value() = isIgnored(storageInfo, accessFilter);
0168         }
0169         return result;
0170     }
0171 
0172     bool isIgnored(const QStorageInfo& storageInfo, const DatabaseSanitizer::ItemAccessFilters accessFilter)
0173     {
0174         const bool mounted = storageInfo.isValid();
0175         if (mounted && (accessFilter & DatabaseSanitizer::IgnoreMounted)) {
0176             return true;
0177         } else if (!mounted && (accessFilter & DatabaseSanitizer::IgnoreUnmounted)) {
0178             return true;
0179         }
0180 
0181         if (storageInfo.fileSystemType() == QLatin1String("tmpfs")) {
0182             // Due to the volatility of device ids, an id known by baloo may
0183             // appear as mounted, but is not what baloo expects.
0184             // For example at indexing time 43 was the id of a smb share, but
0185             // at runtime 43 is the id of /run/media/<uid> when other users are
0186             // logged in. The latter have a type of 'tmpfs' and should be ignored.
0187             return true;
0188         }
0189 
0190         return false;
0191     }
0192 
0193     void removeDocument(const quint64 id) {
0194         m_transaction->removeDocument(id);
0195     }
0196 
0197     void commit() {
0198         m_transaction->commit();
0199     }
0200 
0201     void abort() {
0202         m_transaction->abort();
0203     }
0204 
0205 private:
0206     Transaction* m_transaction;
0207 };
0208 }
0209 
0210 using namespace Baloo;
0211 
0212 DatabaseSanitizer::DatabaseSanitizer(const Database& db, Baloo::Transaction::TransactionType type)
0213     : m_pimpl(new DatabaseSanitizerImpl(db, type))
0214 {
0215 }
0216 
0217 DatabaseSanitizer::DatabaseSanitizer(Database* db, Transaction::TransactionType type)
0218     : DatabaseSanitizer(*db, type)
0219 {
0220 }
0221 
0222 DatabaseSanitizer::~DatabaseSanitizer()
0223 {
0224     delete m_pimpl;
0225     m_pimpl = nullptr;
0226 }
0227 
0228 /**
0229 * Create a list of \a FileInfo items and print it to stdout.
0230 *
0231 * \p deviceIDs filter by device ids. If the vector is empty no filtering is done
0232 * and everything is printed.
0233 * Positive numbers are including filters printing only the mentioned device ids.
0234 * Negative numbers are excluding filters printing everything but the mentioned device ids.
0235 *
0236 * \p missingOnly Simulate purging operation. Only inaccessible items are printed.
0237 *
0238 * \p urlFilter Filter result urls. Default is null = Print everything.
0239 */
0240  void DatabaseSanitizer::printList(
0241     const QVector<qint64>& deviceIds,
0242     const ItemAccessFilters accessFilter,
0243     const QSharedPointer<QRegularExpression>& urlFilter)
0244 {
0245     auto listResult = m_pimpl->createList(deviceIds, accessFilter, urlFilter);
0246     const auto sep = QLatin1Char(' ');
0247     QTextStream out(stdout);
0248     QTextStream err(stderr);
0249     for (const auto& info: listResult.first) {
0250         out << QStringLiteral("%1").arg(info.accessible ? "+" : "!")
0251         << sep << QStringLiteral("device: %1").arg(info.deviceId)
0252         << sep << QStringLiteral("inode: %1").arg(info.inode)
0253         << sep << QStringLiteral("url: %1").arg(info.url)
0254         << endl;
0255     }
0256 
0257     const auto& summary = listResult.second;
0258     if (accessFilter & IgnoreAvailable) {
0259         err << i18n("Total: %1, Inaccessible: %2",
0260                     summary.total,
0261                     summary.total - (summary.ignored + summary.accessible)) << endl;
0262     } else {
0263         err << i18n("Total: %1, Ignored: %2, Accessible: %3, Inaccessible: %4",
0264                     summary.total,
0265                     summary.ignored,
0266                     summary.accessible,
0267                     summary.total - (summary.ignored + summary.accessible)) << endl;
0268     }
0269 }
0270 
0271 void DatabaseSanitizer::printDevices(const QVector<qint64>& deviceIds, const ItemAccessFilters accessFilter)
0272 {
0273     auto infos = m_pimpl->createList(deviceIds, accessFilter, nullptr);
0274 
0275     QMap<quint32, quint64> useCount;
0276     for (const auto& info : infos.first) {
0277         useCount[info.deviceId]++;
0278     }
0279 
0280     const auto sep = QLatin1Char(' ');
0281     QTextStream out(stdout);
0282     QTextStream err(stderr);
0283     int matchCount = 0;
0284     for (auto it = useCount.cbegin(); it != useCount.cend(); it++) {
0285         auto id = it.key();
0286         auto info = m_pimpl->getStorageInfo(id);
0287         auto mounted = info.isValid();
0288         if (info.fileSystemType() == QLatin1String("tmpfs")) {
0289             continue;
0290         } else if (mounted && (accessFilter & IgnoreMounted)) {
0291             continue;
0292         } else if (!mounted && (accessFilter & IgnoreUnmounted)) {
0293             continue;
0294         }
0295         matchCount++;
0296         // TODO coloring would be nice, but "...|grep '^!'" does not work with it.
0297         // out << QStringLiteral("%1").arg(dev.mounted ? "+" : "\033[1;31m!")
0298         // Can be done, see: https://code.qt.io/cgit/qt/qtbase.git/tree/src/corelib/global/qlogging.cpp#n263
0299         out << QStringLiteral("%1").arg(mounted ? "+" : "!")
0300             << sep << QStringLiteral("device:%1").arg(id)
0301             << sep << QStringLiteral("[%1:%2]")
0302                 .arg(major(id), 4, 16, QLatin1Char('0'))
0303                 .arg(minor(id), 4, 16, QLatin1Char('0'))
0304             << sep << QStringLiteral("indexed-items:%1").arg(it.value());
0305 
0306         if (mounted) {
0307             out
0308                 << sep << QStringLiteral("fstype:%1").arg(info.fileSystemType().toPercentEncoding().constData())
0309                 << sep << QStringLiteral("device:%1").arg(info.device().constData())
0310                 << sep << QStringLiteral("path:%1").arg(info.rootPath())
0311             ;
0312         }
0313         // TODO: see above
0314         // out << QStringLiteral("\033[0m") << endl;
0315         out << endl;
0316     }
0317 
0318     err << i18n("Found %1 matching in %2 devices", matchCount, useCount.size()) << endl;
0319 }
0320 
0321 void DatabaseSanitizer::removeStaleEntries(const QVector<qint64>& deviceIds,
0322     const DatabaseSanitizer::ItemAccessFilters accessFilter,
0323     const bool dryRun,
0324     const QSharedPointer<QRegularExpression>& urlFilter)
0325 {
0326     auto listResult = m_pimpl->createList(deviceIds, IgnoreAvailable, urlFilter);
0327 
0328     const auto ignoredDevices = m_pimpl->deviceFilters(listResult.first, accessFilter);
0329 
0330     const auto sep = QLatin1Char(' ');
0331     auto& summary = listResult.second;
0332     QTextStream out(stdout);
0333     QTextStream err(stderr);
0334     for (const auto& info: listResult.first) {
0335         if (ignoredDevices[info.deviceId] == true) {
0336             summary.ignored++;
0337         } else {
0338             if (info.isSymLink) {
0339                 out << i18n("IgnoredSymbolicLink:");
0340                 summary.ignored++;
0341             } else {
0342                 m_pimpl->removeDocument(info.id);
0343                 out << i18n("Removing:");
0344             }
0345             out << sep << QStringLiteral("device: %1").arg(info.deviceId)
0346                 << sep << QStringLiteral("inode: %1").arg(info.inode)
0347                 << sep << QStringLiteral("url: %1").arg(info.url)
0348                 << endl;
0349         }
0350     }
0351     if (dryRun) {
0352         m_pimpl->abort();
0353     } else {
0354         m_pimpl->commit();
0355     }
0356     Q_ASSERT(summary.accessible == 0);
0357     err << i18nc("numbers", "Removed: %1, Total: %2, Ignored: %3",
0358                  summary.total - summary.ignored,
0359                  summary.total,
0360                  summary.ignored)
0361         << endl;
0362 }