File indexing completed on 2024-04-28 03:51:47

0001 /*
0002     This file is part of the KDE Baloo Project
0003     SPDX-FileCopyrightText: 2013-2015 Vishesh Handa <vhanda@kde.org>
0004 
0005     SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL
0006 */
0007 
0008 #include "app.h"
0009 #include "basicindexingjob.h"
0010 #include "result.h"
0011 #include "idutils.h"
0012 #include "transaction.h"
0013 #include "baloodebug.h"
0014 #include "global.h"
0015 
0016 #include <QCoreApplication>
0017 
0018 #include <QTimer>
0019 #include <QFileInfo>
0020 
0021 #include <KFileMetaData/Extractor>
0022 #include <KFileMetaData/MimeUtils>
0023 #include <KIdleTime>
0024 
0025 #include <unistd.h> //for STDIN_FILENO
0026 #include <iostream>
0027 
0028 using namespace Baloo;
0029 
0030 App::App(QObject* parent)
0031     : QObject(parent)
0032     , m_notifyNewData(STDIN_FILENO, QSocketNotifier::Read)
0033     , m_input()
0034     , m_output()
0035     , m_workerPipe(&m_input, &m_output)
0036     , m_tr(nullptr)
0037 {
0038     m_input.open(STDIN_FILENO, QIODevice::ReadOnly | QIODevice::Unbuffered );
0039     m_output.open(STDOUT_FILENO, QIODevice::WriteOnly | QIODevice::Unbuffered );
0040 
0041     static int s_idleTimeout = 1000 * 60 * 1; // 1 min
0042     m_idleTime = KIdleTime::instance();
0043     m_idleTime->addIdleTimeout(s_idleTimeout);
0044     connect(m_idleTime, &KIdleTime::resumingFromIdle, this, [this]() {
0045         qCInfo(BALOO) << "Busy, paced indexing";
0046         m_isBusy = true;
0047     });
0048     connect(m_idleTime, qOverload<int, int>(&KIdleTime::timeoutReached), this, [this]() {
0049         qCInfo(BALOO) << "Not busy, fast indexing";
0050         m_isBusy = false;
0051     });
0052 
0053     using WorkerPipe = Baloo::Private::WorkerPipe;
0054     connect(&m_notifyNewData, &QSocketNotifier::activated, &m_workerPipe, &WorkerPipe::processIdData);
0055     connect(&m_workerPipe, &WorkerPipe::newDocumentIds, this, &App::slotNewBatch);
0056     connect(&m_workerPipe, &WorkerPipe::inputEnd, this, &QCoreApplication::quit);
0057 }
0058 
0059 void App::slotNewBatch(const QVector<quint64>& ids)
0060 {
0061     m_ids = ids;
0062 
0063     Database *db = globalDatabaseInstance();
0064     if (!db->open(Database::ReadWriteDatabase)) {
0065         qCCritical(BALOO) << "Failed to open the database";
0066         exit(1);
0067     }
0068 
0069     Q_ASSERT(m_tr == nullptr);
0070 
0071     if (!m_isBusy) {
0072         m_idleTime->catchNextResumeEvent();
0073     }
0074 
0075     QTimer::singleShot((m_isBusy ? 500 : 0), this, [this, db] () {
0076         // FIXME: The transaction is open for way too long. We should just open it for when we're
0077         //        committing the data not during the extraction.
0078         m_tr = new Transaction(db, Transaction::ReadWrite);
0079         processNextFile();
0080     });
0081 
0082     /**
0083      * A Single Batch seems to be triggering the SocketNotifier more than once
0084      * so we disable it till the batch is done.
0085      */
0086     m_notifyNewData.setEnabled(false);
0087 }
0088 
0089 void App::processNextFile()
0090 {
0091     if (!m_ids.isEmpty()) {
0092         quint64 id = m_ids.takeFirst();
0093 
0094         QString url = QFile::decodeName(m_tr->documentUrl(id));
0095         if (url.isEmpty() || !QFile::exists(url)) {
0096             m_tr->removeDocument(id);
0097             QTimer::singleShot(0, this, &App::processNextFile);
0098             return;
0099         }
0100 
0101         bool indexed = index(m_tr, url, id);
0102 
0103         int delay = (m_isBusy && indexed) ? 10 : 0;
0104         QTimer::singleShot(delay, this, &App::processNextFile);
0105 
0106     } else {
0107         bool ok = m_tr->commit();
0108         if (!ok) {
0109             exit(2);
0110         }
0111         delete m_tr;
0112         m_tr = nullptr;
0113 
0114         // Enable the SocketNotifier for the next batch
0115         m_notifyNewData.setEnabled(true);
0116         m_workerPipe.batchFinished();
0117     }
0118 }
0119 
0120 bool App::index(Transaction* tr, const QString& url, quint64 id)
0121 {
0122     if (!m_config.shouldBeIndexed(url)) {
0123         // This apparently happens when the config has changed after the document
0124         // was added to the content indexing db
0125         qCDebug(BALOO) << "Found" << url << "in the ContentIndexingDB, although it should be skipped";
0126         tr->removeDocument(id);
0127         m_workerPipe.urlFailed(url);
0128         return false;
0129     }
0130 
0131     // The initial BasicIndexingJob run has been supplied with the file extension
0132     // mimetype only, skip based on the "real" mimetype
0133     QString mimetype = KFileMetaData::MimeUtils::strictMimeType(url, m_mimeDb).name();
0134     if (!m_config.shouldMimeTypeBeIndexed(mimetype)) {
0135         qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype;
0136         // FIXME: in case the extension based and content based mimetype differ
0137         // we should update it.
0138         tr->removePhaseOne(id);
0139         m_workerPipe.urlFailed(url);
0140         return false;
0141     }
0142 
0143     // HACK: Also, we're ignoring ttext files which are greater tha 10 Mb as we
0144     // have trouble processing them
0145     //
0146     if (mimetype.startsWith(QLatin1String("text/"))) {
0147         QFileInfo fileInfo(url);
0148         if (fileInfo.size() >= 10 * 1024 * 1024) {
0149             qCDebug(BALOO) << "Skipping large " << url << "- mimetype:" << mimetype;
0150             tr->removePhaseOne(id);
0151             m_workerPipe.urlFailed(url);
0152             return false;
0153         }
0154     }
0155     qCDebug(BALOO) << "Indexing" << id << url << mimetype;
0156     m_workerPipe.urlStarted(url);
0157 
0158     // We always run the basic indexing again. This is mostly so that the proper
0159     // mimetype is set and we get proper type information.
0160     // The mimetype fetched in the BasicIndexingJob is fast but not accurate
0161     BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel);
0162     basicIndexer.index();
0163 
0164     Baloo::Document doc = basicIndexer.document();
0165 
0166     Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractMetaData | KFileMetaData::ExtractionResult::ExtractPlainText);
0167     result.setDocument(doc);
0168 
0169     const QList<KFileMetaData::Extractor*> exList = m_extractorCollection.fetchExtractors(mimetype);
0170 
0171     for (KFileMetaData::Extractor* ex : exList) {
0172         ex->extract(&result);
0173     }
0174 
0175     result.finish();
0176     if (doc.id() != id) {
0177         qCWarning(BALOO) << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created";
0178         tr->removeDocument(id);
0179         if (!tr->hasDocument(doc.id())) {
0180             tr->addDocument(result.document());
0181         } else {
0182             tr->replaceDocument(result.document(), DocumentTerms | DocumentData);
0183         }
0184     } else {
0185         tr->replaceDocument(result.document(), DocumentTerms | DocumentData);
0186     }
0187     tr->removePhaseOne(doc.id());
0188     m_workerPipe.urlFinished(url);
0189     return true;
0190 }
0191 
0192 #include "moc_app.cpp"