File indexing completed on 2024-04-28 03:51:47
0001 /* 0002 This file is part of the KDE Baloo Project 0003 SPDX-FileCopyrightText: 2013-2015 Vishesh Handa <vhanda@kde.org> 0004 0005 SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL 0006 */ 0007 0008 #include "app.h" 0009 #include "basicindexingjob.h" 0010 #include "result.h" 0011 #include "idutils.h" 0012 #include "transaction.h" 0013 #include "baloodebug.h" 0014 #include "global.h" 0015 0016 #include <QCoreApplication> 0017 0018 #include <QTimer> 0019 #include <QFileInfo> 0020 0021 #include <KFileMetaData/Extractor> 0022 #include <KFileMetaData/MimeUtils> 0023 #include <KIdleTime> 0024 0025 #include <unistd.h> //for STDIN_FILENO 0026 #include <iostream> 0027 0028 using namespace Baloo; 0029 0030 App::App(QObject* parent) 0031 : QObject(parent) 0032 , m_notifyNewData(STDIN_FILENO, QSocketNotifier::Read) 0033 , m_input() 0034 , m_output() 0035 , m_workerPipe(&m_input, &m_output) 0036 , m_tr(nullptr) 0037 { 0038 m_input.open(STDIN_FILENO, QIODevice::ReadOnly | QIODevice::Unbuffered ); 0039 m_output.open(STDOUT_FILENO, QIODevice::WriteOnly | QIODevice::Unbuffered ); 0040 0041 static int s_idleTimeout = 1000 * 60 * 1; // 1 min 0042 m_idleTime = KIdleTime::instance(); 0043 m_idleTime->addIdleTimeout(s_idleTimeout); 0044 connect(m_idleTime, &KIdleTime::resumingFromIdle, this, [this]() { 0045 qCInfo(BALOO) << "Busy, paced indexing"; 0046 m_isBusy = true; 0047 }); 0048 connect(m_idleTime, qOverload<int, int>(&KIdleTime::timeoutReached), this, [this]() { 0049 qCInfo(BALOO) << "Not busy, fast indexing"; 0050 m_isBusy = false; 0051 }); 0052 0053 using WorkerPipe = Baloo::Private::WorkerPipe; 0054 connect(&m_notifyNewData, &QSocketNotifier::activated, &m_workerPipe, &WorkerPipe::processIdData); 0055 connect(&m_workerPipe, &WorkerPipe::newDocumentIds, this, &App::slotNewBatch); 0056 connect(&m_workerPipe, &WorkerPipe::inputEnd, this, &QCoreApplication::quit); 0057 } 0058 0059 void App::slotNewBatch(const QVector<quint64>& ids) 0060 { 0061 m_ids = ids; 0062 0063 Database *db = globalDatabaseInstance(); 0064 if (!db->open(Database::ReadWriteDatabase)) { 0065 qCCritical(BALOO) << "Failed to open the database"; 0066 exit(1); 0067 } 0068 0069 Q_ASSERT(m_tr == nullptr); 0070 0071 if (!m_isBusy) { 0072 m_idleTime->catchNextResumeEvent(); 0073 } 0074 0075 QTimer::singleShot((m_isBusy ? 500 : 0), this, [this, db] () { 0076 // FIXME: The transaction is open for way too long. We should just open it for when we're 0077 // committing the data not during the extraction. 0078 m_tr = new Transaction(db, Transaction::ReadWrite); 0079 processNextFile(); 0080 }); 0081 0082 /** 0083 * A Single Batch seems to be triggering the SocketNotifier more than once 0084 * so we disable it till the batch is done. 0085 */ 0086 m_notifyNewData.setEnabled(false); 0087 } 0088 0089 void App::processNextFile() 0090 { 0091 if (!m_ids.isEmpty()) { 0092 quint64 id = m_ids.takeFirst(); 0093 0094 QString url = QFile::decodeName(m_tr->documentUrl(id)); 0095 if (url.isEmpty() || !QFile::exists(url)) { 0096 m_tr->removeDocument(id); 0097 QTimer::singleShot(0, this, &App::processNextFile); 0098 return; 0099 } 0100 0101 bool indexed = index(m_tr, url, id); 0102 0103 int delay = (m_isBusy && indexed) ? 10 : 0; 0104 QTimer::singleShot(delay, this, &App::processNextFile); 0105 0106 } else { 0107 bool ok = m_tr->commit(); 0108 if (!ok) { 0109 exit(2); 0110 } 0111 delete m_tr; 0112 m_tr = nullptr; 0113 0114 // Enable the SocketNotifier for the next batch 0115 m_notifyNewData.setEnabled(true); 0116 m_workerPipe.batchFinished(); 0117 } 0118 } 0119 0120 bool App::index(Transaction* tr, const QString& url, quint64 id) 0121 { 0122 if (!m_config.shouldBeIndexed(url)) { 0123 // This apparently happens when the config has changed after the document 0124 // was added to the content indexing db 0125 qCDebug(BALOO) << "Found" << url << "in the ContentIndexingDB, although it should be skipped"; 0126 tr->removeDocument(id); 0127 m_workerPipe.urlFailed(url); 0128 return false; 0129 } 0130 0131 // The initial BasicIndexingJob run has been supplied with the file extension 0132 // mimetype only, skip based on the "real" mimetype 0133 QString mimetype = KFileMetaData::MimeUtils::strictMimeType(url, m_mimeDb).name(); 0134 if (!m_config.shouldMimeTypeBeIndexed(mimetype)) { 0135 qCDebug(BALOO) << "Skipping" << url << "- mimetype:" << mimetype; 0136 // FIXME: in case the extension based and content based mimetype differ 0137 // we should update it. 0138 tr->removePhaseOne(id); 0139 m_workerPipe.urlFailed(url); 0140 return false; 0141 } 0142 0143 // HACK: Also, we're ignoring ttext files which are greater tha 10 Mb as we 0144 // have trouble processing them 0145 // 0146 if (mimetype.startsWith(QLatin1String("text/"))) { 0147 QFileInfo fileInfo(url); 0148 if (fileInfo.size() >= 10 * 1024 * 1024) { 0149 qCDebug(BALOO) << "Skipping large " << url << "- mimetype:" << mimetype; 0150 tr->removePhaseOne(id); 0151 m_workerPipe.urlFailed(url); 0152 return false; 0153 } 0154 } 0155 qCDebug(BALOO) << "Indexing" << id << url << mimetype; 0156 m_workerPipe.urlStarted(url); 0157 0158 // We always run the basic indexing again. This is mostly so that the proper 0159 // mimetype is set and we get proper type information. 0160 // The mimetype fetched in the BasicIndexingJob is fast but not accurate 0161 BasicIndexingJob basicIndexer(url, mimetype, BasicIndexingJob::NoLevel); 0162 basicIndexer.index(); 0163 0164 Baloo::Document doc = basicIndexer.document(); 0165 0166 Result result(url, mimetype, KFileMetaData::ExtractionResult::ExtractMetaData | KFileMetaData::ExtractionResult::ExtractPlainText); 0167 result.setDocument(doc); 0168 0169 const QList<KFileMetaData::Extractor*> exList = m_extractorCollection.fetchExtractors(mimetype); 0170 0171 for (KFileMetaData::Extractor* ex : exList) { 0172 ex->extract(&result); 0173 } 0174 0175 result.finish(); 0176 if (doc.id() != id) { 0177 qCWarning(BALOO) << url << "id seems to have changed. Perhaps baloo was not running, and this file was deleted + re-created"; 0178 tr->removeDocument(id); 0179 if (!tr->hasDocument(doc.id())) { 0180 tr->addDocument(result.document()); 0181 } else { 0182 tr->replaceDocument(result.document(), DocumentTerms | DocumentData); 0183 } 0184 } else { 0185 tr->replaceDocument(result.document(), DocumentTerms | DocumentData); 0186 } 0187 tr->removePhaseOne(doc.id()); 0188 m_workerPipe.urlFinished(url); 0189 return true; 0190 } 0191 0192 #include "moc_app.cpp"