File indexing completed on 2024-10-06 09:41:50
0001 /* 0002 This file is part of the KDE libraries 0003 SPDX-FileCopyrightText: 1999 Waldo Bastian <bastian@kde.org> 0004 0005 SPDX-License-Identifier: LGPL-2.0-only 0006 */ 0007 0008 #include "ksycoca.h" 0009 #include "ksycocadict_p.h" 0010 #include "ksycocaentry.h" 0011 #include "sycocadebug.h" 0012 #include <kservice.h> 0013 0014 #include <QBitArray> 0015 #include <QIODevice> 0016 #include <QVector> 0017 0018 namespace 0019 { 0020 struct string_entry { 0021 string_entry(const QString &_key, const KSycocaEntry::Ptr &_payload) 0022 : hash(0) 0023 , length(_key.length()) 0024 , keyStr(_key) 0025 , key(keyStr.unicode()) 0026 , payload(_payload) 0027 { 0028 } 0029 uint hash; 0030 const int length; 0031 const QString keyStr; 0032 const QChar *const key; // always points to keyStr.unicode(); just an optimization 0033 const KSycocaEntry::Ptr payload; 0034 }; 0035 } 0036 0037 class KSycocaDictPrivate 0038 { 0039 public: 0040 KSycocaDictPrivate() 0041 : stream(nullptr) 0042 , offset(0) 0043 , hashTableSize(0) 0044 { 0045 } 0046 0047 ~KSycocaDictPrivate() 0048 { 0049 } 0050 0051 // Helper for find_string and findMultiString 0052 qint32 offsetForKey(const QString &key) const; 0053 0054 // Calculate hash - can be used during loading and during saving. 0055 quint32 hashKey(const QString &key) const; 0056 0057 std::vector<std::unique_ptr<string_entry>> m_stringentries; 0058 QDataStream *stream; 0059 qint64 offset; 0060 quint32 hashTableSize; 0061 QList<qint32> hashList; 0062 }; 0063 0064 KSycocaDict::KSycocaDict() 0065 : d(new KSycocaDictPrivate) 0066 { 0067 } 0068 0069 KSycocaDict::KSycocaDict(QDataStream *str, int offset) 0070 : d(new KSycocaDictPrivate) 0071 { 0072 d->stream = str; 0073 d->offset = offset; 0074 0075 quint32 test1; 0076 quint32 test2; 0077 str->device()->seek(offset); 0078 (*str) >> test1 >> test2; 0079 if ((test1 > 0x000fffff) || (test2 > 1024)) { 0080 KSycoca::flagError(); 0081 d->hashTableSize = 0; 0082 d->offset = 0; 0083 return; 0084 } 0085 0086 str->device()->seek(offset); 0087 (*str) >> d->hashTableSize; 0088 (*str) >> d->hashList; 0089 d->offset = str->device()->pos(); // Start of hashtable 0090 } 0091 0092 KSycocaDict::~KSycocaDict() = default; 0093 0094 void KSycocaDict::add(const QString &key, const KSycocaEntry::Ptr &payload) 0095 { 0096 if (key.isEmpty()) { 0097 return; // Not allowed (should never happen) 0098 } 0099 if (!payload) { 0100 return; // Not allowed! 0101 } 0102 0103 d->m_stringentries.push_back(std::make_unique<string_entry>(key, payload)); 0104 } 0105 0106 void KSycocaDict::remove(const QString &key) 0107 { 0108 if (!d) { 0109 return; 0110 } 0111 0112 auto it = std::find_if(d->m_stringentries.begin(), d->m_stringentries.end(), [&key](const std::unique_ptr<string_entry> &entry) { 0113 return entry->keyStr == key; 0114 }); 0115 0116 if (it != d->m_stringentries.end()) { 0117 d->m_stringentries.erase(it); 0118 } else { 0119 qCDebug(SYCOCA) << "key not found:" << key; 0120 } 0121 } 0122 0123 int KSycocaDict::find_string(const QString &key) const 0124 { 0125 Q_ASSERT(d); 0126 0127 // qCDebug(SYCOCA) << QString("KSycocaDict::find_string(%1)").arg(key); 0128 qint32 offset = d->offsetForKey(key); 0129 0130 // qCDebug(SYCOCA) << QString("offset is %1").arg(offset,8,16); 0131 if (offset == 0) { 0132 return 0; 0133 } 0134 0135 if (offset > 0) { 0136 return offset; // Positive ID 0137 } 0138 0139 // Lookup duplicate list. 0140 offset = -offset; 0141 0142 d->stream->device()->seek(offset); 0143 // qCDebug(SYCOCA) << QString("Looking up duplicate list at %1").arg(offset,8,16); 0144 0145 while (true) { 0146 (*d->stream) >> offset; 0147 if (offset == 0) { 0148 break; 0149 } 0150 QString dupkey; 0151 (*d->stream) >> dupkey; 0152 // qCDebug(SYCOCA) << QString(">> %1 %2").arg(offset,8,16).arg(dupkey); 0153 if (dupkey == key) { 0154 return offset; 0155 } 0156 } 0157 // qCDebug(SYCOCA) << "Not found!"; 0158 0159 return 0; 0160 } 0161 0162 QList<int> KSycocaDict::findMultiString(const QString &key) const 0163 { 0164 qint32 offset = d->offsetForKey(key); 0165 QList<int> offsetList; 0166 if (offset == 0) { 0167 return offsetList; 0168 } 0169 0170 if (offset > 0) { // Positive ID: one entry found 0171 offsetList.append(offset); 0172 return offsetList; 0173 } 0174 0175 // Lookup duplicate list. 0176 offset = -offset; 0177 0178 d->stream->device()->seek(offset); 0179 // qCDebug(SYCOCA) << QString("Looking up duplicate list at %1").arg(offset,8,16); 0180 0181 while (true) { 0182 (*d->stream) >> offset; 0183 if (offset == 0) { 0184 break; 0185 } 0186 QString dupkey; 0187 (*d->stream) >> dupkey; 0188 // qCDebug(SYCOCA) << QString(">> %1 %2").arg(offset,8,16).arg(dupkey); 0189 if (dupkey == key) { 0190 offsetList.append(offset); 0191 } 0192 } 0193 return offsetList; 0194 } 0195 0196 uint KSycocaDict::count() const 0197 { 0198 if (!d) { 0199 return 0; 0200 } 0201 0202 return d->m_stringentries.size(); 0203 } 0204 0205 void KSycocaDict::clear() 0206 { 0207 d.reset(); 0208 } 0209 0210 uint KSycocaDictPrivate::hashKey(const QString &key) const 0211 { 0212 int len = key.length(); 0213 uint h = 0; 0214 0215 for (int i = 0; i < hashList.count(); i++) { 0216 int pos = hashList[i]; 0217 if (pos == 0) { 0218 continue; 0219 } else if (pos < 0) { 0220 pos = -pos; 0221 if (pos < len) { 0222 h = ((h * 13) + (key[len - pos].cell() % 29)) & 0x3ffffff; 0223 } 0224 } else { 0225 pos = pos - 1; 0226 if (pos < len) { 0227 h = ((h * 13) + (key[pos].cell() % 29)) & 0x3ffffff; 0228 } 0229 } 0230 } 0231 return h; 0232 } 0233 0234 // If we have the strings 0235 // hello 0236 // world 0237 // kde 0238 // Then we end up with 0239 // ABCDE 0240 // where A = diversity of 'h' + 'w' + 'k' etc. 0241 // Also, diversity(-2) == 'l'+'l'+'d' (second character from the end) 0242 0243 // The hasList is used for hashing: 0244 // hashList = (-2, 1, 3) means that the hash key comes from 0245 // the 2nd character from the right, then the 1st from the left, then the 3rd from the left. 0246 0247 // Calculate the diversity of the strings at position 'pos' 0248 // NOTE: this code is slow, it takes 12% of the _overall_ `kbuildsycoca5 --noincremental` running time 0249 static int calcDiversity(std::vector<std::unique_ptr<string_entry>> *stringlist, int inPos, uint sz) 0250 { 0251 if (inPos == 0) { 0252 return 0; 0253 } 0254 QBitArray matrix(sz); 0255 int pos; 0256 0257 // static const int s_maxItems = 50; 0258 // int numItem = 0; 0259 0260 if (inPos < 0) { 0261 pos = -inPos; 0262 for (const auto &entryPtr : *stringlist) { 0263 const int rpos = entryPtr->length - pos; 0264 if (rpos > 0) { 0265 const uint hash = ((entryPtr->hash * 13) + (entryPtr->key[rpos].cell() % 29)) & 0x3ffffff; 0266 matrix.setBit(hash % sz, true); 0267 } 0268 // if (++numItem == s_maxItems) 0269 // break; 0270 } 0271 } else { 0272 pos = inPos - 1; 0273 for (const auto &entryPtr : *stringlist) { 0274 if (pos < entryPtr->length) { 0275 const uint hash = ((entryPtr->hash * 13) + (entryPtr->key[pos].cell() % 29)) & 0x3ffffff; 0276 matrix.setBit(hash % sz, true); 0277 } 0278 // if (++numItem == s_maxItems) 0279 // break; 0280 } 0281 } 0282 return matrix.count(true); 0283 } 0284 0285 // 0286 // Add the diversity of the strings at position 'pos' 0287 static void addDiversity(std::vector<std::unique_ptr<string_entry>> *stringlist, int pos) 0288 { 0289 if (pos == 0) { 0290 return; 0291 } 0292 0293 if (pos < 0) { 0294 pos = -pos; 0295 for (auto &entryPtr : *stringlist) { 0296 const int rpos = entryPtr->length - pos; 0297 if (rpos > 0) { 0298 entryPtr->hash = ((entryPtr->hash * 13) + (entryPtr->key[rpos].cell() % 29)) & 0x3fffffff; 0299 } 0300 } 0301 } else { 0302 pos = pos - 1; 0303 for (auto &entryPtr : *stringlist) { 0304 if (pos < entryPtr->length) { 0305 entryPtr->hash = ((entryPtr->hash * 13) + (entryPtr->key[pos].cell() % 29)) & 0x3fffffff; 0306 } 0307 } 0308 } 0309 } 0310 0311 void KSycocaDict::save(QDataStream &str) 0312 { 0313 if (count() == 0) { 0314 d->hashTableSize = 0; 0315 d->hashList.clear(); 0316 str << d->hashTableSize; 0317 str << d->hashList; 0318 return; 0319 } 0320 0321 d->offset = str.device()->pos(); 0322 0323 // qCDebug(SYCOCA) << "KSycocaDict:" << count() << "entries."; 0324 0325 // qCDebug(SYCOCA) << "Calculating hash keys.."; 0326 0327 int maxLength = 0; 0328 // qCDebug(SYCOCA) << "Finding maximum string length"; 0329 for (auto &entryPtr : d->m_stringentries) { 0330 entryPtr->hash = 0; 0331 if (entryPtr->length > maxLength) { 0332 maxLength = entryPtr->length; 0333 } 0334 } 0335 0336 // qCDebug(SYCOCA) << "Max string length=" << maxLength << "existing hashList=" << d->hashList; 0337 0338 // use "almost prime" number for sz (to calculate diversity) and later 0339 // for the table size of big tables 0340 // int sz = d->stringlist.count()*5-1; 0341 unsigned int sz = count() * 4 + 1; 0342 while (!(((sz % 3) && (sz % 5) && (sz % 7) && (sz % 11) && (sz % 13)))) { 0343 sz += 2; 0344 } 0345 0346 d->hashList.clear(); 0347 0348 // Times (with warm caches, i.e. after multiple runs) 0349 // kbuildsycoca5 --noincremental 2.83s user 0.20s system 95% cpu 3.187 total 0350 // kbuildsycoca5 --noincremental 2.74s user 0.25s system 93% cpu 3.205 total 0351 // unittest: 0.50-60 msec per iteration / 0.40-50 msec per iteration 0352 0353 // Now that MimeTypes are not parsed anymore: 0354 // kbuildsycoca5 --noincremental 2.18s user 0.30s system 91% cpu 2.719 total 0355 // kbuildsycoca5 --noincremental 2.07s user 0.34s system 89% cpu 2.681 total 0356 0357 // If I enabled s_maxItems = 50, it goes down to 0358 // but I don't know if that's a good idea. 0359 // kbuildsycoca5 --noincremental 1.73s user 0.31s system 85% cpu 2.397 total 0360 // kbuildsycoca5 --noincremental 1.84s user 0.29s system 95% cpu 2.230 total 0361 0362 // try to limit diversity scan by "predicting" positions 0363 // with high diversity 0364 QVector<int> oldvec(maxLength * 2 + 1); 0365 oldvec.fill(0); 0366 int mindiv = 0; 0367 int lastDiv = 0; 0368 0369 while (true) { 0370 int divsum = 0; 0371 int divnum = 0; 0372 0373 int maxDiv = 0; 0374 int maxPos = 0; 0375 for (int pos = -maxLength; pos <= maxLength; ++pos) { 0376 // cut off 0377 if (oldvec[pos + maxLength] < mindiv) { 0378 oldvec[pos + maxLength] = 0; 0379 continue; 0380 } 0381 0382 const int diversity = calcDiversity(&(d->m_stringentries), pos, sz); 0383 if (diversity > maxDiv) { 0384 maxDiv = diversity; 0385 maxPos = pos; 0386 } 0387 oldvec[pos + maxLength] = diversity; 0388 divsum += diversity; 0389 ++divnum; 0390 } 0391 // arbitrary cut-off value 3/4 of average seems to work 0392 if (divnum) { 0393 mindiv = (3 * divsum) / (4 * divnum); 0394 } 0395 0396 if (maxDiv <= lastDiv) { 0397 break; 0398 } 0399 // qCDebug(SYCOCA) << "Max Div=" << maxDiv << "at pos" << maxPos; 0400 lastDiv = maxDiv; 0401 addDiversity(&(d->m_stringentries), maxPos); 0402 d->hashList.append(maxPos); 0403 } 0404 0405 for (auto &entryPtr : d->m_stringentries) { 0406 entryPtr->hash = d->hashKey(entryPtr->keyStr); 0407 } 0408 // fprintf(stderr, "Calculating minimum table size..\n"); 0409 0410 d->hashTableSize = sz; 0411 0412 // qCDebug(SYCOCA) << "hashTableSize=" << sz << "hashList=" << d->hashList << "oldvec=" << oldvec; 0413 0414 struct hashtable_entry { 0415 string_entry *entry; 0416 QList<string_entry *> *duplicates; 0417 qint64 duplicate_offset; 0418 }; 0419 0420 hashtable_entry *hashTable = new hashtable_entry[sz]; 0421 0422 // qCDebug(SYCOCA) << "Clearing hashtable..."; 0423 for (unsigned int i = 0; i < sz; i++) { 0424 hashTable[i].entry = nullptr; 0425 hashTable[i].duplicates = nullptr; 0426 } 0427 0428 // qCDebug(SYCOCA) << "Filling hashtable..."; 0429 for (const auto &entryPtr : d->m_stringentries) { 0430 // qCDebug(SYCOCA) << "entry keyStr=" << entry->keyStr << entry->payload.data() << entry->payload->entryPath(); 0431 const int hash = entryPtr->hash % sz; 0432 if (!hashTable[hash].entry) { 0433 // First entry 0434 hashTable[hash].entry = entryPtr.get(); 0435 } else { 0436 if (!hashTable[hash].duplicates) { 0437 // Second entry, build duplicate list. 0438 hashTable[hash].duplicates = new QList<string_entry *>; 0439 hashTable[hash].duplicates->append(hashTable[hash].entry); 0440 hashTable[hash].duplicate_offset = 0; 0441 } 0442 hashTable[hash].duplicates->append(entryPtr.get()); 0443 } 0444 } 0445 0446 str << d->hashTableSize; 0447 str << d->hashList; 0448 0449 d->offset = str.device()->pos(); // d->offset points to start of hashTable 0450 // qCDebug(SYCOCA) << QString("Start of Hash Table, offset = %1").arg(d->offset,8,16); 0451 0452 // Write the hashtable + the duplicates twice. 0453 // The duplicates are after the normal hashtable, but the offset of each 0454 // duplicate entry is written into the normal hashtable. 0455 for (int pass = 1; pass <= 2; pass++) { 0456 str.device()->seek(d->offset); 0457 // qCDebug(SYCOCA) << QString("Writing hash table (pass #%1)").arg(pass); 0458 for (uint i = 0; i < d->hashTableSize; i++) { 0459 qint32 tmpid; 0460 if (!hashTable[i].entry) { 0461 tmpid = 0; 0462 } else if (!hashTable[i].duplicates) { 0463 tmpid = hashTable[i].entry->payload->offset(); // Positive ID 0464 } else { 0465 tmpid = -hashTable[i].duplicate_offset; // Negative ID 0466 } 0467 str << tmpid; 0468 // qCDebug(SYCOCA) << QString("Hash table : %1").arg(tmpid,8,16); 0469 } 0470 // qCDebug(SYCOCA) << QString("End of Hash Table, offset = %1").arg(str.device()->at(),8,16); 0471 0472 // qCDebug(SYCOCA) << QString("Writing duplicate lists (pass #%1)").arg(pass); 0473 for (uint i = 0; i < d->hashTableSize; i++) { 0474 const QList<string_entry *> *dups = hashTable[i].duplicates; 0475 if (dups) { 0476 hashTable[i].duplicate_offset = str.device()->pos(); 0477 0478 /*qCDebug(SYCOCA) << QString("Duplicate lists: Offset = %1 list_size = %2") .arg(hashTable[i].duplicate_offset,8,16).arg(dups->count()); 0479 */ 0480 for (string_entry *dup : std::as_const(*dups)) { 0481 const qint32 offset = dup->payload->offset(); 0482 if (!offset) { 0483 const QString storageId = dup->payload->storageId(); 0484 qCDebug(SYCOCA) << "about to assert! dict=" << this << "storageId=" << storageId << dup->payload.data(); 0485 if (dup->payload->isType(KST_KService)) { 0486 KService::Ptr service(static_cast<KService *>(dup->payload.data())); 0487 qCDebug(SYCOCA) << service->storageId() << service->entryPath(); 0488 } 0489 // save() must have been called on the entry 0490 Q_ASSERT_X(offset, 0491 "KSycocaDict::save", 0492 QByteArray("entry offset is 0, save() was not called on " + dup->payload->storageId().toLatin1() 0493 + " entryPath=" + dup->payload->entryPath().toLatin1()) 0494 .constData()); 0495 } 0496 str << offset; // Positive ID 0497 str << dup->keyStr; // Key (QString) 0498 } 0499 str << qint32(0); // End of list marker (0) 0500 } 0501 } 0502 // qCDebug(SYCOCA) << QString("End of Dict, offset = %1").arg(str.device()->at(),8,16); 0503 } 0504 0505 // qCDebug(SYCOCA) << "Cleaning up hash table."; 0506 for (uint i = 0; i < d->hashTableSize; i++) { 0507 delete hashTable[i].duplicates; 0508 } 0509 delete[] hashTable; 0510 } 0511 0512 qint32 KSycocaDictPrivate::offsetForKey(const QString &key) const 0513 { 0514 if (!stream || !offset) { 0515 qCWarning(SYCOCA) << "No ksycoca database available! Tried running" << KBUILDSYCOCA_EXENAME << "?"; 0516 return 0; 0517 } 0518 0519 if (hashTableSize == 0) { 0520 return 0; // Unlikely to find anything :-] 0521 } 0522 0523 // Read hash-table data 0524 const uint hash = hashKey(key) % hashTableSize; 0525 // qCDebug(SYCOCA) << "hash is" << hash; 0526 0527 const qint64 off = offset + sizeof(qint32) * hash; 0528 // qCDebug(SYCOCA) << QString("off is %1").arg(off,8,16); 0529 stream->device()->seek(off); 0530 0531 qint32 retOffset; 0532 (*stream) >> retOffset; 0533 return retOffset; 0534 }