File indexing completed on 2024-04-28 15:29:55

0001 /*
0002     This file is part of the KDE libraries
0003     SPDX-FileCopyrightText: 1999 Waldo Bastian <bastian@kde.org>
0004 
0005     SPDX-License-Identifier: LGPL-2.0-only
0006 */
0007 
0008 #include "ksycoca.h"
0009 #include "ksycocadict_p.h"
0010 #include "ksycocaentry.h"
0011 #include "sycocadebug.h"
0012 #include <kservice.h>
0013 
0014 #include <QBitArray>
0015 #include <QIODevice>
0016 #include <QVector>
0017 
0018 namespace
0019 {
0020 struct string_entry {
0021     string_entry(const QString &_key, const KSycocaEntry::Ptr &_payload)
0022         : hash(0)
0023         , length(_key.length())
0024         , keyStr(_key)
0025         , key(keyStr.unicode())
0026         , payload(_payload)
0027     {
0028     }
0029     uint hash;
0030     const int length;
0031     const QString keyStr;
0032     const QChar *const key; // always points to keyStr.unicode(); just an optimization
0033     const KSycocaEntry::Ptr payload;
0034 };
0035 }
0036 
0037 class KSycocaDictPrivate
0038 {
0039 public:
0040     KSycocaDictPrivate()
0041         : stream(nullptr)
0042         , offset(0)
0043         , hashTableSize(0)
0044     {
0045     }
0046 
0047     ~KSycocaDictPrivate()
0048     {
0049     }
0050 
0051     // Helper for find_string and findMultiString
0052     qint32 offsetForKey(const QString &key) const;
0053 
0054     // Calculate hash - can be used during loading and during saving.
0055     quint32 hashKey(const QString &key) const;
0056 
0057     std::vector<std::unique_ptr<string_entry>> m_stringentries;
0058     QDataStream *stream;
0059     qint64 offset;
0060     quint32 hashTableSize;
0061     QList<qint32> hashList;
0062 };
0063 
0064 KSycocaDict::KSycocaDict()
0065     : d(new KSycocaDictPrivate)
0066 {
0067 }
0068 
0069 KSycocaDict::KSycocaDict(QDataStream *str, int offset)
0070     : d(new KSycocaDictPrivate)
0071 {
0072     d->stream = str;
0073     d->offset = offset;
0074 
0075     quint32 test1;
0076     quint32 test2;
0077     str->device()->seek(offset);
0078     (*str) >> test1 >> test2;
0079     if ((test1 > 0x000fffff) || (test2 > 1024)) {
0080         KSycoca::flagError();
0081         d->hashTableSize = 0;
0082         d->offset = 0;
0083         return;
0084     }
0085 
0086     str->device()->seek(offset);
0087     (*str) >> d->hashTableSize;
0088     (*str) >> d->hashList;
0089     d->offset = str->device()->pos(); // Start of hashtable
0090 }
0091 
0092 KSycocaDict::~KSycocaDict() = default;
0093 
0094 void KSycocaDict::add(const QString &key, const KSycocaEntry::Ptr &payload)
0095 {
0096     if (key.isEmpty()) {
0097         return; // Not allowed (should never happen)
0098     }
0099     if (!payload) {
0100         return; // Not allowed!
0101     }
0102 
0103     d->m_stringentries.push_back(std::make_unique<string_entry>(key, payload));
0104 }
0105 
0106 void KSycocaDict::remove(const QString &key)
0107 {
0108     if (!d) {
0109         return;
0110     }
0111 
0112     auto it = std::find_if(d->m_stringentries.begin(), d->m_stringentries.end(), [&key](const std::unique_ptr<string_entry> &entry) {
0113         return entry->keyStr == key;
0114     });
0115 
0116     if (it != d->m_stringentries.end()) {
0117         d->m_stringentries.erase(it);
0118     } else {
0119         qCDebug(SYCOCA) << "key not found:" << key;
0120     }
0121 }
0122 
0123 int KSycocaDict::find_string(const QString &key) const
0124 {
0125     Q_ASSERT(d);
0126 
0127     // qCDebug(SYCOCA) << QString("KSycocaDict::find_string(%1)").arg(key);
0128     qint32 offset = d->offsetForKey(key);
0129 
0130     // qCDebug(SYCOCA) << QString("offset is %1").arg(offset,8,16);
0131     if (offset == 0) {
0132         return 0;
0133     }
0134 
0135     if (offset > 0) {
0136         return offset; // Positive ID
0137     }
0138 
0139     // Lookup duplicate list.
0140     offset = -offset;
0141 
0142     d->stream->device()->seek(offset);
0143     // qCDebug(SYCOCA) << QString("Looking up duplicate list at %1").arg(offset,8,16);
0144 
0145     while (true) {
0146         (*d->stream) >> offset;
0147         if (offset == 0) {
0148             break;
0149         }
0150         QString dupkey;
0151         (*d->stream) >> dupkey;
0152         // qCDebug(SYCOCA) << QString(">> %1 %2").arg(offset,8,16).arg(dupkey);
0153         if (dupkey == key) {
0154             return offset;
0155         }
0156     }
0157     // qCDebug(SYCOCA) << "Not found!";
0158 
0159     return 0;
0160 }
0161 
0162 QList<int> KSycocaDict::findMultiString(const QString &key) const
0163 {
0164     qint32 offset = d->offsetForKey(key);
0165     QList<int> offsetList;
0166     if (offset == 0) {
0167         return offsetList;
0168     }
0169 
0170     if (offset > 0) { // Positive ID: one entry found
0171         offsetList.append(offset);
0172         return offsetList;
0173     }
0174 
0175     // Lookup duplicate list.
0176     offset = -offset;
0177 
0178     d->stream->device()->seek(offset);
0179     // qCDebug(SYCOCA) << QString("Looking up duplicate list at %1").arg(offset,8,16);
0180 
0181     while (true) {
0182         (*d->stream) >> offset;
0183         if (offset == 0) {
0184             break;
0185         }
0186         QString dupkey;
0187         (*d->stream) >> dupkey;
0188         // qCDebug(SYCOCA) << QString(">> %1 %2").arg(offset,8,16).arg(dupkey);
0189         if (dupkey == key) {
0190             offsetList.append(offset);
0191         }
0192     }
0193     return offsetList;
0194 }
0195 
0196 uint KSycocaDict::count() const
0197 {
0198     if (!d) {
0199         return 0;
0200     }
0201 
0202     return d->m_stringentries.size();
0203 }
0204 
0205 void KSycocaDict::clear()
0206 {
0207     d.reset();
0208 }
0209 
0210 uint KSycocaDictPrivate::hashKey(const QString &key) const
0211 {
0212     int len = key.length();
0213     uint h = 0;
0214 
0215     for (int i = 0; i < hashList.count(); i++) {
0216         int pos = hashList[i];
0217         if (pos == 0) {
0218             continue;
0219         } else if (pos < 0) {
0220             pos = -pos;
0221             if (pos < len) {
0222                 h = ((h * 13) + (key[len - pos].cell() % 29)) & 0x3ffffff;
0223             }
0224         } else {
0225             pos = pos - 1;
0226             if (pos < len) {
0227                 h = ((h * 13) + (key[pos].cell() % 29)) & 0x3ffffff;
0228             }
0229         }
0230     }
0231     return h;
0232 }
0233 
0234 // If we have the strings
0235 //    hello
0236 //    world
0237 //    kde
0238 // Then we end up with
0239 //    ABCDE
0240 // where A = diversity of 'h' + 'w' + 'k' etc.
0241 // Also, diversity(-2) == 'l'+'l'+'d' (second character from the end)
0242 
0243 // The hasList is used for hashing:
0244 //  hashList = (-2, 1, 3) means that the hash key comes from
0245 //  the 2nd character from the right, then the 1st from the left, then the 3rd from the left.
0246 
0247 // Calculate the diversity of the strings at position 'pos'
0248 // NOTE: this code is slow, it takes 12% of the _overall_ `kbuildsycoca5 --noincremental` running time
0249 static int calcDiversity(std::vector<std::unique_ptr<string_entry>> *stringlist, int inPos, uint sz)
0250 {
0251     if (inPos == 0) {
0252         return 0;
0253     }
0254     QBitArray matrix(sz);
0255     int pos;
0256 
0257     // static const int s_maxItems = 50;
0258     // int numItem = 0;
0259 
0260     if (inPos < 0) {
0261         pos = -inPos;
0262         for (const auto &entryPtr : *stringlist) {
0263             const int rpos = entryPtr->length - pos;
0264             if (rpos > 0) {
0265                 const uint hash = ((entryPtr->hash * 13) + (entryPtr->key[rpos].cell() % 29)) & 0x3ffffff;
0266                 matrix.setBit(hash % sz, true);
0267             }
0268             // if (++numItem == s_maxItems)
0269             //    break;
0270         }
0271     } else {
0272         pos = inPos - 1;
0273         for (const auto &entryPtr : *stringlist) {
0274             if (pos < entryPtr->length) {
0275                 const uint hash = ((entryPtr->hash * 13) + (entryPtr->key[pos].cell() % 29)) & 0x3ffffff;
0276                 matrix.setBit(hash % sz, true);
0277             }
0278             // if (++numItem == s_maxItems)
0279             //    break;
0280         }
0281     }
0282     return matrix.count(true);
0283 }
0284 
0285 //
0286 // Add the diversity of the strings at position 'pos'
0287 static void addDiversity(std::vector<std::unique_ptr<string_entry>> *stringlist, int pos)
0288 {
0289     if (pos == 0) {
0290         return;
0291     }
0292 
0293     if (pos < 0) {
0294         pos = -pos;
0295         for (auto &entryPtr : *stringlist) {
0296             const int rpos = entryPtr->length - pos;
0297             if (rpos > 0) {
0298                 entryPtr->hash = ((entryPtr->hash * 13) + (entryPtr->key[rpos].cell() % 29)) & 0x3fffffff;
0299             }
0300         }
0301     } else {
0302         pos = pos - 1;
0303         for (auto &entryPtr : *stringlist) {
0304             if (pos < entryPtr->length) {
0305                 entryPtr->hash = ((entryPtr->hash * 13) + (entryPtr->key[pos].cell() % 29)) & 0x3fffffff;
0306             }
0307         }
0308     }
0309 }
0310 
0311 void KSycocaDict::save(QDataStream &str)
0312 {
0313     if (count() == 0) {
0314         d->hashTableSize = 0;
0315         d->hashList.clear();
0316         str << d->hashTableSize;
0317         str << d->hashList;
0318         return;
0319     }
0320 
0321     d->offset = str.device()->pos();
0322 
0323     // qCDebug(SYCOCA) << "KSycocaDict:" << count() << "entries.";
0324 
0325     // qCDebug(SYCOCA) << "Calculating hash keys..";
0326 
0327     int maxLength = 0;
0328     // qCDebug(SYCOCA) << "Finding maximum string length";
0329     for (auto &entryPtr : d->m_stringentries) {
0330         entryPtr->hash = 0;
0331         if (entryPtr->length > maxLength) {
0332             maxLength = entryPtr->length;
0333         }
0334     }
0335 
0336     // qCDebug(SYCOCA) << "Max string length=" << maxLength << "existing hashList=" << d->hashList;
0337 
0338     // use "almost prime" number for sz (to calculate diversity) and later
0339     // for the table size of big tables
0340     // int sz = d->stringlist.count()*5-1;
0341     unsigned int sz = count() * 4 + 1;
0342     while (!(((sz % 3) && (sz % 5) && (sz % 7) && (sz % 11) && (sz % 13)))) {
0343         sz += 2;
0344     }
0345 
0346     d->hashList.clear();
0347 
0348     // Times (with warm caches, i.e. after multiple runs)
0349     // kbuildsycoca5 --noincremental  2.83s user 0.20s system 95% cpu 3.187 total
0350     // kbuildsycoca5 --noincremental  2.74s user 0.25s system 93% cpu 3.205 total
0351     // unittest: 0.50-60 msec per iteration / 0.40-50 msec per iteration
0352 
0353     // Now that MimeTypes are not parsed anymore:
0354     // kbuildsycoca5 --noincremental  2.18s user 0.30s system 91% cpu 2.719 total
0355     // kbuildsycoca5 --noincremental  2.07s user 0.34s system 89% cpu 2.681 total
0356 
0357     // If I enabled s_maxItems = 50, it goes down to
0358     // but I don't know if that's a good idea.
0359     // kbuildsycoca5 --noincremental  1.73s user 0.31s system 85% cpu 2.397 total
0360     // kbuildsycoca5 --noincremental  1.84s user 0.29s system 95% cpu 2.230 total
0361 
0362     // try to limit diversity scan by "predicting" positions
0363     // with high diversity
0364     QVector<int> oldvec(maxLength * 2 + 1);
0365     oldvec.fill(0);
0366     int mindiv = 0;
0367     int lastDiv = 0;
0368 
0369     while (true) {
0370         int divsum = 0;
0371         int divnum = 0;
0372 
0373         int maxDiv = 0;
0374         int maxPos = 0;
0375         for (int pos = -maxLength; pos <= maxLength; ++pos) {
0376             // cut off
0377             if (oldvec[pos + maxLength] < mindiv) {
0378                 oldvec[pos + maxLength] = 0;
0379                 continue;
0380             }
0381 
0382             const int diversity = calcDiversity(&(d->m_stringentries), pos, sz);
0383             if (diversity > maxDiv) {
0384                 maxDiv = diversity;
0385                 maxPos = pos;
0386             }
0387             oldvec[pos + maxLength] = diversity;
0388             divsum += diversity;
0389             ++divnum;
0390         }
0391         // arbitrary cut-off value 3/4 of average seems to work
0392         if (divnum) {
0393             mindiv = (3 * divsum) / (4 * divnum);
0394         }
0395 
0396         if (maxDiv <= lastDiv) {
0397             break;
0398         }
0399         // qCDebug(SYCOCA) << "Max Div=" << maxDiv << "at pos" << maxPos;
0400         lastDiv = maxDiv;
0401         addDiversity(&(d->m_stringentries), maxPos);
0402         d->hashList.append(maxPos);
0403     }
0404 
0405     for (auto &entryPtr : d->m_stringentries) {
0406         entryPtr->hash = d->hashKey(entryPtr->keyStr);
0407     }
0408     // fprintf(stderr, "Calculating minimum table size..\n");
0409 
0410     d->hashTableSize = sz;
0411 
0412     // qCDebug(SYCOCA) << "hashTableSize=" << sz << "hashList=" << d->hashList << "oldvec=" << oldvec;
0413 
0414     struct hashtable_entry {
0415         string_entry *entry;
0416         QList<string_entry *> *duplicates;
0417         qint64 duplicate_offset;
0418     };
0419 
0420     hashtable_entry *hashTable = new hashtable_entry[sz];
0421 
0422     // qCDebug(SYCOCA) << "Clearing hashtable...";
0423     for (unsigned int i = 0; i < sz; i++) {
0424         hashTable[i].entry = nullptr;
0425         hashTable[i].duplicates = nullptr;
0426     }
0427 
0428     // qCDebug(SYCOCA) << "Filling hashtable...";
0429     for (const auto &entryPtr : d->m_stringentries) {
0430         // qCDebug(SYCOCA) << "entry keyStr=" << entry->keyStr << entry->payload.data() << entry->payload->entryPath();
0431         const int hash = entryPtr->hash % sz;
0432         if (!hashTable[hash].entry) {
0433             // First entry
0434             hashTable[hash].entry = entryPtr.get();
0435         } else {
0436             if (!hashTable[hash].duplicates) {
0437                 // Second entry, build duplicate list.
0438                 hashTable[hash].duplicates = new QList<string_entry *>;
0439                 hashTable[hash].duplicates->append(hashTable[hash].entry);
0440                 hashTable[hash].duplicate_offset = 0;
0441             }
0442             hashTable[hash].duplicates->append(entryPtr.get());
0443         }
0444     }
0445 
0446     str << d->hashTableSize;
0447     str << d->hashList;
0448 
0449     d->offset = str.device()->pos(); // d->offset points to start of hashTable
0450     // qCDebug(SYCOCA) << QString("Start of Hash Table, offset = %1").arg(d->offset,8,16);
0451 
0452     // Write the hashtable + the duplicates twice.
0453     // The duplicates are after the normal hashtable, but the offset of each
0454     // duplicate entry is written into the normal hashtable.
0455     for (int pass = 1; pass <= 2; pass++) {
0456         str.device()->seek(d->offset);
0457         // qCDebug(SYCOCA) << QString("Writing hash table (pass #%1)").arg(pass);
0458         for (uint i = 0; i < d->hashTableSize; i++) {
0459             qint32 tmpid;
0460             if (!hashTable[i].entry) {
0461                 tmpid = 0;
0462             } else if (!hashTable[i].duplicates) {
0463                 tmpid = hashTable[i].entry->payload->offset(); // Positive ID
0464             } else {
0465                 tmpid = -hashTable[i].duplicate_offset; // Negative ID
0466             }
0467             str << tmpid;
0468             // qCDebug(SYCOCA) << QString("Hash table : %1").arg(tmpid,8,16);
0469         }
0470         // qCDebug(SYCOCA) << QString("End of Hash Table, offset = %1").arg(str.device()->at(),8,16);
0471 
0472         // qCDebug(SYCOCA) << QString("Writing duplicate lists (pass #%1)").arg(pass);
0473         for (uint i = 0; i < d->hashTableSize; i++) {
0474             const QList<string_entry *> *dups = hashTable[i].duplicates;
0475             if (dups) {
0476                 hashTable[i].duplicate_offset = str.device()->pos();
0477 
0478                 /*qCDebug(SYCOCA) << QString("Duplicate lists: Offset = %1 list_size = %2") .arg(hashTable[i].duplicate_offset,8,16).arg(dups->count());
0479                  */
0480                 for (string_entry *dup : std::as_const(*dups)) {
0481                     const qint32 offset = dup->payload->offset();
0482                     if (!offset) {
0483                         const QString storageId = dup->payload->storageId();
0484                         qCDebug(SYCOCA) << "about to assert! dict=" << this << "storageId=" << storageId << dup->payload.data();
0485                         if (dup->payload->isType(KST_KService)) {
0486                             KService::Ptr service(static_cast<KService *>(dup->payload.data()));
0487                             qCDebug(SYCOCA) << service->storageId() << service->entryPath();
0488                         }
0489                         // save() must have been called on the entry
0490                         Q_ASSERT_X(offset,
0491                                    "KSycocaDict::save",
0492                                    QByteArray("entry offset is 0, save() was not called on " + dup->payload->storageId().toLatin1()
0493                                               + " entryPath=" + dup->payload->entryPath().toLatin1())
0494                                        .constData());
0495                     }
0496                     str << offset; // Positive ID
0497                     str << dup->keyStr; // Key (QString)
0498                 }
0499                 str << qint32(0); // End of list marker (0)
0500             }
0501         }
0502         // qCDebug(SYCOCA) << QString("End of Dict, offset = %1").arg(str.device()->at(),8,16);
0503     }
0504 
0505     // qCDebug(SYCOCA) << "Cleaning up hash table.";
0506     for (uint i = 0; i < d->hashTableSize; i++) {
0507         delete hashTable[i].duplicates;
0508     }
0509     delete[] hashTable;
0510 }
0511 
0512 qint32 KSycocaDictPrivate::offsetForKey(const QString &key) const
0513 {
0514     if (!stream || !offset) {
0515         qCWarning(SYCOCA) << "No ksycoca database available! Tried running" << KBUILDSYCOCA_EXENAME << "?";
0516         return 0;
0517     }
0518 
0519     if (hashTableSize == 0) {
0520         return 0; // Unlikely to find anything :-]
0521     }
0522 
0523     // Read hash-table data
0524     const uint hash = hashKey(key) % hashTableSize;
0525     // qCDebug(SYCOCA) << "hash is" << hash;
0526 
0527     const qint64 off = offset + sizeof(qint32) * hash;
0528     // qCDebug(SYCOCA) << QString("off is %1").arg(off,8,16);
0529     stream->device()->seek(off);
0530 
0531     qint32 retOffset;
0532     (*stream) >> retOffset;
0533     return retOffset;
0534 }