File indexing completed on 2024-04-28 05:49:36

0001 /*
0002     SPDX-FileCopyrightText: 2017 Forrest Smith
0003     SPDX-FileCopyrightText: 2020 Waqar Ahmed
0004 
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0007 
0008 #pragma once
0009 
0010 #include <QDebug>
0011 #include <QString>
0012 #include <QTextLayout>
0013 
0014 /**
0015  * This is based on https://github.com/forrestthewoods/lib_fts/blob/master/code/fts_fuzzy_match.h
0016  * with modifications for Qt
0017  *
0018  * Dont include this file in a header file, please :)
0019  */
0020 
0021 namespace kfts
0022 {
0023 /**
0024  * @brief simple fuzzy matching of chars in @a pattern with chars in @a str sequentially
0025  */
0026 Q_DECL_UNUSED static bool fuzzy_match_simple(const QStringView pattern, const QStringView str);
0027 
0028 /**
0029  * @brief This should be the main function you should use. @a outscore is the score
0030  * of this match and should be used to sort the results later. Without sorting of the
0031  * results this function won't be as effective.
0032  */
0033 Q_DECL_UNUSED static bool fuzzy_match(const QStringView pattern, const QStringView str, int &outScore);
0034 Q_DECL_UNUSED static bool fuzzy_match(const QStringView pattern, const QStringView str, int &outScore, uint8_t *matches);
0035 
0036 /**
0037  * @brief get string for display in treeview / listview. This should be used from style delegate.
0038  * For example: with @a pattern = "kate", @a str = "kateapp" and @htmlTag = "<b>
0039  * the output will be <b>k</b><b>a</b><b>t</b><b>e</b>app which will be visible to user as
0040  * <b>kate</b>app.
0041  *
0042  * TODO: improve this so that we don't have to put html tags on every char probably using some kind
0043  * of interval container
0044  */
0045 Q_DECL_UNUSED static QString to_fuzzy_matched_display_string(const QStringView pattern, QString &str, const QString &htmlTag, const QString &htmlTagClose);
0046 Q_DECL_UNUSED static QString
0047 to_scored_fuzzy_matched_display_string(const QStringView pattern, QString &str, const QString &htmlTag, const QString &htmlTagClose);
0048 }
0049 
0050 namespace kfts
0051 {
0052 // Forward declarations for "private" implementation
0053 namespace fuzzy_internal
0054 {
0055 static inline constexpr QChar toLower(QChar c)
0056 {
0057     return c.isLower() ? c : c.toLower();
0058 }
0059 
0060 static bool fuzzy_match_recursive(QStringView::const_iterator pattern,
0061                                   QStringView::const_iterator str,
0062                                   int &outScore,
0063                                   const QStringView::const_iterator strBegin,
0064                                   const QStringView::const_iterator strEnd,
0065                                   const QStringView::const_iterator patternEnd,
0066                                   uint8_t const *srcMatches,
0067                                   uint8_t *newMatches,
0068                                   int nextMatch,
0069                                   int &totalMatches,
0070                                   int &recursionCount);
0071 }
0072 
0073 // Public interface
0074 static bool fuzzy_match_simple(const QStringView pattern, const QStringView str)
0075 {
0076     if (pattern.length() == 1) {
0077         return str.contains(pattern, Qt::CaseInsensitive);
0078     }
0079 
0080     auto patternIt = pattern.cbegin();
0081     for (auto strIt = str.cbegin(); strIt != str.cend() && patternIt != pattern.cend(); ++strIt) {
0082         if (strIt->toLower() == patternIt->toLower()) {
0083             ++patternIt;
0084         }
0085     }
0086     return patternIt == pattern.cend();
0087 }
0088 
0089 static bool fuzzy_match(const QStringView pattern, const QStringView str, int &outScore)
0090 {
0091     if (pattern.length() == 1) {
0092         const int found = str.indexOf(pattern, Qt::CaseInsensitive);
0093         if (found >= 0) {
0094             outScore = 250 - found;
0095             outScore += pattern.at(0) == str.at(found);
0096             return true;
0097         } else {
0098             outScore = 0;
0099             return false;
0100         }
0101     }
0102 
0103     // simple substring matching to flush out non-matching stuff
0104     auto patternIt = pattern.cbegin();
0105     bool lower = patternIt->isLower();
0106     QChar cUp = lower ? patternIt->toUpper() : *patternIt;
0107     QChar cLow = lower ? *patternIt : patternIt->toLower();
0108     for (auto strIt = str.cbegin(); strIt != str.cend() && patternIt != pattern.cend(); ++strIt) {
0109         if (*strIt == cLow || *strIt == cUp) {
0110             ++patternIt;
0111             lower = patternIt->isLower();
0112             cUp = lower ? patternIt->toUpper() : *patternIt;
0113             cLow = lower ? *patternIt : patternIt->toLower();
0114         }
0115     }
0116 
0117     if (patternIt != pattern.cend()) {
0118         outScore = 0;
0119         return false;
0120     }
0121 
0122     uint8_t matches[256];
0123     return fuzzy_match(pattern, str, outScore, matches);
0124 }
0125 
0126 static bool fuzzy_match(const QStringView pattern, const QStringView str, int &outScore, uint8_t *matches)
0127 {
0128     int recursionCount = 0;
0129 
0130     auto strIt = str.cbegin();
0131     auto patternIt = pattern.cbegin();
0132     const auto patternEnd = pattern.cend();
0133     const auto strEnd = str.cend();
0134     int totalMatches = 0;
0135 
0136     return fuzzy_internal::fuzzy_match_recursive(patternIt, strIt, outScore, strIt, strEnd, patternEnd, nullptr, matches, 0, totalMatches, recursionCount);
0137 }
0138 
0139 // Private implementation
0140 static bool fuzzy_internal::fuzzy_match_recursive(QStringView::const_iterator pattern,
0141                                                   QStringView::const_iterator str,
0142                                                   int &outScore,
0143                                                   const QStringView::const_iterator strBegin,
0144                                                   const QStringView::const_iterator strEnd,
0145                                                   const QStringView::const_iterator patternEnd,
0146                                                   const uint8_t *srcMatches,
0147                                                   uint8_t *matches,
0148                                                   int nextMatch,
0149                                                   int &totalMatches,
0150                                                   int &recursionCount)
0151 {
0152     static constexpr int recursionLimit = 10;
0153     // max number of matches allowed, this should be enough
0154     static constexpr int maxMatches = 256;
0155 
0156     // Count recursions
0157     ++recursionCount;
0158     if (recursionCount >= recursionLimit) {
0159         return false;
0160     }
0161 
0162     // Detect end of strings
0163     if (pattern == patternEnd || str == strEnd) {
0164         return false;
0165     }
0166 
0167     // Recursion params
0168     bool recursiveMatch = false;
0169     uint8_t bestRecursiveMatches[maxMatches];
0170     int bestRecursiveScore = 0;
0171 
0172     // Loop through pattern and str looking for a match
0173     bool firstMatch = true;
0174     QChar currentPatternChar = toLower(*pattern);
0175     // Are we matching in sequence start from start?
0176     while (pattern != patternEnd && str != strEnd) {
0177         // Found match
0178         if (currentPatternChar == toLower(*str)) {
0179             // Supplied matches buffer was too short
0180             if (nextMatch >= maxMatches) {
0181                 return false;
0182             }
0183 
0184             // "Copy-on-Write" srcMatches into matches
0185             if (firstMatch && srcMatches) {
0186                 memcpy(matches, srcMatches, nextMatch);
0187                 firstMatch = false;
0188             }
0189 
0190             // Recursive call that "skips" this match
0191             uint8_t recursiveMatches[maxMatches];
0192             int recursiveScore = 0;
0193             const auto strNextChar = std::next(str);
0194             if (fuzzy_match_recursive(pattern,
0195                                       strNextChar,
0196                                       recursiveScore,
0197                                       strBegin,
0198                                       strEnd,
0199                                       patternEnd,
0200                                       matches,
0201                                       recursiveMatches,
0202                                       nextMatch,
0203                                       totalMatches,
0204                                       recursionCount)) {
0205                 // Pick best recursive score
0206                 if (!recursiveMatch || recursiveScore > bestRecursiveScore) {
0207                     memcpy(bestRecursiveMatches, recursiveMatches, maxMatches);
0208                     bestRecursiveScore = recursiveScore;
0209                 }
0210                 recursiveMatch = true;
0211             }
0212 
0213             // Advance
0214             matches[nextMatch++] = (uint8_t)(std::distance(strBegin, str));
0215             ++pattern;
0216             currentPatternChar = toLower(*pattern);
0217         }
0218         ++str;
0219     }
0220 
0221     // Determine if full pattern was matched
0222     const bool matched = pattern == patternEnd;
0223 
0224     // Calculate score
0225     if (matched) {
0226         static constexpr int firstSepScoreDiff = 3;
0227 
0228         static constexpr int sequentialBonus = 20;
0229         static constexpr int separatorBonus = 25; // bonus if match occurs after a separator
0230         static constexpr int firstLetterBonus = 15; // bonus if the first letter is matched
0231         static constexpr int firstLetterSepMatchBonus = firstLetterBonus - firstSepScoreDiff; // bonus if the first matched letter is camel or separator
0232 
0233         static constexpr int unmatchedLetterPenalty = -1; // penalty for every letter that doesn't matter
0234 
0235         int nonBeginSequenceBonus = 10;
0236         // points by which nonBeginSequenceBonus is increment on every matched letter
0237         static constexpr int nonBeginSequenceIncrement = 4;
0238 
0239         // Initialize score
0240         outScore = 100;
0241 
0242 #define debug_algo 0
0243 #if debug_algo
0244 #define dbg(...) qDebug(__VA_ARGS__)
0245 #else
0246 #define dbg(...)
0247 #endif
0248 
0249         // Apply unmatched penalty
0250         const int unmatched = (int)(std::distance(strBegin, strEnd)) - nextMatch;
0251         outScore += unmatchedLetterPenalty * unmatched;
0252         dbg("unmatchedLetterPenalty, unmatched count: %d, outScore: %d", unmatched, outScore);
0253 
0254         bool inSeparatorSeq = false;
0255         int i = 0;
0256         if (matches[i] == 0) {
0257             // First letter match has the highest score
0258             outScore += firstLetterBonus + separatorBonus;
0259             dbg("firstLetterBonus, outScore: %d", outScore);
0260             inSeparatorSeq = true;
0261         } else {
0262             const QChar neighbor = *(strBegin + matches[i] - 1);
0263             const QChar curr = *(strBegin + matches[i]);
0264             const bool neighborSeparator = neighbor == QLatin1Char('_') || neighbor == QLatin1Char(' ');
0265             if (neighborSeparator || (neighbor.isLower() && curr.isUpper())) {
0266                 // the first letter that got matched was a sepcial char .e., camel or at a separator
0267                 outScore += firstLetterSepMatchBonus + separatorBonus;
0268                 dbg("firstLetterSepMatchBonus at %d, letter: %c, outScore: %d", matches[i], curr.toLatin1(), outScore);
0269                 inSeparatorSeq = true;
0270             } else {
0271                 // nothing
0272                 nonBeginSequenceBonus += nonBeginSequenceIncrement;
0273             }
0274             // We didn't match any special positions, apply leading penalty
0275             outScore += -(matches[i]);
0276             dbg("LeadingPenalty because no first letter match, outScore: %d", outScore);
0277         }
0278         i++;
0279 
0280         bool allConsecutive = true;
0281         // Apply ordering bonuses
0282         for (; i < nextMatch; ++i) {
0283             const uint8_t currIdx = matches[i];
0284             const uint8_t prevIdx = matches[i - 1];
0285             // Sequential
0286             if (currIdx == (prevIdx + 1)) {
0287                 if (i == matches[i]) {
0288                     // We are in a sequence beginning from first letter
0289                     outScore += sequentialBonus;
0290                     dbg("sequentialBonus at %d, letter: %c, outScore: %d", matches[i], (strBegin + currIdx)->toLatin1(), outScore);
0291                 } else if (inSeparatorSeq) {
0292                     // we are in a sequnce beginning from a separator like camelHump or underscore
0293                     outScore += sequentialBonus - firstSepScoreDiff;
0294                     dbg("in separator seq, [sequentialBonus - 5] at %d, letter: %c, outScore: %d", matches[i], (strBegin + currIdx)->toLatin1(), outScore);
0295                 } else {
0296                     // We are in a random sequence
0297                     outScore += nonBeginSequenceBonus;
0298                     nonBeginSequenceBonus += nonBeginSequenceIncrement;
0299                     dbg("nonBeginSequenceBonus at %d, letter: %c, outScore: %d", matches[i], (strBegin + currIdx)->toLatin1(), outScore);
0300                 }
0301             } else {
0302                 allConsecutive = false;
0303 
0304                 // there is a gap between matching chars, apply penalty
0305                 int penalty = -((currIdx - prevIdx)) - 2;
0306                 outScore += penalty;
0307                 inSeparatorSeq = false;
0308                 nonBeginSequenceBonus = 10;
0309                 dbg("gap penalty[%d] at %d, letter: %c, outScore: %d", penalty, matches[i], (strBegin + currIdx)->toLatin1(), outScore);
0310             }
0311 
0312             // Check for bonuses based on neighbor character value
0313             // Camel case
0314             const QChar neighbor = *(strBegin + currIdx - 1);
0315             const QChar curr = *(strBegin + currIdx);
0316             // if camel case bonus, then not snake / separator.
0317             // This prevents double bonuses
0318             const bool neighborSeparator = neighbor == QLatin1Char('_') || neighbor == QLatin1Char(' ');
0319             if (neighborSeparator || (neighbor.isLower() && curr.isUpper())) {
0320                 outScore += separatorBonus;
0321                 dbg("separatorBonus at %d, letter: %c, outScore: %d", matches[i], (strBegin + currIdx)->toLatin1(), outScore);
0322                 inSeparatorSeq = true;
0323                 continue;
0324             }
0325         }
0326 
0327         if (allConsecutive && nextMatch >= 4) {
0328             outScore *= 2;
0329             dbg("allConsecutive double the score, outScore: %d", outScore);
0330         }
0331     }
0332 
0333     totalMatches = nextMatch;
0334     // Return best result
0335     if (recursiveMatch && (!matched || bestRecursiveScore > outScore)) {
0336         // Recursive score is better than "this"
0337         memcpy(matches, bestRecursiveMatches, maxMatches);
0338         outScore = bestRecursiveScore;
0339         return true;
0340     } else if (matched) {
0341         // "this" score is better than recursive
0342         return true;
0343     } else {
0344         // no match
0345         return false;
0346     }
0347 }
0348 
0349 static QString to_fuzzy_matched_display_string(const QStringView pattern, QString &str, const QString &htmlTag, const QString &htmlTagClose)
0350 {
0351     /**
0352      * FIXME Don't do so many appends. Instead consider using some interval based solution to wrap a range
0353      * of text with the html <tag></tag>
0354      */
0355     int j = 0;
0356     for (int i = 0; i < str.size() && j < pattern.size(); ++i) {
0357         if (fuzzy_internal::toLower(str.at(i)) == fuzzy_internal::toLower(pattern.at(j))) {
0358             str.replace(i, 1, htmlTag + str.at(i) + htmlTagClose);
0359             i += htmlTag.size() + htmlTagClose.size();
0360             ++j;
0361         }
0362     }
0363     return str;
0364 }
0365 
0366 static QString to_scored_fuzzy_matched_display_string(const QStringView pattern, QString &str, const QString &htmlTag, const QString &htmlTagClose)
0367 {
0368     if (pattern.isEmpty()) {
0369         return str;
0370     }
0371 
0372     uint8_t matches[256];
0373     int totalMatches = 0;
0374     {
0375         int score = 0;
0376         int recursionCount = 0;
0377 
0378         auto strIt = str.cbegin();
0379         auto patternIt = pattern.cbegin();
0380         const auto patternEnd = pattern.cend();
0381         const auto strEnd = str.cend();
0382 
0383         fuzzy_internal::fuzzy_match_recursive(patternIt, strIt, score, strIt, strEnd, patternEnd, nullptr, matches, 0, totalMatches, recursionCount);
0384     }
0385 
0386     int offset = 0;
0387     for (int i = 0; i < totalMatches; ++i) {
0388         str.insert(matches[i] + offset, htmlTag);
0389         offset += htmlTag.size();
0390         str.insert(matches[i] + offset + 1, htmlTagClose);
0391         offset += htmlTagClose.size();
0392     }
0393 
0394     return str;
0395 }
0396 
0397 Q_DECL_UNUSED static QList<QTextLayout::FormatRange>
0398 get_fuzzy_match_formats(const QStringView pattern, const QStringView str, int offset, const QTextCharFormat &fmt)
0399 {
0400     QList<QTextLayout::FormatRange> ranges;
0401     if (pattern.isEmpty()) {
0402         return ranges;
0403     }
0404 
0405     int totalMatches = 0;
0406     int score = 0;
0407     int recursionCount = 0;
0408 
0409     auto strIt = str.cbegin();
0410     auto patternIt = pattern.cbegin();
0411     const auto patternEnd = pattern.cend();
0412     const auto strEnd = str.cend();
0413 
0414     uint8_t matches[256];
0415     fuzzy_internal::fuzzy_match_recursive(patternIt, strIt, score, strIt, strEnd, patternEnd, nullptr, matches, 0, totalMatches, recursionCount);
0416 
0417     int j = 0;
0418     for (int i = 0; i < totalMatches; ++i) {
0419         auto matchPos = matches[i];
0420         if (!ranges.isEmpty() && matchPos == j + 1) {
0421             ranges.last().length++;
0422         } else {
0423             ranges.append({matchPos + offset, 1, fmt});
0424         }
0425         j = matchPos;
0426     }
0427 
0428     return ranges;
0429 }
0430 
0431 } // namespace kfts