File indexing completed on 2024-04-28 05:49:36
0001 /* 0002 SPDX-FileCopyrightText: 2017 Forrest Smith 0003 SPDX-FileCopyrightText: 2020 Waqar Ahmed 0004 0005 SPDX-License-Identifier: LGPL-2.0-or-later 0006 */ 0007 0008 #pragma once 0009 0010 #include <QDebug> 0011 #include <QString> 0012 #include <QTextLayout> 0013 0014 /** 0015 * This is based on https://github.com/forrestthewoods/lib_fts/blob/master/code/fts_fuzzy_match.h 0016 * with modifications for Qt 0017 * 0018 * Dont include this file in a header file, please :) 0019 */ 0020 0021 namespace kfts 0022 { 0023 /** 0024 * @brief simple fuzzy matching of chars in @a pattern with chars in @a str sequentially 0025 */ 0026 Q_DECL_UNUSED static bool fuzzy_match_simple(const QStringView pattern, const QStringView str); 0027 0028 /** 0029 * @brief This should be the main function you should use. @a outscore is the score 0030 * of this match and should be used to sort the results later. Without sorting of the 0031 * results this function won't be as effective. 0032 */ 0033 Q_DECL_UNUSED static bool fuzzy_match(const QStringView pattern, const QStringView str, int &outScore); 0034 Q_DECL_UNUSED static bool fuzzy_match(const QStringView pattern, const QStringView str, int &outScore, uint8_t *matches); 0035 0036 /** 0037 * @brief get string for display in treeview / listview. This should be used from style delegate. 0038 * For example: with @a pattern = "kate", @a str = "kateapp" and @htmlTag = "<b> 0039 * the output will be <b>k</b><b>a</b><b>t</b><b>e</b>app which will be visible to user as 0040 * <b>kate</b>app. 0041 * 0042 * TODO: improve this so that we don't have to put html tags on every char probably using some kind 0043 * of interval container 0044 */ 0045 Q_DECL_UNUSED static QString to_fuzzy_matched_display_string(const QStringView pattern, QString &str, const QString &htmlTag, const QString &htmlTagClose); 0046 Q_DECL_UNUSED static QString 0047 to_scored_fuzzy_matched_display_string(const QStringView pattern, QString &str, const QString &htmlTag, const QString &htmlTagClose); 0048 } 0049 0050 namespace kfts 0051 { 0052 // Forward declarations for "private" implementation 0053 namespace fuzzy_internal 0054 { 0055 static inline constexpr QChar toLower(QChar c) 0056 { 0057 return c.isLower() ? c : c.toLower(); 0058 } 0059 0060 static bool fuzzy_match_recursive(QStringView::const_iterator pattern, 0061 QStringView::const_iterator str, 0062 int &outScore, 0063 const QStringView::const_iterator strBegin, 0064 const QStringView::const_iterator strEnd, 0065 const QStringView::const_iterator patternEnd, 0066 uint8_t const *srcMatches, 0067 uint8_t *newMatches, 0068 int nextMatch, 0069 int &totalMatches, 0070 int &recursionCount); 0071 } 0072 0073 // Public interface 0074 static bool fuzzy_match_simple(const QStringView pattern, const QStringView str) 0075 { 0076 if (pattern.length() == 1) { 0077 return str.contains(pattern, Qt::CaseInsensitive); 0078 } 0079 0080 auto patternIt = pattern.cbegin(); 0081 for (auto strIt = str.cbegin(); strIt != str.cend() && patternIt != pattern.cend(); ++strIt) { 0082 if (strIt->toLower() == patternIt->toLower()) { 0083 ++patternIt; 0084 } 0085 } 0086 return patternIt == pattern.cend(); 0087 } 0088 0089 static bool fuzzy_match(const QStringView pattern, const QStringView str, int &outScore) 0090 { 0091 if (pattern.length() == 1) { 0092 const int found = str.indexOf(pattern, Qt::CaseInsensitive); 0093 if (found >= 0) { 0094 outScore = 250 - found; 0095 outScore += pattern.at(0) == str.at(found); 0096 return true; 0097 } else { 0098 outScore = 0; 0099 return false; 0100 } 0101 } 0102 0103 // simple substring matching to flush out non-matching stuff 0104 auto patternIt = pattern.cbegin(); 0105 bool lower = patternIt->isLower(); 0106 QChar cUp = lower ? patternIt->toUpper() : *patternIt; 0107 QChar cLow = lower ? *patternIt : patternIt->toLower(); 0108 for (auto strIt = str.cbegin(); strIt != str.cend() && patternIt != pattern.cend(); ++strIt) { 0109 if (*strIt == cLow || *strIt == cUp) { 0110 ++patternIt; 0111 lower = patternIt->isLower(); 0112 cUp = lower ? patternIt->toUpper() : *patternIt; 0113 cLow = lower ? *patternIt : patternIt->toLower(); 0114 } 0115 } 0116 0117 if (patternIt != pattern.cend()) { 0118 outScore = 0; 0119 return false; 0120 } 0121 0122 uint8_t matches[256]; 0123 return fuzzy_match(pattern, str, outScore, matches); 0124 } 0125 0126 static bool fuzzy_match(const QStringView pattern, const QStringView str, int &outScore, uint8_t *matches) 0127 { 0128 int recursionCount = 0; 0129 0130 auto strIt = str.cbegin(); 0131 auto patternIt = pattern.cbegin(); 0132 const auto patternEnd = pattern.cend(); 0133 const auto strEnd = str.cend(); 0134 int totalMatches = 0; 0135 0136 return fuzzy_internal::fuzzy_match_recursive(patternIt, strIt, outScore, strIt, strEnd, patternEnd, nullptr, matches, 0, totalMatches, recursionCount); 0137 } 0138 0139 // Private implementation 0140 static bool fuzzy_internal::fuzzy_match_recursive(QStringView::const_iterator pattern, 0141 QStringView::const_iterator str, 0142 int &outScore, 0143 const QStringView::const_iterator strBegin, 0144 const QStringView::const_iterator strEnd, 0145 const QStringView::const_iterator patternEnd, 0146 const uint8_t *srcMatches, 0147 uint8_t *matches, 0148 int nextMatch, 0149 int &totalMatches, 0150 int &recursionCount) 0151 { 0152 static constexpr int recursionLimit = 10; 0153 // max number of matches allowed, this should be enough 0154 static constexpr int maxMatches = 256; 0155 0156 // Count recursions 0157 ++recursionCount; 0158 if (recursionCount >= recursionLimit) { 0159 return false; 0160 } 0161 0162 // Detect end of strings 0163 if (pattern == patternEnd || str == strEnd) { 0164 return false; 0165 } 0166 0167 // Recursion params 0168 bool recursiveMatch = false; 0169 uint8_t bestRecursiveMatches[maxMatches]; 0170 int bestRecursiveScore = 0; 0171 0172 // Loop through pattern and str looking for a match 0173 bool firstMatch = true; 0174 QChar currentPatternChar = toLower(*pattern); 0175 // Are we matching in sequence start from start? 0176 while (pattern != patternEnd && str != strEnd) { 0177 // Found match 0178 if (currentPatternChar == toLower(*str)) { 0179 // Supplied matches buffer was too short 0180 if (nextMatch >= maxMatches) { 0181 return false; 0182 } 0183 0184 // "Copy-on-Write" srcMatches into matches 0185 if (firstMatch && srcMatches) { 0186 memcpy(matches, srcMatches, nextMatch); 0187 firstMatch = false; 0188 } 0189 0190 // Recursive call that "skips" this match 0191 uint8_t recursiveMatches[maxMatches]; 0192 int recursiveScore = 0; 0193 const auto strNextChar = std::next(str); 0194 if (fuzzy_match_recursive(pattern, 0195 strNextChar, 0196 recursiveScore, 0197 strBegin, 0198 strEnd, 0199 patternEnd, 0200 matches, 0201 recursiveMatches, 0202 nextMatch, 0203 totalMatches, 0204 recursionCount)) { 0205 // Pick best recursive score 0206 if (!recursiveMatch || recursiveScore > bestRecursiveScore) { 0207 memcpy(bestRecursiveMatches, recursiveMatches, maxMatches); 0208 bestRecursiveScore = recursiveScore; 0209 } 0210 recursiveMatch = true; 0211 } 0212 0213 // Advance 0214 matches[nextMatch++] = (uint8_t)(std::distance(strBegin, str)); 0215 ++pattern; 0216 currentPatternChar = toLower(*pattern); 0217 } 0218 ++str; 0219 } 0220 0221 // Determine if full pattern was matched 0222 const bool matched = pattern == patternEnd; 0223 0224 // Calculate score 0225 if (matched) { 0226 static constexpr int firstSepScoreDiff = 3; 0227 0228 static constexpr int sequentialBonus = 20; 0229 static constexpr int separatorBonus = 25; // bonus if match occurs after a separator 0230 static constexpr int firstLetterBonus = 15; // bonus if the first letter is matched 0231 static constexpr int firstLetterSepMatchBonus = firstLetterBonus - firstSepScoreDiff; // bonus if the first matched letter is camel or separator 0232 0233 static constexpr int unmatchedLetterPenalty = -1; // penalty for every letter that doesn't matter 0234 0235 int nonBeginSequenceBonus = 10; 0236 // points by which nonBeginSequenceBonus is increment on every matched letter 0237 static constexpr int nonBeginSequenceIncrement = 4; 0238 0239 // Initialize score 0240 outScore = 100; 0241 0242 #define debug_algo 0 0243 #if debug_algo 0244 #define dbg(...) qDebug(__VA_ARGS__) 0245 #else 0246 #define dbg(...) 0247 #endif 0248 0249 // Apply unmatched penalty 0250 const int unmatched = (int)(std::distance(strBegin, strEnd)) - nextMatch; 0251 outScore += unmatchedLetterPenalty * unmatched; 0252 dbg("unmatchedLetterPenalty, unmatched count: %d, outScore: %d", unmatched, outScore); 0253 0254 bool inSeparatorSeq = false; 0255 int i = 0; 0256 if (matches[i] == 0) { 0257 // First letter match has the highest score 0258 outScore += firstLetterBonus + separatorBonus; 0259 dbg("firstLetterBonus, outScore: %d", outScore); 0260 inSeparatorSeq = true; 0261 } else { 0262 const QChar neighbor = *(strBegin + matches[i] - 1); 0263 const QChar curr = *(strBegin + matches[i]); 0264 const bool neighborSeparator = neighbor == QLatin1Char('_') || neighbor == QLatin1Char(' '); 0265 if (neighborSeparator || (neighbor.isLower() && curr.isUpper())) { 0266 // the first letter that got matched was a sepcial char .e., camel or at a separator 0267 outScore += firstLetterSepMatchBonus + separatorBonus; 0268 dbg("firstLetterSepMatchBonus at %d, letter: %c, outScore: %d", matches[i], curr.toLatin1(), outScore); 0269 inSeparatorSeq = true; 0270 } else { 0271 // nothing 0272 nonBeginSequenceBonus += nonBeginSequenceIncrement; 0273 } 0274 // We didn't match any special positions, apply leading penalty 0275 outScore += -(matches[i]); 0276 dbg("LeadingPenalty because no first letter match, outScore: %d", outScore); 0277 } 0278 i++; 0279 0280 bool allConsecutive = true; 0281 // Apply ordering bonuses 0282 for (; i < nextMatch; ++i) { 0283 const uint8_t currIdx = matches[i]; 0284 const uint8_t prevIdx = matches[i - 1]; 0285 // Sequential 0286 if (currIdx == (prevIdx + 1)) { 0287 if (i == matches[i]) { 0288 // We are in a sequence beginning from first letter 0289 outScore += sequentialBonus; 0290 dbg("sequentialBonus at %d, letter: %c, outScore: %d", matches[i], (strBegin + currIdx)->toLatin1(), outScore); 0291 } else if (inSeparatorSeq) { 0292 // we are in a sequnce beginning from a separator like camelHump or underscore 0293 outScore += sequentialBonus - firstSepScoreDiff; 0294 dbg("in separator seq, [sequentialBonus - 5] at %d, letter: %c, outScore: %d", matches[i], (strBegin + currIdx)->toLatin1(), outScore); 0295 } else { 0296 // We are in a random sequence 0297 outScore += nonBeginSequenceBonus; 0298 nonBeginSequenceBonus += nonBeginSequenceIncrement; 0299 dbg("nonBeginSequenceBonus at %d, letter: %c, outScore: %d", matches[i], (strBegin + currIdx)->toLatin1(), outScore); 0300 } 0301 } else { 0302 allConsecutive = false; 0303 0304 // there is a gap between matching chars, apply penalty 0305 int penalty = -((currIdx - prevIdx)) - 2; 0306 outScore += penalty; 0307 inSeparatorSeq = false; 0308 nonBeginSequenceBonus = 10; 0309 dbg("gap penalty[%d] at %d, letter: %c, outScore: %d", penalty, matches[i], (strBegin + currIdx)->toLatin1(), outScore); 0310 } 0311 0312 // Check for bonuses based on neighbor character value 0313 // Camel case 0314 const QChar neighbor = *(strBegin + currIdx - 1); 0315 const QChar curr = *(strBegin + currIdx); 0316 // if camel case bonus, then not snake / separator. 0317 // This prevents double bonuses 0318 const bool neighborSeparator = neighbor == QLatin1Char('_') || neighbor == QLatin1Char(' '); 0319 if (neighborSeparator || (neighbor.isLower() && curr.isUpper())) { 0320 outScore += separatorBonus; 0321 dbg("separatorBonus at %d, letter: %c, outScore: %d", matches[i], (strBegin + currIdx)->toLatin1(), outScore); 0322 inSeparatorSeq = true; 0323 continue; 0324 } 0325 } 0326 0327 if (allConsecutive && nextMatch >= 4) { 0328 outScore *= 2; 0329 dbg("allConsecutive double the score, outScore: %d", outScore); 0330 } 0331 } 0332 0333 totalMatches = nextMatch; 0334 // Return best result 0335 if (recursiveMatch && (!matched || bestRecursiveScore > outScore)) { 0336 // Recursive score is better than "this" 0337 memcpy(matches, bestRecursiveMatches, maxMatches); 0338 outScore = bestRecursiveScore; 0339 return true; 0340 } else if (matched) { 0341 // "this" score is better than recursive 0342 return true; 0343 } else { 0344 // no match 0345 return false; 0346 } 0347 } 0348 0349 static QString to_fuzzy_matched_display_string(const QStringView pattern, QString &str, const QString &htmlTag, const QString &htmlTagClose) 0350 { 0351 /** 0352 * FIXME Don't do so many appends. Instead consider using some interval based solution to wrap a range 0353 * of text with the html <tag></tag> 0354 */ 0355 int j = 0; 0356 for (int i = 0; i < str.size() && j < pattern.size(); ++i) { 0357 if (fuzzy_internal::toLower(str.at(i)) == fuzzy_internal::toLower(pattern.at(j))) { 0358 str.replace(i, 1, htmlTag + str.at(i) + htmlTagClose); 0359 i += htmlTag.size() + htmlTagClose.size(); 0360 ++j; 0361 } 0362 } 0363 return str; 0364 } 0365 0366 static QString to_scored_fuzzy_matched_display_string(const QStringView pattern, QString &str, const QString &htmlTag, const QString &htmlTagClose) 0367 { 0368 if (pattern.isEmpty()) { 0369 return str; 0370 } 0371 0372 uint8_t matches[256]; 0373 int totalMatches = 0; 0374 { 0375 int score = 0; 0376 int recursionCount = 0; 0377 0378 auto strIt = str.cbegin(); 0379 auto patternIt = pattern.cbegin(); 0380 const auto patternEnd = pattern.cend(); 0381 const auto strEnd = str.cend(); 0382 0383 fuzzy_internal::fuzzy_match_recursive(patternIt, strIt, score, strIt, strEnd, patternEnd, nullptr, matches, 0, totalMatches, recursionCount); 0384 } 0385 0386 int offset = 0; 0387 for (int i = 0; i < totalMatches; ++i) { 0388 str.insert(matches[i] + offset, htmlTag); 0389 offset += htmlTag.size(); 0390 str.insert(matches[i] + offset + 1, htmlTagClose); 0391 offset += htmlTagClose.size(); 0392 } 0393 0394 return str; 0395 } 0396 0397 Q_DECL_UNUSED static QList<QTextLayout::FormatRange> 0398 get_fuzzy_match_formats(const QStringView pattern, const QStringView str, int offset, const QTextCharFormat &fmt) 0399 { 0400 QList<QTextLayout::FormatRange> ranges; 0401 if (pattern.isEmpty()) { 0402 return ranges; 0403 } 0404 0405 int totalMatches = 0; 0406 int score = 0; 0407 int recursionCount = 0; 0408 0409 auto strIt = str.cbegin(); 0410 auto patternIt = pattern.cbegin(); 0411 const auto patternEnd = pattern.cend(); 0412 const auto strEnd = str.cend(); 0413 0414 uint8_t matches[256]; 0415 fuzzy_internal::fuzzy_match_recursive(patternIt, strIt, score, strIt, strEnd, patternEnd, nullptr, matches, 0, totalMatches, recursionCount); 0416 0417 int j = 0; 0418 for (int i = 0; i < totalMatches; ++i) { 0419 auto matchPos = matches[i]; 0420 if (!ranges.isEmpty() && matchPos == j + 1) { 0421 ranges.last().length++; 0422 } else { 0423 ranges.append({matchPos + offset, 1, fmt}); 0424 } 0425 j = matchPos; 0426 } 0427 0428 return ranges; 0429 } 0430 0431 } // namespace kfts