File indexing completed on 2024-04-28 03:53:48
0001 /* 0002 This file is part of the KDE libraries 0003 0004 SPDX-FileCopyrightText: 2017 Forrest Smith <forrestthewoods@gmail.com> 0005 SPDX-FileCopyrightText: 2021 Waqar Ahmed <waqar.17a@gmail.com> 0006 0007 SPDX-License-Identifier: LGPL-2.0-or-later 0008 */ 0009 #include "kfuzzymatcher.h" 0010 0011 #include <QList> 0012 #include <QString> 0013 #include <QStringView> 0014 0015 /** 0016 * Custom toLower function which is much faster than 0017 * c.toLower() directly 0018 */ 0019 static inline constexpr QChar toLower(QChar c) 0020 { 0021 return c.isLower() ? c : c.toLower(); 0022 } 0023 0024 // internal 0025 // clang-format off 0026 static bool match_recursive(QStringView::const_iterator pattern, 0027 QStringView::const_iterator str, 0028 int &outScore, 0029 const QStringView::const_iterator strBegin, 0030 const QStringView::const_iterator strEnd, 0031 const QStringView::const_iterator patternEnd, 0032 const uint8_t *srcMatches, 0033 uint8_t *matches, 0034 int nextMatch, 0035 int &totalMatches, 0036 int &recursionCount) 0037 { 0038 static constexpr int recursionLimit = 10; 0039 // max number of matches allowed, this should be enough 0040 static constexpr int maxMatches = 256; 0041 0042 // Count recursions 0043 ++recursionCount; 0044 if (recursionCount >= recursionLimit) { 0045 return false; 0046 } 0047 0048 // Detect end of strings 0049 if (pattern == patternEnd || str == strEnd) { 0050 return false; 0051 } 0052 0053 // Recursion params 0054 bool recursiveMatch = false; 0055 uint8_t bestRecursiveMatches[maxMatches]; 0056 int bestRecursiveScore = 0; 0057 0058 // Loop through pattern and str looking for a match 0059 bool firstMatch = true; 0060 QChar currentPatternChar = toLower(*pattern); 0061 // Are we matching in sequence start from start? 0062 bool matchingInSequence = true; 0063 while (pattern != patternEnd && str != strEnd) { 0064 // Found match 0065 if (currentPatternChar == toLower(*str)) { 0066 // Supplied matches buffer was too short 0067 if (nextMatch >= maxMatches) { 0068 return false; 0069 } 0070 0071 // "Copy-on-Write" srcMatches into matches 0072 if (firstMatch && srcMatches) { 0073 memcpy(matches, srcMatches, nextMatch); 0074 firstMatch = false; 0075 } 0076 0077 // Recursive call that "skips" this match 0078 uint8_t recursiveMatches[maxMatches]; 0079 int recursiveScore = 0; 0080 const auto strNextChar = std::next(str); 0081 if (!matchingInSequence && match_recursive(pattern, strNextChar, recursiveScore, strBegin, 0082 strEnd, patternEnd, matches, recursiveMatches, 0083 nextMatch, totalMatches, recursionCount)) { 0084 // Pick best recursive score 0085 if (!recursiveMatch || recursiveScore > bestRecursiveScore) { 0086 memcpy(bestRecursiveMatches, recursiveMatches, maxMatches); 0087 bestRecursiveScore = recursiveScore; 0088 } 0089 recursiveMatch = true; 0090 } 0091 0092 // Advance 0093 matches[nextMatch++] = (uint8_t)(std::distance(strBegin, str)); 0094 ++pattern; 0095 currentPatternChar = toLower(*pattern); 0096 } else { 0097 matchingInSequence = false; 0098 } 0099 ++str; 0100 } 0101 0102 // Determine if full pattern was matched 0103 const bool matched = pattern == patternEnd; 0104 0105 // Calculate score 0106 if (matched) { 0107 static constexpr int sequentialBonus = 25; 0108 static constexpr int separatorBonus = 25; // bonus if match occurs after a separator 0109 static constexpr int camelBonus = 25; // bonus if match is uppercase and prev is lower 0110 static constexpr int firstLetterBonus = 15; // bonus if the first letter is matched 0111 0112 static constexpr int leadingLetterPenalty = -5; // penalty applied for every letter in str before the first match 0113 static constexpr int maxLeadingLetterPenalty = -15; // maximum penalty for leading letters 0114 static constexpr int unmatchedLetterPenalty = -1; // penalty for every letter that doesn't matter 0115 0116 static constexpr int nonBeginSequenceBonus = 10; 0117 0118 // Initialize score 0119 outScore = 100; 0120 0121 // Apply leading letter penalty 0122 const int penalty = std::max(leadingLetterPenalty * matches[0], maxLeadingLetterPenalty); 0123 0124 outScore += penalty; 0125 0126 // Apply unmatched penalty 0127 const int unmatched = (int)(std::distance(strBegin, strEnd)) - nextMatch; 0128 outScore += unmatchedLetterPenalty * unmatched; 0129 0130 uint8_t seqs[maxMatches] = {0}; 0131 0132 // Apply ordering bonuses 0133 int j = 0; 0134 for (int i = 0; i < nextMatch; ++i) { 0135 const uint8_t currIdx = matches[i]; 0136 0137 if (i > 0) { 0138 const uint8_t prevIdx = matches[i - 1]; 0139 0140 // Sequential 0141 if (currIdx == (prevIdx + 1)) { 0142 if (j > 0 && seqs[j - 1] == i - 1){ 0143 outScore += sequentialBonus; 0144 seqs[j++] = i; 0145 } else { 0146 // In sequence, but from first char 0147 outScore += nonBeginSequenceBonus; 0148 } 0149 } 0150 } 0151 0152 // Check for bonuses based on neighbor character value 0153 if (currIdx > 0) { 0154 // Camel case 0155 const QChar neighbor = *(strBegin + currIdx - 1); 0156 const QChar curr = *(strBegin + currIdx); 0157 if (neighbor.isLower() && curr.isUpper()) { 0158 outScore += camelBonus; 0159 } 0160 0161 // Separator 0162 const bool neighborSeparator = neighbor == QLatin1Char('_') || neighbor == QLatin1Char(' '); 0163 if (neighborSeparator) { 0164 outScore += separatorBonus; 0165 } 0166 } else { 0167 // First letter 0168 outScore += firstLetterBonus; 0169 seqs[j++] = i; 0170 } 0171 } 0172 } 0173 0174 totalMatches = nextMatch; 0175 0176 // Return best result 0177 if (recursiveMatch && (!matched || bestRecursiveScore > outScore)) { 0178 // Recursive score is better than "this" 0179 memcpy(matches, bestRecursiveMatches, maxMatches); 0180 outScore = bestRecursiveScore; 0181 return true; 0182 } else if (matched) { 0183 // "this" score is better than recursive 0184 return true; 0185 } else { 0186 // no match 0187 return false; 0188 } 0189 } 0190 // clang-format on 0191 0192 static bool match_internal(QStringView pattern, QStringView str, int &outScore, unsigned char *matches) 0193 { 0194 if (pattern.isEmpty()) { 0195 return true; 0196 } 0197 0198 int recursionCount = 0; 0199 0200 auto strIt = str.cbegin(); 0201 auto patternIt = pattern.cbegin(); 0202 const auto patternEnd = pattern.cend(); 0203 const auto strEnd = str.cend(); 0204 0205 int total = 0; 0206 return match_recursive(patternIt, strIt, outScore, strIt, strEnd, patternEnd, nullptr, matches, 0, total, recursionCount); 0207 } 0208 0209 /**************************************************************/ 0210 0211 bool KFuzzyMatcher::matchSimple(QStringView pattern, QStringView str) 0212 { 0213 auto patternIt = pattern.cbegin(); 0214 /** 0215 * Instead of doing 0216 * 0217 * strIt.toLower() == patternIt.toLower() 0218 * 0219 * we convert patternIt to Upper / Lower as needed and compare with strIt. This 0220 * saves us from calling toLower() on both strings, making things a little bit faster 0221 */ 0222 bool lower = patternIt->isLower(); 0223 QChar cUp = lower ? patternIt->toUpper() : *patternIt; 0224 QChar cLow = lower ? *patternIt : patternIt->toLower(); 0225 for (auto strIt = str.cbegin(); strIt != str.cend() && patternIt != pattern.cend(); ++strIt) { 0226 if (*strIt == cLow || *strIt == cUp) { 0227 ++patternIt; 0228 lower = patternIt->isLower(); 0229 cUp = lower ? patternIt->toUpper() : *patternIt; 0230 cLow = lower ? *patternIt : patternIt->toLower(); 0231 } 0232 } 0233 0234 return patternIt == pattern.cend(); 0235 } 0236 0237 KFuzzyMatcher::Result KFuzzyMatcher::match(QStringView pattern, QStringView str) 0238 { 0239 /** 0240 * Simple substring matching to flush out non-matching strings 0241 */ 0242 const bool simpleMatch = matchSimple(pattern, str); 0243 0244 KFuzzyMatcher::Result result; 0245 result.matched = false; 0246 result.score = 0; 0247 0248 if (!simpleMatch) { 0249 return result; 0250 } 0251 0252 // actual algorithm 0253 int score = 0; 0254 uint8_t matches[256]; 0255 const bool matched = match_internal(pattern, str, score, matches); 0256 result.matched = matched; 0257 result.score = score; 0258 return result; 0259 } 0260 0261 QList<KFuzzyMatcher::Range> KFuzzyMatcher::matchedRanges(QStringView pattern, QStringView str, RangeType type) 0262 { 0263 QList<KFuzzyMatcher::Range> ranges; 0264 if (pattern.isEmpty()) { 0265 return ranges; 0266 } 0267 0268 int totalMatches = 0; 0269 int score = 0; 0270 int recursionCount = 0; 0271 0272 auto strIt = str.cbegin(); 0273 auto patternIt = pattern.cbegin(); 0274 const auto patternEnd = pattern.cend(); 0275 const auto strEnd = str.cend(); 0276 0277 uint8_t matches[256]; 0278 auto res = match_recursive(patternIt, strIt, score, strIt, strEnd, patternEnd, nullptr, matches, 0, totalMatches, recursionCount); 0279 // didn't match? => We don't care about results 0280 if (!res && type == RangeType::FullyMatched) { 0281 return {}; 0282 } 0283 0284 int previousMatch = 0; 0285 for (int i = 0; i < totalMatches; ++i) { 0286 auto matchPos = matches[i]; 0287 /** 0288 * Check if this match is part of the previous 0289 * match. If it is, we increase the length of 0290 * the last range. 0291 */ 0292 if (!ranges.isEmpty() && matchPos == previousMatch + 1) { 0293 ranges.last().length++; 0294 } else { 0295 /** 0296 * This is a new match inside the string 0297 */ 0298 ranges.push_back({/* start: */ matchPos, /* length: */ 1}); 0299 } 0300 previousMatch = matchPos; 0301 } 0302 0303 return ranges; 0304 }