File indexing completed on 2024-04-28 03:53:48

0001 /*
0002     This file is part of the KDE libraries
0003 
0004     SPDX-FileCopyrightText: 2017 Forrest Smith <forrestthewoods@gmail.com>
0005     SPDX-FileCopyrightText: 2021 Waqar Ahmed   <waqar.17a@gmail.com>
0006 
0007     SPDX-License-Identifier: LGPL-2.0-or-later
0008 */
0009 #include "kfuzzymatcher.h"
0010 
0011 #include <QList>
0012 #include <QString>
0013 #include <QStringView>
0014 
0015 /**
0016  * Custom toLower function which is much faster than
0017  * c.toLower() directly
0018  */
0019 static inline constexpr QChar toLower(QChar c)
0020 {
0021     return c.isLower() ? c : c.toLower();
0022 }
0023 
0024 // internal
0025 // clang-format off
0026 static bool match_recursive(QStringView::const_iterator pattern,
0027                             QStringView::const_iterator str,
0028                             int &outScore,
0029                             const QStringView::const_iterator strBegin,
0030                             const QStringView::const_iterator strEnd,
0031                             const QStringView::const_iterator patternEnd,
0032                             const uint8_t *srcMatches,
0033                             uint8_t *matches,
0034                             int nextMatch,
0035                             int &totalMatches,
0036                             int &recursionCount)
0037 {
0038     static constexpr int recursionLimit = 10;
0039     // max number of matches allowed, this should be enough
0040     static constexpr int maxMatches = 256;
0041 
0042     // Count recursions
0043     ++recursionCount;
0044     if (recursionCount >= recursionLimit) {
0045         return false;
0046     }
0047 
0048     // Detect end of strings
0049     if (pattern == patternEnd || str == strEnd) {
0050         return false;
0051     }
0052 
0053     // Recursion params
0054     bool recursiveMatch = false;
0055     uint8_t bestRecursiveMatches[maxMatches];
0056     int bestRecursiveScore = 0;
0057 
0058     // Loop through pattern and str looking for a match
0059     bool firstMatch = true;
0060     QChar currentPatternChar = toLower(*pattern);
0061     // Are we matching in sequence start from start?
0062     bool matchingInSequence = true;
0063     while (pattern != patternEnd && str != strEnd) {
0064         // Found match
0065         if (currentPatternChar == toLower(*str)) {
0066             // Supplied matches buffer was too short
0067             if (nextMatch >= maxMatches) {
0068                 return false;
0069             }
0070 
0071             // "Copy-on-Write" srcMatches into matches
0072             if (firstMatch && srcMatches) {
0073                 memcpy(matches, srcMatches, nextMatch);
0074                 firstMatch = false;
0075             }
0076 
0077             // Recursive call that "skips" this match
0078             uint8_t recursiveMatches[maxMatches];
0079             int recursiveScore = 0;
0080             const auto strNextChar = std::next(str);
0081             if (!matchingInSequence && match_recursive(pattern, strNextChar, recursiveScore, strBegin,
0082                                 strEnd, patternEnd, matches, recursiveMatches,
0083                                 nextMatch, totalMatches, recursionCount)) {
0084                 // Pick best recursive score
0085                 if (!recursiveMatch || recursiveScore > bestRecursiveScore) {
0086                     memcpy(bestRecursiveMatches, recursiveMatches, maxMatches);
0087                     bestRecursiveScore = recursiveScore;
0088                 }
0089                 recursiveMatch = true;
0090             }
0091 
0092             // Advance
0093             matches[nextMatch++] = (uint8_t)(std::distance(strBegin, str));
0094             ++pattern;
0095             currentPatternChar = toLower(*pattern);
0096         } else {
0097             matchingInSequence = false;
0098         }
0099         ++str;
0100     }
0101 
0102     // Determine if full pattern was matched
0103     const bool matched = pattern == patternEnd;
0104 
0105     // Calculate score
0106     if (matched) {
0107         static constexpr int sequentialBonus = 25;
0108         static constexpr int separatorBonus = 25; // bonus if match occurs after a separator
0109         static constexpr int camelBonus = 25; // bonus if match is uppercase and prev is lower
0110         static constexpr int firstLetterBonus = 15; // bonus if the first letter is matched
0111 
0112         static constexpr int leadingLetterPenalty = -5; // penalty applied for every letter in str before the first match
0113         static constexpr int maxLeadingLetterPenalty = -15; // maximum penalty for leading letters
0114         static constexpr int unmatchedLetterPenalty = -1; // penalty for every letter that doesn't matter
0115 
0116         static constexpr int nonBeginSequenceBonus = 10;
0117 
0118         // Initialize score
0119         outScore = 100;
0120 
0121         // Apply leading letter penalty
0122         const int penalty = std::max(leadingLetterPenalty * matches[0], maxLeadingLetterPenalty);
0123 
0124         outScore += penalty;
0125 
0126         // Apply unmatched penalty
0127         const int unmatched = (int)(std::distance(strBegin, strEnd)) - nextMatch;
0128         outScore += unmatchedLetterPenalty * unmatched;
0129 
0130         uint8_t seqs[maxMatches] = {0};
0131 
0132         // Apply ordering bonuses
0133         int j = 0;
0134         for (int i = 0; i < nextMatch; ++i) {
0135             const uint8_t currIdx = matches[i];
0136 
0137             if (i > 0) {
0138                 const uint8_t prevIdx = matches[i - 1];
0139 
0140                 // Sequential
0141                 if (currIdx == (prevIdx + 1)) {
0142                     if (j > 0 && seqs[j - 1] == i - 1){
0143                         outScore += sequentialBonus;
0144                         seqs[j++] = i;
0145                     } else {
0146                         // In sequence, but from first char
0147                         outScore += nonBeginSequenceBonus;
0148                     }
0149                 }
0150             }
0151 
0152             // Check for bonuses based on neighbor character value
0153             if (currIdx > 0) {
0154                 // Camel case
0155                 const QChar neighbor = *(strBegin + currIdx - 1);
0156                 const QChar curr = *(strBegin + currIdx);
0157                 if (neighbor.isLower() && curr.isUpper()) {
0158                     outScore += camelBonus;
0159                 }
0160 
0161                 // Separator
0162                 const bool neighborSeparator = neighbor == QLatin1Char('_') || neighbor == QLatin1Char(' ');
0163                 if (neighborSeparator) {
0164                     outScore += separatorBonus;
0165                 }
0166             } else {
0167                 // First letter
0168                 outScore += firstLetterBonus;
0169                 seqs[j++] = i;
0170             }
0171         }
0172     }
0173 
0174     totalMatches = nextMatch;
0175 
0176     // Return best result
0177     if (recursiveMatch && (!matched || bestRecursiveScore > outScore)) {
0178         // Recursive score is better than "this"
0179         memcpy(matches, bestRecursiveMatches, maxMatches);
0180         outScore = bestRecursiveScore;
0181         return true;
0182     } else if (matched) {
0183         // "this" score is better than recursive
0184         return true;
0185     } else {
0186         // no match
0187         return false;
0188     }
0189 }
0190 // clang-format on
0191 
0192 static bool match_internal(QStringView pattern, QStringView str, int &outScore, unsigned char *matches)
0193 {
0194     if (pattern.isEmpty()) {
0195         return true;
0196     }
0197 
0198     int recursionCount = 0;
0199 
0200     auto strIt = str.cbegin();
0201     auto patternIt = pattern.cbegin();
0202     const auto patternEnd = pattern.cend();
0203     const auto strEnd = str.cend();
0204 
0205     int total = 0;
0206     return match_recursive(patternIt, strIt, outScore, strIt, strEnd, patternEnd, nullptr, matches, 0, total, recursionCount);
0207 }
0208 
0209 /**************************************************************/
0210 
0211 bool KFuzzyMatcher::matchSimple(QStringView pattern, QStringView str)
0212 {
0213     auto patternIt = pattern.cbegin();
0214     /**
0215      * Instead of doing
0216      *
0217      *      strIt.toLower() == patternIt.toLower()
0218      *
0219      * we convert patternIt to Upper / Lower as needed and compare with strIt. This
0220      * saves us from calling toLower() on both strings, making things a little bit faster
0221      */
0222     bool lower = patternIt->isLower();
0223     QChar cUp = lower ? patternIt->toUpper() : *patternIt;
0224     QChar cLow = lower ? *patternIt : patternIt->toLower();
0225     for (auto strIt = str.cbegin(); strIt != str.cend() && patternIt != pattern.cend(); ++strIt) {
0226         if (*strIt == cLow || *strIt == cUp) {
0227             ++patternIt;
0228             lower = patternIt->isLower();
0229             cUp = lower ? patternIt->toUpper() : *patternIt;
0230             cLow = lower ? *patternIt : patternIt->toLower();
0231         }
0232     }
0233 
0234     return patternIt == pattern.cend();
0235 }
0236 
0237 KFuzzyMatcher::Result KFuzzyMatcher::match(QStringView pattern, QStringView str)
0238 {
0239     /**
0240      * Simple substring matching to flush out non-matching strings
0241      */
0242     const bool simpleMatch = matchSimple(pattern, str);
0243 
0244     KFuzzyMatcher::Result result;
0245     result.matched = false;
0246     result.score = 0;
0247 
0248     if (!simpleMatch) {
0249         return result;
0250     }
0251 
0252     // actual algorithm
0253     int score = 0;
0254     uint8_t matches[256];
0255     const bool matched = match_internal(pattern, str, score, matches);
0256     result.matched = matched;
0257     result.score = score;
0258     return result;
0259 }
0260 
0261 QList<KFuzzyMatcher::Range> KFuzzyMatcher::matchedRanges(QStringView pattern, QStringView str, RangeType type)
0262 {
0263     QList<KFuzzyMatcher::Range> ranges;
0264     if (pattern.isEmpty()) {
0265         return ranges;
0266     }
0267 
0268     int totalMatches = 0;
0269     int score = 0;
0270     int recursionCount = 0;
0271 
0272     auto strIt = str.cbegin();
0273     auto patternIt = pattern.cbegin();
0274     const auto patternEnd = pattern.cend();
0275     const auto strEnd = str.cend();
0276 
0277     uint8_t matches[256];
0278     auto res = match_recursive(patternIt, strIt, score, strIt, strEnd, patternEnd, nullptr, matches, 0, totalMatches, recursionCount);
0279     // didn't match? => We don't care about results
0280     if (!res && type == RangeType::FullyMatched) {
0281         return {};
0282     }
0283 
0284     int previousMatch = 0;
0285     for (int i = 0; i < totalMatches; ++i) {
0286         auto matchPos = matches[i];
0287         /**
0288          * Check if this match is part of the previous
0289          * match. If it is, we increase the length of
0290          * the last range.
0291          */
0292         if (!ranges.isEmpty() && matchPos == previousMatch + 1) {
0293             ranges.last().length++;
0294         } else {
0295             /**
0296              * This is a new match inside the string
0297              */
0298             ranges.push_back({/* start: */ matchPos, /* length: */ 1});
0299         }
0300         previousMatch = matchPos;
0301     }
0302 
0303     return ranges;
0304 }