File indexing completed on 2024-04-21 16:29:08
0001 /* 0002 * SPDX-FileCopyrightText: 2020 Andreas Cord-Landwehr <cordlandwehr@kde.org> 0003 * 0004 * SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL 0005 */ 0006 0007 #include "skipparser.h" 0008 #include <QDebug> 0009 #include <functional> 0010 #include <optional> 0011 #include <set> 0012 0013 // for performance reasons, we need to have to lists 0014 // but they must be kept in sync 0015 const QRegularExpression SkipParser::sSkipCharDetection("[ |\\\n|\\\t|/|\\-|\\*|#]"); 0016 constexpr bool isSkipChar(const QChar &character) 0017 { 0018 switch (character.toLatin1()) { 0019 case ' ': 0020 return true; 0021 case '\n': 0022 return true; 0023 case '\t': 0024 return true; 0025 case '/': 0026 return true; 0027 case '-': 0028 return true; 0029 case '*': 0030 return true; 0031 case '#': 0032 return true; 0033 default: 0034 return false; 0035 } 0036 return false; 0037 } 0038 0039 std::optional<std::pair<int, int>> SkipParser::findMatchNaive(QString text, QString pattern) const 0040 { 0041 if (pattern.length() > text.length()) { 0042 return {}; 0043 } 0044 0045 int start = 0; 0046 while (start <= text.length() - pattern.length()) { 0047 bool matchFound {true}; 0048 int patternSkipOffset {0}; 0049 int textSkipOffset {0}; 0050 for (int i = 0; i < pattern.length(); ++i) { 0051 // skip pattern char if contained in skip-list 0052 // outcondition: valid char that can be matched 0053 if (isSkipChar(pattern.at(i))) { 0054 ++patternSkipOffset; 0055 continue; 0056 } 0057 while (isSkipChar(text.at(start + textSkipOffset + i - patternSkipOffset))) { 0058 if (start + textSkipOffset + i - patternSkipOffset + 1 >= text.length() - 1) { 0059 matchFound = false; // we know this, because otherwise we would not be in this loop 0060 break; 0061 } 0062 ++textSkipOffset; 0063 } 0064 if (text.at(start + textSkipOffset + i - patternSkipOffset) != pattern.at(i)) { 0065 matchFound = false; 0066 break; 0067 } 0068 } 0069 if (matchFound) { 0070 return std::optional<std::pair<int, int>> {{start, start + textSkipOffset + pattern.length() - patternSkipOffset - 1}}; 0071 } else { 0072 ++start; 0073 } 0074 } 0075 0076 return {}; 0077 } 0078 0079 std::vector<int> SkipParser::computeKmpPrefix(const QString &pattern) const 0080 { 0081 const int length = pattern.length(); 0082 std::vector<int> prefix(length); 0083 prefix[0] = 0; 0084 int k = 0; 0085 for (int q = 2; q <= length; ++q) { 0086 while (k > 0 && pattern.at(k) != pattern.at(q - 1)) { 0087 k = prefix.at(k - 1); 0088 } 0089 if (pattern.at(k) == pattern.at(q - 1)) { 0090 k = k + 1; 0091 } 0092 prefix[q - 1] = k; 0093 } 0094 return prefix; 0095 } 0096 0097 std::optional<std::pair<int, int>> SkipParser::findMatchKMP(std::vector<QChar> prunedText, std::vector<int> textSkipPrefix, QString pattern) const 0098 { 0099 // obtain prefix 0100 std::vector<int> prefix; 0101 if (mPrefixCache.contains(pattern)) { 0102 prefix = mPrefixCache[pattern]; 0103 } else { 0104 prefix = computeKmpPrefix(pattern); 0105 mPrefixCache.insert(pattern, prefix); 0106 } 0107 0108 // KMP Matcher 0109 const int textLength = prunedText.size(); 0110 int q = 0; 0111 for (int i = 1; i <= textLength; ++i) { 0112 while (q > 0 && pattern.at(q) != prunedText.at(i - 1)) { 0113 q = prefix.at(q - 1); 0114 } 0115 if (pattern.at(q) == prunedText.at(i - 1)) { 0116 q = q + 1; 0117 } 0118 if (q == pattern.length()) { 0119 return std::optional<std::pair<int, int>> {{i - q + textSkipPrefix.at(i - q), i - 1 + textSkipPrefix.at(i - 1)}}; 0120 } 0121 } 0122 0123 return {}; 0124 } 0125 0126 std::optional<std::pair<int, int>> SkipParser::findMatch(QString text, QString pattern) const 0127 { 0128 auto textPreprocessing = computeTextSkipPrefix(text); 0129 auto match = findMatchKMP(textPreprocessing.first, textPreprocessing.second, pattern.remove(sSkipCharDetection)); 0130 qDebug() << "text :" << text; 0131 qDebug() << "pattern:" << pattern; 0132 // qDebug() << "start: " << start; 0133 // qDebug() << "skip: " << textSkipOffset << " / " << patternSkipOffset; 0134 return match; 0135 } 0136 0137 std::pair<std::vector<QChar>, std::vector<int>> SkipParser::computeTextSkipPrefix(const QString &text) const 0138 { 0139 // prune text and compute skip prefix 0140 // skip prefix notes number of skipped chars for each position in "text" 0141 std::vector<int> textSkipPrefix(text.size()); 0142 std::vector<QChar> prunedText; 0143 prunedText.reserve(text.size()); 0144 int skipped = 0; 0145 for (int i = 0; i < text.size(); ++i) { 0146 if (isSkipChar(text.at(i))) { 0147 textSkipPrefix.at(i - skipped) = skipped; 0148 ++skipped; 0149 } else { 0150 prunedText.push_back(text.at(i)); 0151 textSkipPrefix.at(i - skipped) = skipped; 0152 } 0153 } 0154 prunedText.shrink_to_fit(); 0155 return {prunedText, textSkipPrefix}; 0156 } 0157 0158 std::optional<std::pair<int, int>> SkipParser::findMatch(QString text, QVector<QString> patterns) const 0159 { 0160 // KMP can work with pruned patterns 0161 QSet<QString> prunedPatterns; 0162 // prune all skip chars from pattern 0163 for (const auto &pattern : patterns) { 0164 QString tmpPattern = pattern; 0165 tmpPattern.remove(sSkipCharDetection); 0166 prunedPatterns.insert(tmpPattern); 0167 } 0168 // qDebug() << "Pruned canonical texts:" << (patterns.count() - prunedPatterns.count()); 0169 0170 auto textPreprocessing = computeTextSkipPrefix(text); 0171 for (const auto &pattern : prunedPatterns) { 0172 if (auto match = findMatchKMP(textPreprocessing.first, textPreprocessing.second, pattern)) { 0173 return match; 0174 } 0175 } 0176 return {}; 0177 }