File indexing completed on 2024-04-21 05:44:41

0001 /*
0002  *  SPDX-FileCopyrightText: 2020  Andreas Cord-Landwehr <cordlandwehr@kde.org>
0003  *
0004  *  SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
0005  */
0006 
0007 #include "skipparser.h"
0008 #include <QDebug>
0009 #include <functional>
0010 #include <optional>
0011 #include <set>
0012 
0013 // for performance reasons, we need to have to lists
0014 // but they must be kept in sync
0015 const QRegularExpression SkipParser::sSkipCharDetection("[ |\\\n|\\\t|/|\\-|\\*|#]");
0016 constexpr bool isSkipChar(const QChar &character)
0017 {
0018     switch (character.toLatin1()) {
0019     case ' ':
0020         return true;
0021     case '\n':
0022         return true;
0023     case '\t':
0024         return true;
0025     case '/':
0026         return true;
0027     case '-':
0028         return true;
0029     case '*':
0030         return true;
0031     case '#':
0032         return true;
0033     default:
0034         return false;
0035     }
0036     return false;
0037 }
0038 
0039 std::optional<std::pair<int, int>> SkipParser::findMatchNaive(QString text, QString pattern) const
0040 {
0041     if (pattern.length() > text.length()) {
0042         return {};
0043     }
0044 
0045     int start = 0;
0046     while (start <= text.length() - pattern.length()) {
0047         bool matchFound {true};
0048         int patternSkipOffset {0};
0049         int textSkipOffset {0};
0050         for (int i = 0; i < pattern.length(); ++i) {
0051             // skip pattern char if contained in skip-list
0052             // outcondition: valid char that can be matched
0053             if (isSkipChar(pattern.at(i))) {
0054                 ++patternSkipOffset;
0055                 continue;
0056             }
0057             while (isSkipChar(text.at(start + textSkipOffset + i - patternSkipOffset))) {
0058                 if (start + textSkipOffset + i - patternSkipOffset + 1 >= text.length() - 1) {
0059                     matchFound = false; // we know this, because otherwise we would not be in this loop
0060                     break;
0061                 }
0062                 ++textSkipOffset;
0063             }
0064             if (text.at(start + textSkipOffset + i - patternSkipOffset) != pattern.at(i)) {
0065                 matchFound = false;
0066                 break;
0067             }
0068         }
0069         if (matchFound) {
0070             return std::optional<std::pair<int, int>> {{start, start + textSkipOffset + pattern.length() - patternSkipOffset - 1}};
0071         } else {
0072             ++start;
0073         }
0074     }
0075 
0076     return {};
0077 }
0078 
0079 std::vector<int> SkipParser::computeKmpPrefix(const QString &pattern) const
0080 {
0081     const int length = pattern.length();
0082     std::vector<int> prefix(length);
0083     prefix[0] = 0;
0084     int k = 0;
0085     for (int q = 2; q <= length; ++q) {
0086         while (k > 0 && pattern.at(k) != pattern.at(q - 1)) {
0087             k = prefix.at(k - 1);
0088         }
0089         if (pattern.at(k) == pattern.at(q - 1)) {
0090             k = k + 1;
0091         }
0092         prefix[q - 1] = k;
0093     }
0094     return prefix;
0095 }
0096 
0097 std::optional<std::pair<int, int>> SkipParser::findMatchKMP(std::vector<QChar> prunedText, std::vector<int> textSkipPrefix, QString pattern) const
0098 {
0099     // obtain prefix
0100     std::vector<int> prefix;
0101     if (mPrefixCache.contains(pattern)) {
0102         prefix = mPrefixCache[pattern];
0103     } else {
0104         prefix = computeKmpPrefix(pattern);
0105         mPrefixCache.insert(pattern, prefix);
0106     }
0107 
0108     // KMP Matcher
0109     const int textLength = prunedText.size();
0110     int q = 0;
0111     for (int i = 1; i <= textLength; ++i) {
0112         while (q > 0 && pattern.at(q) != prunedText.at(i - 1)) {
0113             q = prefix.at(q - 1);
0114         }
0115         if (pattern.at(q) == prunedText.at(i - 1)) {
0116             q = q + 1;
0117         }
0118         if (q == pattern.length()) {
0119             return std::optional<std::pair<int, int>> {{i - q + textSkipPrefix.at(i - q), i - 1 + textSkipPrefix.at(i - 1)}};
0120         }
0121     }
0122 
0123     return {};
0124 }
0125 
0126 std::optional<std::pair<int, int>> SkipParser::findMatch(QString text, QString pattern) const
0127 {
0128     auto textPreprocessing = computeTextSkipPrefix(text);
0129     auto match = findMatchKMP(textPreprocessing.first, textPreprocessing.second, pattern.remove(sSkipCharDetection));
0130     qDebug() << "text   :" << text;
0131     qDebug() << "pattern:" << pattern;
0132     //            qDebug() << "start:  " << start;
0133     //            qDebug() << "skip:   " << textSkipOffset << " / " << patternSkipOffset;
0134     return match;
0135 }
0136 
0137 std::pair<std::vector<QChar>, std::vector<int>> SkipParser::computeTextSkipPrefix(const QString &text) const
0138 {
0139     // prune text and compute skip prefix
0140     // skip prefix notes number of skipped chars for each position in "text"
0141     std::vector<int> textSkipPrefix(text.size());
0142     std::vector<QChar> prunedText;
0143     prunedText.reserve(text.size());
0144     int skipped = 0;
0145     for (int i = 0; i < text.size(); ++i) {
0146         if (isSkipChar(text.at(i))) {
0147             textSkipPrefix.at(i - skipped) = skipped;
0148             ++skipped;
0149         } else {
0150             prunedText.push_back(text.at(i));
0151             textSkipPrefix.at(i - skipped) = skipped;
0152         }
0153     }
0154     prunedText.shrink_to_fit();
0155     return {prunedText, textSkipPrefix};
0156 }
0157 
0158 std::optional<std::pair<int, int>> SkipParser::findMatch(QString text, QVector<QString> patterns) const
0159 {
0160     // KMP can work with pruned patterns
0161     QSet<QString> prunedPatterns;
0162     // prune all skip chars from pattern
0163     for (const auto &pattern : patterns) {
0164         QString tmpPattern = pattern;
0165         tmpPattern.remove(sSkipCharDetection);
0166         prunedPatterns.insert(tmpPattern);
0167     }
0168     //    qDebug() << "Pruned canonical texts:" << (patterns.count() - prunedPatterns.count());
0169 
0170     auto textPreprocessing = computeTextSkipPrefix(text);
0171     for (const auto &pattern : prunedPatterns) {
0172         if (auto match = findMatchKMP(textPreprocessing.first, textPreprocessing.second, pattern)) {
0173             return match;
0174         }
0175     }
0176     return {};
0177 }