File indexing completed on 2024-04-21 03:51:41
0001 /* 0002 This file is part of the KDE Baloo project. 0003 SPDX-FileCopyrightText: 2014-2015 Vishesh Handa <vhanda@kde.org> 0004 0005 SPDX-License-Identifier: LGPL-2.1-or-later 0006 */ 0007 0008 #include "termgenerator.h" 0009 0010 #include <QTextBoundaryFinder> 0011 0012 using namespace Baloo; 0013 0014 namespace { 0015 0016 QString normalizeTerm(const QString &str) 0017 { 0018 // Remove all accents. It is important to call toLower after normalization, 0019 // since some exotic unicode symbols can remain uppercase 0020 const QString denormalized = str.normalized(QString::NormalizationForm_KD).toLower(); 0021 0022 QString cleanString; 0023 cleanString.reserve(denormalized.size()); 0024 for (const auto& c : denormalized) { 0025 if (!c.isMark()) { 0026 cleanString.append(c); 0027 } 0028 } 0029 0030 return cleanString.normalized(QString::NormalizationForm_KC); 0031 } 0032 0033 void appendTerm(QByteArrayList &list, const QString &term) 0034 { 0035 if (!term.isEmpty()) { 0036 // Truncate the string to avoid arbitrarily long terms 0037 list << QStringView(term).left(TermGenerator::maxTermSize).toUtf8(); 0038 } 0039 } 0040 0041 } 0042 0043 TermGenerator::TermGenerator(Document& doc) 0044 : m_doc(doc) 0045 , m_position(1) 0046 { 0047 } 0048 0049 void TermGenerator::indexText(const QString& text) 0050 { 0051 indexText(text, QByteArray()); 0052 } 0053 0054 QByteArrayList TermGenerator::termList(const QString& text_) 0055 { 0056 QString text(text_); 0057 text.replace(QLatin1Char('_'), QLatin1Char(' ')); 0058 0059 int start = 0; 0060 0061 auto isSkipChar = [] (const QChar& c) { 0062 return c.isPunct() || c.isMark() || c.isSpace() || (!c.isPrint() && !c.isSurrogate()); 0063 }; 0064 0065 QByteArrayList list; 0066 QTextBoundaryFinder bf(QTextBoundaryFinder::Word, text); 0067 for (; bf.position() != -1; bf.toNextBoundary()) { 0068 int end = bf.position(); 0069 while (start < end && isSkipChar(text[start])) { 0070 start++; 0071 } 0072 if (end == start) { 0073 continue; 0074 } 0075 0076 // Typically we commit a term when we have an EndOfItem, starting 0077 // from the last StartOfItem, everything between last EndOfItem and 0078 // StartOfItem is just whitespace and punctuation. Unfortunately, 0079 // most CJK characters do not trigger a StartOfItem and thus no 0080 // EndOfItem, so everything in front of a StartOfItem has to be 0081 // committed as well 0082 bool commit = bf.boundaryReasons() & (QTextBoundaryFinder::EndOfItem | QTextBoundaryFinder::StartOfItem); 0083 0084 // Also commit term if end-of-text is reached or when we find 0085 // any punctuation 0086 if (!commit & (end == text.size() || isSkipChar(text[end]))) { 0087 commit = true; 0088 } 0089 0090 if (commit) { 0091 const QString term = normalizeTerm(text.mid(start, end - start)); 0092 appendTerm(list, term); 0093 start = end; 0094 } 0095 } 0096 return list; 0097 } 0098 0099 void TermGenerator::indexText(const QString& text, const QByteArray& prefix) 0100 { 0101 const QByteArrayList terms = termList(text); 0102 if (terms.size() == 1) { 0103 QByteArray finalArr = prefix + terms[0]; 0104 m_doc.addTerm(finalArr); 0105 return; 0106 } 0107 for (const QByteArray& term : terms) { 0108 QByteArray finalArr = prefix + term; 0109 0110 m_doc.addPositionTerm(finalArr, m_position); 0111 m_position++; 0112 } 0113 m_position++; 0114 } 0115 0116 void TermGenerator::indexFileNameText(const QString& text) 0117 { 0118 const QByteArray prefix = QByteArrayLiteral("F"); 0119 const QByteArrayList terms = termList(text); 0120 if (terms.size() == 1) { 0121 QByteArray finalArr = prefix + terms[0]; 0122 m_doc.addFileNameTerm(finalArr); 0123 return; 0124 } 0125 for (const QByteArray& term : terms) { 0126 QByteArray finalArr = prefix + term; 0127 0128 m_doc.addFileNamePositionTerm(finalArr, m_position); 0129 m_position++; 0130 } 0131 m_position++; 0132 } 0133 0134 void TermGenerator::indexXattrText(const QString& text, const QByteArray& prefix) 0135 { 0136 const QByteArrayList terms = termList(text); 0137 if (terms.size() == 1) { 0138 QByteArray finalArr = prefix + terms[0]; 0139 m_doc.addXattrTerm(finalArr); 0140 return; 0141 } 0142 for (const QByteArray& term : terms) { 0143 QByteArray finalArr = prefix + term; 0144 0145 m_doc.addXattrPositionTerm(finalArr, m_position); 0146 m_position++; 0147 } 0148 m_position++; 0149 } 0150 0151 int TermGenerator::position() const 0152 { 0153 return m_position; 0154 } 0155 0156 void TermGenerator::setPosition(int position) 0157 { 0158 m_position = position; 0159 }