File indexing completed on 2024-04-21 03:51:41

0001 /*
0002     This file is part of the KDE Baloo project.
0003     SPDX-FileCopyrightText: 2014-2015 Vishesh Handa <vhanda@kde.org>
0004 
0005     SPDX-License-Identifier: LGPL-2.1-or-later
0006 */
0007 
0008 #include "termgenerator.h"
0009 
0010 #include <QTextBoundaryFinder>
0011 
0012 using namespace Baloo;
0013 
0014 namespace {
0015 
0016 QString normalizeTerm(const QString &str)
0017 {
0018     // Remove all accents. It is important to call toLower after normalization,
0019     // since some exotic unicode symbols can remain uppercase
0020     const QString denormalized = str.normalized(QString::NormalizationForm_KD).toLower();
0021 
0022     QString cleanString;
0023     cleanString.reserve(denormalized.size());
0024     for (const auto& c : denormalized) {
0025         if (!c.isMark()) {
0026             cleanString.append(c);
0027         }
0028     }
0029 
0030     return cleanString.normalized(QString::NormalizationForm_KC);
0031 }
0032 
0033 void appendTerm(QByteArrayList &list, const QString &term)
0034 {
0035     if (!term.isEmpty()) {
0036         // Truncate the string to avoid arbitrarily long terms
0037         list << QStringView(term).left(TermGenerator::maxTermSize).toUtf8();
0038     }
0039 }
0040 
0041 }
0042 
0043 TermGenerator::TermGenerator(Document& doc)
0044     : m_doc(doc)
0045     , m_position(1)
0046 {
0047 }
0048 
0049 void TermGenerator::indexText(const QString& text)
0050 {
0051     indexText(text, QByteArray());
0052 }
0053 
0054 QByteArrayList TermGenerator::termList(const QString& text_)
0055 {
0056     QString text(text_);
0057     text.replace(QLatin1Char('_'), QLatin1Char(' '));
0058 
0059     int start = 0;
0060 
0061     auto isSkipChar = [] (const QChar& c) {
0062         return c.isPunct() || c.isMark() || c.isSpace() || (!c.isPrint() && !c.isSurrogate());
0063     };
0064 
0065     QByteArrayList list;
0066     QTextBoundaryFinder bf(QTextBoundaryFinder::Word, text);
0067     for (; bf.position() != -1; bf.toNextBoundary()) {
0068         int end = bf.position();
0069         while (start < end && isSkipChar(text[start])) {
0070             start++;
0071         }
0072         if (end == start) {
0073             continue;
0074         }
0075 
0076         // Typically we commit a term when we have an EndOfItem, starting
0077         // from the last StartOfItem, everything between last EndOfItem and
0078         // StartOfItem is just whitespace and punctuation. Unfortunately,
0079         // most CJK characters do not trigger a StartOfItem and thus no
0080         // EndOfItem, so everything in front of a StartOfItem has to be
0081         // committed as well
0082         bool commit = bf.boundaryReasons() & (QTextBoundaryFinder::EndOfItem | QTextBoundaryFinder::StartOfItem);
0083 
0084         // Also commit term if end-of-text is reached or when we find
0085         // any punctuation
0086         if (!commit & (end == text.size() || isSkipChar(text[end]))) {
0087             commit = true;
0088         }
0089 
0090         if (commit) {
0091             const QString term = normalizeTerm(text.mid(start, end - start));
0092             appendTerm(list, term);
0093             start = end;
0094         }
0095     }
0096     return list;
0097 }
0098 
0099 void TermGenerator::indexText(const QString& text, const QByteArray& prefix)
0100 {
0101     const QByteArrayList terms = termList(text);
0102     if (terms.size() == 1) {
0103         QByteArray finalArr = prefix + terms[0];
0104         m_doc.addTerm(finalArr);
0105         return;
0106     }
0107     for (const QByteArray& term : terms) {
0108         QByteArray finalArr = prefix + term;
0109 
0110         m_doc.addPositionTerm(finalArr, m_position);
0111         m_position++;
0112     }
0113     m_position++;
0114 }
0115 
0116 void TermGenerator::indexFileNameText(const QString& text)
0117 {
0118     const QByteArray prefix = QByteArrayLiteral("F");
0119     const QByteArrayList terms = termList(text);
0120     if (terms.size() == 1) {
0121         QByteArray finalArr = prefix + terms[0];
0122         m_doc.addFileNameTerm(finalArr);
0123         return;
0124     }
0125     for (const QByteArray& term : terms) {
0126         QByteArray finalArr = prefix + term;
0127 
0128         m_doc.addFileNamePositionTerm(finalArr, m_position);
0129         m_position++;
0130     }
0131     m_position++;
0132 }
0133 
0134 void TermGenerator::indexXattrText(const QString& text, const QByteArray& prefix)
0135 {
0136     const QByteArrayList terms = termList(text);
0137     if (terms.size() == 1) {
0138         QByteArray finalArr = prefix + terms[0];
0139         m_doc.addXattrTerm(finalArr);
0140         return;
0141     }
0142     for (const QByteArray& term : terms) {
0143         QByteArray finalArr = prefix + term;
0144 
0145         m_doc.addXattrPositionTerm(finalArr, m_position);
0146         m_position++;
0147     }
0148     m_position++;
0149 }
0150 
0151 int TermGenerator::position() const
0152 {
0153     return m_position;
0154 }
0155 
0156 void TermGenerator::setPosition(int position)
0157 {
0158     m_position = position;
0159 }