File indexing completed on 2024-05-12 05:11:21

0001 /*
0002  * SPDX-FileCopyrightText: 2014 Vishesh Handa <me@vhanda.in>
0003  *
0004  * SPDX-License-Identifier: LGPL-2.1-or-later
0005  *
0006  */
0007 
0008 #include "xapiantermgenerator.h"
0009 
0010 #include "akonadi_search_xapian_debug.h"
0011 #include <QTextBoundaryFinder>
0012 
0013 using namespace Akonadi::Search;
0014 
0015 XapianTermGenerator::XapianTermGenerator(Xapian::Document *doc)
0016     : m_doc(doc)
0017 {
0018     if (doc) {
0019         m_termGen.set_document(*doc);
0020     }
0021 }
0022 
0023 void XapianTermGenerator::indexText(const QString &text)
0024 {
0025     indexText(text, QString());
0026 }
0027 
0028 void XapianTermGenerator::setDocument(Xapian::Document *doc)
0029 {
0030     m_doc = doc;
0031 }
0032 
0033 QStringList XapianTermGenerator::termList(const QString &text)
0034 {
0035     int start = 0;
0036     int end = 0;
0037 
0038     QStringList list;
0039     QTextBoundaryFinder bf(QTextBoundaryFinder::Word, text);
0040     for (; bf.position() != -1; bf.toNextBoundary()) {
0041         if (bf.boundaryReasons() & QTextBoundaryFinder::StartOfItem) {
0042             start = bf.position();
0043             continue;
0044         } else if (bf.boundaryReasons() & QTextBoundaryFinder::EndOfItem) {
0045             end = bf.position();
0046 
0047             QString str = text.mid(start, end - start);
0048 
0049             // Get the string ready for saving
0050             str = str.toLower();
0051 
0052             // Remove all accents
0053             const QString denormalized = str.normalized(QString::NormalizationForm_KD);
0054 
0055             QString cleanString;
0056             cleanString.reserve(denormalized.size());
0057             for (const QChar &ch : denormalized) {
0058                 const auto cat = ch.category();
0059                 if (cat != QChar::Mark_NonSpacing && cat != QChar::Mark_SpacingCombining && cat != QChar::Mark_Enclosing) {
0060                     cleanString.append(ch);
0061                 }
0062             }
0063 
0064             str = cleanString.normalized(QString::NormalizationForm_KC);
0065             list << str.split(QLatin1Char('_'), Qt::SkipEmptyParts);
0066         }
0067     }
0068 
0069     return list;
0070 }
0071 
0072 void XapianTermGenerator::indexText(const QString &text, const QString &prefix, int wdfInc)
0073 {
0074     const QByteArray par = prefix.toUtf8();
0075     const QByteArray ta = text.toUtf8();
0076     m_termGen.index_text(ta.constData(), wdfInc, par.constData());
0077 
0078     const QStringList terms = termList(text);
0079     for (const QString &term : terms) {
0080         const QByteArray arr = term.toUtf8();
0081 
0082         const QByteArray finalArr = par + arr;
0083         const std::string stdString(finalArr.constData(), finalArr.size());
0084         m_doc->add_posting(stdString, m_position, wdfInc);
0085 
0086         m_position++;
0087     }
0088 }
0089 
0090 int XapianTermGenerator::position() const
0091 {
0092     return m_position;
0093 }
0094 
0095 void XapianTermGenerator::setPosition(int position)
0096 {
0097     m_position = position;
0098 }