Warning, file /pim/akonadi-search/xapian/xapianqueryparser.cpp was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).

0001 /*
0002  * SPDX-FileCopyrightText: 2014 Vishesh Handa <me@vhanda.in>
0003  *
0004  * SPDX-License-Identifier: LGPL-2.1-or-later
0005  *
0006  */
0007 
0008 #include "xapianqueryparser.h"
0009 
0010 #include "akonadi_search_xapian_debug.h"
0011 #include <QStringList>
0012 #include <QTextBoundaryFinder>
0013 
0014 using namespace Akonadi::Search;
0015 
// Default constructor: member initialisation is left entirely to the
// compiler-generated code (presumably using initialisers declared in the
// header - not visible from this file).
XapianQueryParser::XapianQueryParser() = default;
0017 
0018 void XapianQueryParser::setDatabase(Xapian::Database *db)
0019 {
0020     m_db = db;
0021 }
0022 
0023 namespace
0024 {
// A candidate term together with the frequency it is ranked by.
struct Term {
    std::string t; // the term text
    uint count; // how often the term occurs; higher means more relevant

    // Deliberately inverted ordering. The std heap algorithms keep the
    // "largest" element at the front, so treating a lower count as larger
    // puts the least frequent term there, ready to be evicted by pop_heap.
    bool operator<(const Term &rhs) const
    {
        return rhs.count < count;
    }
};
0035 
0036 Xapian::Query makeQuery(const QString &string, int position, Xapian::Database *db)
0037 {
0038     if (!db) {
0039         const QByteArray arr = string.toUtf8();
0040         const std::string stdString(arr.constData(), arr.size());
0041         return Xapian::Query(stdString, 1, position);
0042     }
0043 
0044     // Lets just keep the top x (+1 for push_heap)
0045     static const int MaxTerms = 100;
0046     QList<Term> topTerms;
0047     topTerms.reserve(MaxTerms + 1);
0048 
0049     const std::string stdString(string.toStdString());
0050     Xapian::TermIterator it = db->allterms_begin(stdString);
0051     Xapian::TermIterator end = db->allterms_end(stdString);
0052     for (; it != end; ++it) {
0053         Term term;
0054         term.t = *it;
0055         term.count = db->get_collection_freq(term.t);
0056 
0057         if (topTerms.size() < MaxTerms) {
0058             topTerms.push_back(term);
0059             std::push_heap(topTerms.begin(), topTerms.end());
0060         } else {
0061             // Remove the term with the min count
0062             topTerms.push_back(term);
0063             std::push_heap(topTerms.begin(), topTerms.end());
0064 
0065             std::pop_heap(topTerms.begin(), topTerms.end());
0066             topTerms.pop_back();
0067         }
0068     }
0069 
0070     QList<Xapian::Query> queries;
0071     queries.reserve(topTerms.size());
0072 
0073     for (const Term &term : std::as_const(topTerms)) {
0074         queries << Xapian::Query(term.t, 1, position);
0075     }
0076 
0077     if (queries.isEmpty()) {
0078         return Xapian::Query(string.toStdString(), 1, position);
0079     }
0080     Xapian::Query finalQ(Xapian::Query::OP_SYNONYM, queries.begin(), queries.end());
0081     return finalQ;
0082 }
0083 
0084 bool containsSpace(const QString &string)
0085 {
0086     for (const QChar &ch : string) {
0087         if (ch.isSpace()) {
0088             return true;
0089         }
0090     }
0091 
0092     return false;
0093 }
0094 }
0095 
0096 Xapian::Query XapianQueryParser::parseQuery(const QString &text, const QString &prefix)
0097 {
0098     /*
0099     Xapian::QueryParser parser;
0100     parser.set_default_op(Xapian::Query::OP_AND);
0101 
0102     if (m_db)
0103         parser.set_database(*m_db);
0104 
0105     int flags = Xapian::QueryParser::FLAG_PHRASE | Xapian::QueryParser::FLAG_PARTIAL;
0106 
0107     std::string stdString(text.toStdString());
0108     return parser.parse_query(stdString, flags);
0109     */
0110 
0111     if (text.isEmpty()) {
0112         return {};
0113     }
0114 
0115     QList<Xapian::Query> queries;
0116     QList<Xapian::Query> phraseQueries;
0117 
0118     int start = 0;
0119     int end = 0;
0120     int position = 0;
0121 
0122     bool inDoubleQuotes = false;
0123     bool inSingleQuotes = false;
0124     bool inPhrase = false;
0125 
0126     QTextBoundaryFinder bf(QTextBoundaryFinder::Word, text);
0127     for (; bf.position() != -1; bf.toNextBoundary()) {
0128         if (bf.boundaryReasons() & QTextBoundaryFinder::StartOfItem) {
0129             //
0130             // Check the previous delimiter
0131             int pos = bf.position();
0132             if (pos != end) {
0133                 QString delim = text.mid(end, pos - end);
0134                 if (delim.contains(QLatin1Char('"'))) {
0135                     if (inDoubleQuotes) {
0136                         queries << Xapian::Query(Xapian::Query::OP_PHRASE, phraseQueries.begin(), phraseQueries.end());
0137                         phraseQueries.clear();
0138                         inDoubleQuotes = false;
0139                     } else {
0140                         inDoubleQuotes = true;
0141                     }
0142                 } else if (delim.contains(QLatin1Char('\''))) {
0143                     if (inSingleQuotes) {
0144                         queries << Xapian::Query(Xapian::Query::OP_PHRASE, phraseQueries.begin(), phraseQueries.end());
0145                         phraseQueries.clear();
0146                         inSingleQuotes = false;
0147                     } else {
0148                         inSingleQuotes = true;
0149                     }
0150                 } else if (!containsSpace(delim)) {
0151                     if (!inPhrase && !queries.isEmpty()) {
0152                         phraseQueries << queries.takeLast();
0153                     }
0154                     inPhrase = true;
0155                 } else if (inPhrase && !phraseQueries.isEmpty()) {
0156                     queries << Xapian::Query(Xapian::Query::OP_PHRASE, phraseQueries.begin(), phraseQueries.end());
0157                     phraseQueries.clear();
0158                     inPhrase = false;
0159                 }
0160             }
0161 
0162             start = bf.position();
0163             continue;
0164         } else if (bf.boundaryReasons() & QTextBoundaryFinder::EndOfItem) {
0165             end = bf.position();
0166 
0167             QString str = text.mid(start, end - start);
0168 
0169             // Get the string ready for saving
0170             str = str.toLower();
0171 
0172             // Remove all accents
0173             const QString denormalized = str.normalized(QString::NormalizationForm_KD);
0174             QString cleanString;
0175             for (const QChar &ch : denormalized) {
0176                 auto cat = ch.category();
0177                 if (cat != QChar::Mark_NonSpacing && cat != QChar::Mark_SpacingCombining && cat != QChar::Mark_Enclosing) {
0178                     cleanString.append(ch);
0179                 }
0180             }
0181 
0182             str = cleanString.normalized(QString::NormalizationForm_KC);
0183             const QStringList lst = str.split(QLatin1Char('_'), Qt::SkipEmptyParts);
0184             for (const QString &t : lst) {
0185                 const QString term = prefix + t;
0186 
0187                 position++;
0188                 if (inDoubleQuotes || inSingleQuotes || inPhrase) {
0189                     const QByteArray arr = term.toUtf8();
0190                     const std::string str(arr.constData(), arr.length());
0191                     phraseQueries << Xapian::Query(str, 1, position);
0192                 } else {
0193                     if (m_autoExpand) {
0194                         queries << makeQuery(term, position, m_db);
0195                     } else {
0196                         queries << Xapian::Query(term.toStdString(), 1, position);
0197                     }
0198                 }
0199             }
0200         }
0201     }
0202 
0203     if (inPhrase) {
0204         queries << Xapian::Query(Xapian::Query::OP_PHRASE, phraseQueries.begin(), phraseQueries.end());
0205         phraseQueries.clear();
0206     }
0207 
0208     if (!phraseQueries.isEmpty()) {
0209         queries << phraseQueries;
0210         phraseQueries.clear();
0211     }
0212 
0213     if (queries.size() == 1) {
0214         return queries.first();
0215     }
0216     return {Xapian::Query::OP_AND, queries.begin(), queries.end()};
0217 }
0218 
0219 void XapianQueryParser::setAutoExapand(bool autoexpand)
0220 {
0221     m_autoExpand = autoexpand;
0222 }
0223 
0224 Xapian::Query XapianQueryParser::expandWord(const QString &word, const QString &prefix)
0225 {
0226     const std::string stdString((prefix + word).toUtf8().constData());
0227     Xapian::TermIterator it = m_db->allterms_begin(stdString);
0228     Xapian::TermIterator end = m_db->allterms_end(stdString);
0229 
0230     QList<Xapian::Query> queries;
0231     for (; it != end; ++it) {
0232         queries << Xapian::Query(*it);
0233     }
0234 
0235     if (queries.isEmpty()) {
0236         return Xapian::Query(stdString);
0237     }
0238     Xapian::Query finalQ(Xapian::Query::OP_SYNONYM, queries.begin(), queries.end());
0239     return finalQ;
0240 }