File indexing completed on 2024-06-16 04:20:01

0001 /*
0002     Kchmviewer - a CHM and EPUB file viewer with broad language support
0003     SPDX-FileCopyrightText: 2004-2014 George Yunaev gyunaev@ulduzsoft.com
0004 
0005     SPDX-License-Identifier: GPL-3.0-or-later
0006 */
0007 
0008 #include <QApplication>
0009 
0010 #include "ebook.h"
0011 #include "ebook_search.h"
0012 
0013 // Helper class to simplicity state management and data keeping
0014 class SearchDataKeeper
0015 {
0016 public:
0017     SearchDataKeeper()
0018     {
0019         m_inPhrase = false;
0020     }
0021 
0022     void beginPhrase()
0023     {
0024         phrase_terms.clear();
0025         m_inPhrase = true;
0026     }
0027 
0028     void endPhrase()
0029     {
0030         m_inPhrase = false;
0031         phrasewords += phrase_terms;
0032         phrases.push_back(phrase_terms.join(QStringLiteral(" ")));
0033     }
0034 
0035     bool isInPhrase() const
0036     {
0037         return m_inPhrase;
0038     }
0039 
0040     void addTerm(const QString &term)
0041     {
0042         if (!term.isEmpty()) {
0043             terms.push_back(term);
0044 
0045             if (m_inPhrase) {
0046                 phrase_terms.push_back(term);
0047             }
0048         }
0049     }
0050 
0051     // Should contain all the search terms present in query, includind those from phrases. One element - one term .
0052     QStringList terms;
0053 
0054     // Should contain phrases present in query without quotes. One element - one phrase.
0055     QStringList phrases;
0056 
0057     // Should contain all the terms present in all the phrases (but not outside).
0058     QStringList phrasewords;
0059 
0060 private:
0061     bool m_inPhrase;
0062     QStringList phrase_terms;
0063 };
0064 
0065 EBookSearch::EBookSearch()
0066 {
0067     m_Index = nullptr;
0068 }
0069 
0070 EBookSearch::~EBookSearch()
0071 {
0072     delete m_Index;
0073 }
0074 
0075 bool EBookSearch::loadIndex(QDataStream &stream)
0076 {
0077     delete m_Index;
0078 
0079     m_Index = new QtAs::Index();
0080     return m_Index->readDict(stream);
0081 }
0082 
0083 bool EBookSearch::generateIndex(EBook *ebookFile, QDataStream &stream)
0084 {
0085     QList<QUrl> documents;
0086     QList<QUrl> alldocuments;
0087 
0088     Q_EMIT progressStep(0, QStringLiteral("Generating the list of documents"));
0089     processEvents();
0090 
0091     // Enumerate the documents
0092     if (!ebookFile->enumerateFiles(alldocuments)) {
0093         return false;
0094     }
0095 
0096     if (m_Index) {
0097         delete m_Index;
0098     }
0099 
0100     m_Index = new QtAs::Index();
0101     connect(m_Index, &QtAs::Index::indexingProgress, this, &EBookSearch::updateProgress);
0102 
0103     // Process the list of files in CHM archive and keep only HTML document files from there
0104     for (const QUrl &allDocumentsI : std::as_const(alldocuments)) {
0105         const QString docpath = allDocumentsI.path();
0106 
0107         if (docpath.endsWith(QLatin1String(".html"), Qt::CaseInsensitive) || docpath.endsWith(QLatin1String(".htm"), Qt::CaseInsensitive) || docpath.endsWith(QLatin1String(".xhtml"), Qt::CaseInsensitive)) {
0108             documents.push_back(allDocumentsI);
0109         }
0110     }
0111 
0112     if (!m_Index->makeIndex(documents, ebookFile)) {
0113         delete m_Index;
0114         m_Index = nullptr;
0115         return false;
0116     }
0117 
0118     m_Index->writeDict(stream);
0119     m_keywordDocuments.clear();
0120 
0121     return true;
0122 }
0123 
0124 void EBookSearch::cancelIndexGeneration()
0125 {
0126     m_Index->setLastWinClosed();
0127 }
0128 
0129 void EBookSearch::updateProgress(int value, const QString &stepName)
0130 {
0131     Q_EMIT progressStep(value, stepName);
0132 }
0133 
0134 void EBookSearch::processEvents()
0135 {
0136     // Do it up to ten times; some events generate other events
0137     for (int i = 0; i < 10; i++) {
0138         qApp->processEvents(QEventLoop::ExcludeUserInputEvents);
0139     }
0140 }
0141 
0142 bool EBookSearch::searchQuery(const QString &query, QList<QUrl> *results, EBook *ebookFile, unsigned int limit)
0143 {
0144     // We should have index
0145     if (!m_Index) {
0146         return false;
0147     }
0148 
0149     // Characters which split the words. We need to make them separate tokens
0150     QString splitChars = m_Index->getCharsSplit();
0151 
0152     // Characters which are part of the word. We should keep them apart.
0153     QString partOfWordChars = m_Index->getCharsPartOfWord();
0154 
0155     // Variables to store current state
0156     SearchDataKeeper keeper;
0157     QString term;
0158 
0159     for (const QChar &iChar : query) {
0160         const QChar ch = iChar.toLower();
0161 
0162         // a quote either begins or ends the phrase
0163         if (ch == QLatin1Char('"')) {
0164             keeper.addTerm(term);
0165 
0166             if (keeper.isInPhrase()) {
0167                 keeper.endPhrase();
0168             } else {
0169                 keeper.beginPhrase();
0170             }
0171 
0172             continue;
0173         }
0174 
0175         // If new char does not stop the word, add ot and continue
0176         if (ch.isLetterOrNumber() || partOfWordChars.indexOf(ch) != -1) {
0177             term.append(ch);
0178             continue;
0179         }
0180 
0181         // If it is a split char, add this term and split char as separate term
0182         if (splitChars.indexOf(ch) != -1) {
0183             // Add existing term if present
0184             keeper.addTerm(term);
0185 
0186             // Change the term variable, so it will be added when we exit this block
0187             term = ch;
0188         }
0189 
0190         // Just add the word; it is most likely a space or terminated by tokenizer.
0191         keeper.addTerm(term);
0192         term = QString();
0193     }
0194 
0195     keeper.addTerm(term);
0196 
0197     if (keeper.isInPhrase()) {
0198         return false;
0199     }
0200 
0201     QList<QUrl> foundDocs = m_Index->query(keeper.terms, keeper.phrases, keeper.phrasewords, ebookFile);
0202 
0203     for (QList<QUrl>::iterator it = foundDocs.begin(); it != foundDocs.end() && limit > 0; ++it, limit--) {
0204         results->push_back(*it);
0205     }
0206 
0207     return true;
0208 }
0209 
0210 bool EBookSearch::hasIndex() const
0211 {
0212     return m_Index != nullptr;
0213 }