File indexing completed on 2024-06-16 04:20:01
0001 /* 0002 Kchmviewer - a CHM and EPUB file viewer with broad language support 0003 SPDX-FileCopyrightText: 2004-2014 George Yunaev gyunaev@ulduzsoft.com 0004 0005 SPDX-License-Identifier: GPL-3.0-or-later 0006 */ 0007 0008 #include <QApplication> 0009 0010 #include "ebook.h" 0011 #include "ebook_search.h" 0012 0013 // Helper class to simplicity state management and data keeping 0014 class SearchDataKeeper 0015 { 0016 public: 0017 SearchDataKeeper() 0018 { 0019 m_inPhrase = false; 0020 } 0021 0022 void beginPhrase() 0023 { 0024 phrase_terms.clear(); 0025 m_inPhrase = true; 0026 } 0027 0028 void endPhrase() 0029 { 0030 m_inPhrase = false; 0031 phrasewords += phrase_terms; 0032 phrases.push_back(phrase_terms.join(QStringLiteral(" "))); 0033 } 0034 0035 bool isInPhrase() const 0036 { 0037 return m_inPhrase; 0038 } 0039 0040 void addTerm(const QString &term) 0041 { 0042 if (!term.isEmpty()) { 0043 terms.push_back(term); 0044 0045 if (m_inPhrase) { 0046 phrase_terms.push_back(term); 0047 } 0048 } 0049 } 0050 0051 // Should contain all the search terms present in query, includind those from phrases. One element - one term . 0052 QStringList terms; 0053 0054 // Should contain phrases present in query without quotes. One element - one phrase. 0055 QStringList phrases; 0056 0057 // Should contain all the terms present in all the phrases (but not outside). 0058 QStringList phrasewords; 0059 0060 private: 0061 bool m_inPhrase; 0062 QStringList phrase_terms; 0063 }; 0064 0065 EBookSearch::EBookSearch() 0066 { 0067 m_Index = nullptr; 0068 } 0069 0070 EBookSearch::~EBookSearch() 0071 { 0072 delete m_Index; 0073 } 0074 0075 bool EBookSearch::loadIndex(QDataStream &stream) 0076 { 0077 delete m_Index; 0078 0079 m_Index = new QtAs::Index(); 0080 return m_Index->readDict(stream); 0081 } 0082 0083 bool EBookSearch::generateIndex(EBook *ebookFile, QDataStream &stream) 0084 { 0085 QList<QUrl> documents; 0086 QList<QUrl> alldocuments; 0087 0088 Q_EMIT progressStep(0, QStringLiteral("Generating the list of documents")); 0089 processEvents(); 0090 0091 // Enumerate the documents 0092 if (!ebookFile->enumerateFiles(alldocuments)) { 0093 return false; 0094 } 0095 0096 if (m_Index) { 0097 delete m_Index; 0098 } 0099 0100 m_Index = new QtAs::Index(); 0101 connect(m_Index, &QtAs::Index::indexingProgress, this, &EBookSearch::updateProgress); 0102 0103 // Process the list of files in CHM archive and keep only HTML document files from there 0104 for (const QUrl &allDocumentsI : std::as_const(alldocuments)) { 0105 const QString docpath = allDocumentsI.path(); 0106 0107 if (docpath.endsWith(QLatin1String(".html"), Qt::CaseInsensitive) || docpath.endsWith(QLatin1String(".htm"), Qt::CaseInsensitive) || docpath.endsWith(QLatin1String(".xhtml"), Qt::CaseInsensitive)) { 0108 documents.push_back(allDocumentsI); 0109 } 0110 } 0111 0112 if (!m_Index->makeIndex(documents, ebookFile)) { 0113 delete m_Index; 0114 m_Index = nullptr; 0115 return false; 0116 } 0117 0118 m_Index->writeDict(stream); 0119 m_keywordDocuments.clear(); 0120 0121 return true; 0122 } 0123 0124 void EBookSearch::cancelIndexGeneration() 0125 { 0126 m_Index->setLastWinClosed(); 0127 } 0128 0129 void EBookSearch::updateProgress(int value, const QString &stepName) 0130 { 0131 Q_EMIT progressStep(value, stepName); 0132 } 0133 0134 void EBookSearch::processEvents() 0135 { 0136 // Do it up to ten times; some events generate other events 0137 for (int i = 0; i < 10; i++) { 0138 qApp->processEvents(QEventLoop::ExcludeUserInputEvents); 0139 } 0140 } 0141 0142 bool EBookSearch::searchQuery(const QString &query, QList<QUrl> *results, EBook *ebookFile, unsigned int limit) 0143 { 0144 // We should have index 0145 if (!m_Index) { 0146 return false; 0147 } 0148 0149 // Characters which split the words. We need to make them separate tokens 0150 QString splitChars = m_Index->getCharsSplit(); 0151 0152 // Characters which are part of the word. We should keep them apart. 0153 QString partOfWordChars = m_Index->getCharsPartOfWord(); 0154 0155 // Variables to store current state 0156 SearchDataKeeper keeper; 0157 QString term; 0158 0159 for (const QChar &iChar : query) { 0160 const QChar ch = iChar.toLower(); 0161 0162 // a quote either begins or ends the phrase 0163 if (ch == QLatin1Char('"')) { 0164 keeper.addTerm(term); 0165 0166 if (keeper.isInPhrase()) { 0167 keeper.endPhrase(); 0168 } else { 0169 keeper.beginPhrase(); 0170 } 0171 0172 continue; 0173 } 0174 0175 // If new char does not stop the word, add ot and continue 0176 if (ch.isLetterOrNumber() || partOfWordChars.indexOf(ch) != -1) { 0177 term.append(ch); 0178 continue; 0179 } 0180 0181 // If it is a split char, add this term and split char as separate term 0182 if (splitChars.indexOf(ch) != -1) { 0183 // Add existing term if present 0184 keeper.addTerm(term); 0185 0186 // Change the term variable, so it will be added when we exit this block 0187 term = ch; 0188 } 0189 0190 // Just add the word; it is most likely a space or terminated by tokenizer. 0191 keeper.addTerm(term); 0192 term = QString(); 0193 } 0194 0195 keeper.addTerm(term); 0196 0197 if (keeper.isInPhrase()) { 0198 return false; 0199 } 0200 0201 QList<QUrl> foundDocs = m_Index->query(keeper.terms, keeper.phrases, keeper.phrasewords, ebookFile); 0202 0203 for (QList<QUrl>::iterator it = foundDocs.begin(); it != foundDocs.end() && limit > 0; ++it, limit--) { 0204 results->push_back(*it); 0205 } 0206 0207 return true; 0208 } 0209 0210 bool EBookSearch::hasIndex() const 0211 { 0212 return m_Index != nullptr; 0213 }