File indexing completed on 2024-12-15 04:13:43
0001 /* 0002 Kchmviewer - a CHM and EPUB file viewer with broad language support 0003 SPDX-FileCopyrightText: 2004-2014 George Yunaev gyunaev@ulduzsoft.com 0004 0005 SPDX-License-Identifier: GPL-3.0-or-later 0006 */ 0007 0008 #include <QApplication> 0009 #include <QTextCodec> 0010 0011 #include "ebook.h" 0012 #include "ebook_search.h" 0013 #include "helper_search_index.h" 0014 0015 static const int DICT_VERSION = 4; 0016 0017 namespace QtAs 0018 { 0019 // Those characters are splitters (i.e. split the word), but added themselves into dictionary too. 0020 // This makes the dictionary MUCH larger, but ensure that for the piece of "window->print" both 0021 // search for "print" and "->print" will find it. 0022 #define SPLIT_CHARACTERS QStringLiteral("!()*&^%#@[]{}':;,.?/|/?<>\\-+=~`") 0023 0024 // Those characters are parts of word - for example, '_' is here, and search for _debug will find only _debug. 0025 #define WORD_CHARACTERS QStringLiteral("$_") 0026 0027 struct Term { 0028 Term() 0029 : frequency(-1) 0030 { 0031 } 0032 Term(const QString &t, int f, const QVector<Document> &l) 0033 : term(t) 0034 , frequency(f) 0035 , documents(l) 0036 { 0037 } 0038 QString term; 0039 int frequency; 0040 QVector<Document> documents; 0041 bool operator<(const Term &i2) const 0042 { 0043 return frequency < i2.frequency; 0044 } 0045 }; 0046 0047 QDataStream &operator>>(QDataStream &s, Document &l) 0048 { 0049 s >> l.docNumber; 0050 s >> l.frequency; 0051 return s; 0052 } 0053 0054 QDataStream &operator<<(QDataStream &s, const Document l) 0055 { 0056 s << (short)l.docNumber; 0057 s << (short)l.frequency; 0058 return s; 0059 } 0060 0061 Index::Index() 0062 : QObject(nullptr) 0063 { 0064 lastWindowClosed = false; 0065 connect(qApp, &QGuiApplication::lastWindowClosed, this, &Index::setLastWinClosed); 0066 } 0067 0068 void Index::setLastWinClosed() 0069 { 0070 lastWindowClosed = true; 0071 } 0072 0073 bool Index::makeIndex(const QList<QUrl> &docs, EBook *chmFile) 0074 { 0075 if (docs.isEmpty()) { 0076 return false; 0077 } 0078 0079 docList = docs; 0080 0081 if (chmFile->hasFeature(EBook::FEATURE_ENCODING)) { 0082 entityDecoder.changeEncoding(QTextCodec::codecForName(chmFile->currentEncoding().toUtf8())); 0083 } 0084 0085 QList<QUrl>::ConstIterator it = docList.constBegin(); 0086 int steps = docList.count() / 100; 0087 0088 if (!steps) { 0089 steps++; 0090 } 0091 0092 int prog = 0; 0093 0094 for (int i = 0; it != docList.constEnd(); ++it, ++i) { 0095 if (lastWindowClosed) { 0096 return false; 0097 } 0098 0099 const QUrl &filename = *it; 0100 QStringList terms; 0101 0102 if (parseDocumentToStringlist(chmFile, filename, terms)) { 0103 for (QStringList::ConstIterator tit = terms.constBegin(); tit != terms.constEnd(); ++tit) { 0104 insertInDict(*tit, i); 0105 } 0106 } 0107 0108 if (i % steps == 0) { 0109 prog++; 0110 prog = qMin(prog, 99); 0111 Q_EMIT indexingProgress(prog, tr("Processing document %1").arg((*it).path())); 0112 } 0113 } 0114 0115 Q_EMIT indexingProgress(100, tr("Processing completed")); 0116 return true; 0117 } 0118 0119 void Index::insertInDict(const QString &str, int docNum) 0120 { 0121 Entry *e = nullptr; 0122 if (!dict.isEmpty()) { 0123 e = dict[str]; 0124 } 0125 0126 if (e) { 0127 if (e->documents.last().docNumber != docNum) { 0128 e->documents.append(Document(docNum, 1)); 0129 } else { 0130 e->documents.last().frequency++; 0131 } 0132 } else { 0133 dict.insert(str, new Entry(docNum)); 0134 } 0135 } 0136 0137 bool Index::parseDocumentToStringlist(EBook *chmFile, const QUrl &filename, QStringList &tokenlist) 0138 { 0139 QString parsedbuf, parseentity, text; 0140 0141 if (!chmFile->getFileContentAsString(text, filename) || text.isEmpty()) { 0142 qWarning("Search index generator: could not retrieve the document content for %s", qPrintable(filename.toString())); 0143 return false; 0144 } 0145 0146 m_charssplit = SPLIT_CHARACTERS; 0147 m_charsword = WORD_CHARACTERS; 0148 0149 tokenlist.clear(); 0150 0151 // State machine states 0152 enum state_t { 0153 STATE_OUTSIDE_TAGS, // outside HTML tags; parse text 0154 STATE_IN_HTML_TAG, // inside HTML tags; wait for end tag 0155 STATE_IN_QUOTES, // inside HTML tags and inside quotes; wait for end quote (in var QuoteChar) 0156 STATE_IN_HTML_ENTITY // inside HTML entity; parse the entity 0157 }; 0158 0159 state_t state = STATE_OUTSIDE_TAGS; 0160 QChar QuoteChar; // used in STATE_IN_QUOTES 0161 0162 for (int j = 0; j < text.length(); j++) { 0163 QChar ch = text[j]; 0164 0165 if ((j % 20000) == 0) { 0166 qApp->processEvents(QEventLoop::ExcludeUserInputEvents); 0167 } 0168 0169 if (state == STATE_IN_HTML_TAG) { 0170 // We are inside HTML tag. 0171 // Ignore everything until we see '>' (end of HTML tag) or quote char (quote start) 0172 if (ch == QLatin1Char('"') || ch == QLatin1Char('\'')) { 0173 state = STATE_IN_QUOTES; 0174 QuoteChar = ch; 0175 } else if (ch == QLatin1Char('>')) { 0176 state = STATE_OUTSIDE_TAGS; 0177 } 0178 0179 continue; 0180 } else if (state == STATE_IN_QUOTES) { 0181 // We are inside quoted text inside HTML tag. 0182 // Ignore everything until we see the quote character again 0183 if (ch == QuoteChar) { 0184 state = STATE_IN_HTML_TAG; 0185 } 0186 0187 continue; 0188 } else if (state == STATE_IN_HTML_ENTITY) { 0189 // We are inside encoded HTML entity (like ). 0190 // Collect to parsedbuf everything until we see ; 0191 if (ch.isLetterOrNumber()) { 0192 // get next character of this entity 0193 parseentity.append(ch); 0194 continue; 0195 } 0196 0197 // The entity ended 0198 state = STATE_OUTSIDE_TAGS; 0199 0200 // Some shitty HTML does not terminate entities correctly. Screw it. 0201 if (ch != QLatin1Char(';') && ch != QLatin1Char('<')) { 0202 if (parseentity.isEmpty()) { 0203 // straight '&' symbol. Add and continue. 0204 parsedbuf += QLatin1String("&"); 0205 } else { 0206 qWarning("Index::parseDocument: incorrectly terminated HTML entity '&%s%c', ignoring", qPrintable(parseentity), ch.toLatin1()); 0207 } 0208 0209 j--; // parse this character again, but in different state 0210 continue; 0211 } 0212 0213 // Don't we have a space? 0214 if (parseentity.toLower() != QLatin1String("nbsp")) { 0215 QString entity = entityDecoder.decode(parseentity); 0216 0217 if (entity.isNull()) { 0218 // decodeEntity() already printed error message 0219 // qWarning( "Index::parseDocument: failed to decode entity &%s;", parsedbuf.ascii() ); 0220 continue; 0221 } 0222 0223 parsedbuf += entity; 0224 continue; 0225 } else { 0226 ch = QLatin1Char(' '); // We got a space, so treat it like it, and not add it to parsebuf 0227 } 0228 } 0229 0230 // 0231 // Now process STATE_OUTSIDE_TAGS 0232 // 0233 0234 // Check for start of HTML tag, and switch to STATE_IN_HTML_TAG if it is 0235 if (ch == QLatin1Char('<')) { 0236 state = STATE_IN_HTML_TAG; 0237 goto tokenize_buf; 0238 } 0239 0240 // Check for start of HTML entity 0241 if (ch == QLatin1Char('&')) { 0242 state = STATE_IN_HTML_ENTITY; 0243 parseentity = QString(); 0244 continue; 0245 } 0246 0247 // Replace quote by ' - quotes are used in search window to set the phrase 0248 if (ch == QLatin1Char('"')) { 0249 ch = QLatin1Char('\''); 0250 } 0251 0252 // Ok, we have a valid character outside HTML tags, and probably some in buffer already. 0253 // If it is char or letter, add it and continue 0254 if (ch.isLetterOrNumber() || m_charsword.indexOf(ch) != -1) { 0255 parsedbuf.append(ch); 0256 continue; 0257 } 0258 0259 // If it is a split char, add the word to the dictionary, and then add the char itself. 0260 if (m_charssplit.indexOf(ch) != -1) { 0261 if (!parsedbuf.isEmpty()) { 0262 tokenlist.push_back(parsedbuf.toLower()); 0263 } 0264 0265 tokenlist.push_back(ch.toLower()); 0266 parsedbuf = QString(); 0267 continue; 0268 } 0269 0270 tokenize_buf: 0271 // Just add the word; it is most likely a space or terminated by tokenizer. 0272 if (!parsedbuf.isEmpty()) { 0273 tokenlist.push_back(parsedbuf.toLower()); 0274 parsedbuf = QString(); 0275 } 0276 } 0277 0278 // Add the last word if still here - for broken htmls. 0279 if (!parsedbuf.isEmpty()) { 0280 tokenlist.push_back(parsedbuf.toLower()); 0281 } 0282 0283 return true; 0284 } 0285 0286 void Index::writeDict(QDataStream &stream) 0287 { 0288 stream << DICT_VERSION; 0289 stream << m_charssplit; 0290 stream << m_charsword; 0291 0292 // Document list 0293 stream << docList; 0294 0295 // Dictionary 0296 for (QHash<QString, Entry *>::ConstIterator it = dict.constBegin(); it != dict.constEnd(); ++it) { 0297 stream << it.key(); 0298 stream << (int)it.value()->documents.count(); 0299 stream << it.value()->documents; 0300 } 0301 } 0302 0303 bool Index::readDict(QDataStream &stream) 0304 { 0305 dict.clear(); 0306 docList.clear(); 0307 0308 QString key; 0309 int version, numOfDocs; 0310 0311 stream >> version; 0312 0313 if (version < 2) { 0314 return false; 0315 } 0316 0317 stream >> m_charssplit; 0318 stream >> m_charsword; 0319 0320 // Read the document list 0321 stream >> docList; 0322 0323 while (!stream.atEnd()) { 0324 stream >> key; 0325 stream >> numOfDocs; 0326 0327 QVector<Document> docs(numOfDocs); 0328 0329 stream >> docs; 0330 dict.insert(key, new Entry(docs)); 0331 } 0332 0333 return dict.size() > 0; 0334 } 0335 0336 QList<QUrl> Index::query(const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords, EBook *chmFile) 0337 { 0338 QList<Term> termList; 0339 0340 for (const auto &term : terms) { 0341 Entry *e = nullptr; 0342 0343 if (dict[term]) { 0344 e = dict[term]; 0345 termList.append(Term(term, e->documents.count(), e->documents)); 0346 } else { 0347 return QList<QUrl>(); 0348 } 0349 } 0350 0351 if (termList.isEmpty()) { 0352 return QList<QUrl>(); 0353 } 0354 0355 std::sort(termList.begin(), termList.end()); 0356 0357 QVector<Document> minDocs = termList.takeFirst().documents; 0358 for (const Term &t : std::as_const(termList)) { 0359 const QVector<Document> docs = t.documents; 0360 for (QVector<Document>::Iterator minDoc_it = minDocs.begin(); minDoc_it != minDocs.end();) { 0361 bool found = false; 0362 for (QVector<Document>::ConstIterator doc_it = docs.constBegin(); doc_it != docs.constEnd(); ++doc_it) { 0363 if ((*minDoc_it).docNumber == (*doc_it).docNumber) { 0364 (*minDoc_it).frequency += (*doc_it).frequency; 0365 found = true; 0366 break; 0367 } 0368 } 0369 if (!found) { 0370 minDoc_it = minDocs.erase(minDoc_it); 0371 } else { 0372 ++minDoc_it; 0373 } 0374 } 0375 } 0376 0377 QList<QUrl> results; 0378 std::sort(minDocs.begin(), minDocs.end()); 0379 if (termSeq.isEmpty()) { 0380 for (const Document &doc : std::as_const(minDocs)) { 0381 results << docList.at((int)doc.docNumber); 0382 } 0383 return results; 0384 } 0385 0386 QUrl fileName; 0387 for (const Document &doc : std::as_const(minDocs)) { 0388 fileName = docList[(int)doc.docNumber]; 0389 if (searchForPhrases(termSeq, seqWords, fileName, chmFile)) { 0390 results << fileName; 0391 } 0392 } 0393 0394 return results; 0395 } 0396 0397 bool Index::searchForPhrases(const QStringList &phrases, const QStringList &words, const QUrl &filename, EBook *chmFile) 0398 { 0399 QStringList parsed_document; 0400 0401 if (!parseDocumentToStringlist(chmFile, filename, parsed_document)) { 0402 return false; 0403 } 0404 0405 miniDict.clear(); 0406 0407 // Initialize the dictionary with the words in phrase(s) 0408 for (const QString &word : words) { 0409 miniDict.insert(word, new PosEntry(0)); 0410 } 0411 0412 // Fill the dictionary with the words from the document 0413 unsigned int word_offset = 3; 0414 for (QStringList::ConstIterator it = parsed_document.constBegin(); it != parsed_document.constEnd(); it++, word_offset++) { 0415 PosEntry *entry = miniDict[*it]; 0416 0417 if (entry) { 0418 entry->positions.append(word_offset); 0419 } 0420 } 0421 0422 // Dump it 0423 /* 0424 QDictIterator<PosEntry> it( miniDict ); 0425 for( ; it.current(); ++it ) 0426 { 0427 QString text( it.currentKey() ); 0428 QValueList<uint> pos = miniDict[text]->positions; 0429 for ( unsigned int i = 1; i < pos.size(); i++ ) 0430 text += " " + QString::number( pos[i] ); 0431 0432 qDebug( "%s", text.ascii()); 0433 } 0434 */ 0435 0436 QList<uint> first_word_positions; 0437 0438 for (QStringList::ConstIterator phrase_it = phrases.constBegin(); phrase_it != phrases.constEnd(); phrase_it++) { 0439 QStringList phrasewords = phrase_it->split(QLatin1Char(' ')); 0440 first_word_positions = miniDict[phrasewords[0]]->positions; 0441 0442 for (int j = 1; j < phrasewords.count(); ++j) { 0443 QList<uint> next_word_it = miniDict[phrasewords[j]]->positions; 0444 QList<uint>::iterator dict_it = first_word_positions.begin(); 0445 0446 while (dict_it != first_word_positions.end()) { 0447 if (next_word_it.indexOf(*dict_it + 1) != -1) { 0448 (*dict_it)++; 0449 ++dict_it; 0450 } else { 0451 dict_it = first_word_positions.erase(dict_it); 0452 } 0453 } 0454 } 0455 } 0456 0457 return !first_word_positions.isEmpty(); 0458 } 0459 0460 };