chm/lib/helper_search_index.cpp

0001 /*
0002     Kchmviewer - a CHM and EPUB file viewer with broad language support
0003     SPDX-FileCopyrightText: 2004-2014 George Yunaev gyunaev@ulduzsoft.com
0004
0005     SPDX-License-Identifier: GPL-3.0-or-later
0006 */
0007
0008 #include <QApplication>
0009 #include <QTextCodec>
0010
0011 #include "ebook.h"
0012 #include "ebook_search.h"
0013 #include "helper_search_index.h"
0014
// Format version of the serialized dictionary: writeDict() emits it as the first
// field of the stream and readDict() reads it back before anything else.
static constexpr int DICT_VERSION = 4;
0016
0017 namespace QtAs
0018 {
// Splitter characters: each of them ends the word being collected and is also added to the
// dictionary as a token of its own. This makes the dictionary MUCH larger, but ensures that for
// a fragment such as "window->print" a search for "print" as well as for "->print" will find it.
#define SPLIT_CHARACTERS QStringLiteral("!()*&^%#@[]{}':;,.?/|/?<>\\-+=~`")

// Characters that count as part of a word - for example, '_' is listed here, so a search for
// _debug will find only _debug.
#define WORD_CHARACTERS QStringLiteral("$_")
0026
0027 struct Term {
0028     Term()
0029         : frequency(-1)
0030     {
0031     }
0032     Term(const QString &t, int f, const QVector<Document> &l)
0033         : term(t)
0034         , frequency(f)
0035         , documents(l)
0036     {
0037     }
0038     QString term;
0039     int frequency;
0040     QVector<Document> documents;
0041     bool operator<(const Term &i2) const
0042     {
0043         return frequency < i2.frequency;
0044     }
0045 };
0046
0047 QDataStream &operator>>(QDataStream &s, Document &l)
0048 {
0049     s >> l.docNumber;
0050     s >> l.frequency;
0051     return s;
0052 }
0053
0054 QDataStream &operator<<(QDataStream &s, const Document l)
0055 {
0056     s << (short)l.docNumber;
0057     s << (short)l.frequency;
0058     return s;
0059 }
0060
0061 Index::Index()
0062     : QObject(nullptr)
0063 {
0064     lastWindowClosed = false;
0065     connect(qApp, &QGuiApplication::lastWindowClosed, this, &Index::setLastWinClosed);
0066 }
0067
0068 void Index::setLastWinClosed()
0069 {
0070     lastWindowClosed = true;
0071 }
0072
0073 bool Index::makeIndex(const QList<QUrl> &docs, EBook *chmFile)
0074 {
0075     if (docs.isEmpty()) {
0076         return false;
0077     }
0078
0079     docList = docs;
0080
0081     if (chmFile->hasFeature(EBook::FEATURE_ENCODING)) {
0082         entityDecoder.changeEncoding(QTextCodec::codecForName(chmFile->currentEncoding().toUtf8()));
0083     }
0084
0085     QList<QUrl>::ConstIterator it = docList.constBegin();
0086     int steps = docList.count() / 100;
0087
0088     if (!steps) {
0089         steps++;
0090     }
0091
0092     int prog = 0;
0093
0094     for (int i = 0; it != docList.constEnd(); ++it, ++i) {
0095         if (lastWindowClosed) {
0096             return false;
0097         }
0098
0099         const QUrl &filename = *it;
0100         QStringList terms;
0101
0102         if (parseDocumentToStringlist(chmFile, filename, terms)) {
0103             for (QStringList::ConstIterator tit = terms.constBegin(); tit != terms.constEnd(); ++tit) {
0104                 insertInDict(*tit, i);
0105             }
0106         }
0107
0108         if (i % steps == 0) {
0109             prog++;
0110             prog = qMin(prog, 99);
0111             Q_EMIT indexingProgress(prog, tr("Processing document %1").arg((*it).path()));
0112         }
0113     }
0114
0115     Q_EMIT indexingProgress(100, tr("Processing completed"));
0116     return true;
0117 }
0118
0119 void Index::insertInDict(const QString &str, int docNum)
0120 {
0121     Entry *e = nullptr;
0122     if (!dict.isEmpty()) {
0123         e = dict[str];
0124     }
0125
0126     if (e) {
0127         if (e->documents.last().docNumber != docNum) {
0128             e->documents.append(Document(docNum, 1));
0129         } else {
0130             e->documents.last().frequency++;
0131         }
0132     } else {
0133         dict.insert(str, new Entry(docNum));
0134     }
0135 }
0136
0137 bool Index::parseDocumentToStringlist(EBook *chmFile, const QUrl &filename, QStringList &tokenlist)
0138 {
0139     QString parsedbuf, parseentity, text;
0140
0141     if (!chmFile->getFileContentAsString(text, filename) || text.isEmpty()) {
0142         qWarning("Search index generator: could not retrieve the document content for %s", qPrintable(filename.toString()));
0143         return false;
0144     }
0145
0146     m_charssplit = SPLIT_CHARACTERS;
0147     m_charsword = WORD_CHARACTERS;
0148
0149     tokenlist.clear();
0150
0151     // State machine states
0152     enum state_t {
0153         STATE_OUTSIDE_TAGS,  // outside HTML tags; parse text
0154         STATE_IN_HTML_TAG,   // inside HTML tags; wait for end tag
0155         STATE_IN_QUOTES,     // inside HTML tags and inside quotes; wait for end quote (in var QuoteChar)
0156         STATE_IN_HTML_ENTITY // inside HTML entity; parse the entity
0157     };
0158
0159     state_t state = STATE_OUTSIDE_TAGS;
0160     QChar QuoteChar; // used in STATE_IN_QUOTES
0161
0162     for (int j = 0; j < text.length(); j++) {
0163         QChar ch = text[j];
0164
0165         if ((j % 20000) == 0) {
0166             qApp->processEvents(QEventLoop::ExcludeUserInputEvents);
0167         }
0168
0169         if (state == STATE_IN_HTML_TAG) {
0170             // We are inside HTML tag.
0171             // Ignore everything until we see '>' (end of HTML tag) or quote char (quote start)
0172             if (ch == QLatin1Char('"') || ch == QLatin1Char('\'')) {
0173                 state = STATE_IN_QUOTES;
0174                 QuoteChar = ch;
0175             } else if (ch == QLatin1Char('>')) {
0176                 state = STATE_OUTSIDE_TAGS;
0177             }
0178
0179             continue;
0180         } else if (state == STATE_IN_QUOTES) {
0181             // We are inside quoted text inside HTML tag.
0182             // Ignore everything until we see the quote character again
0183             if (ch == QuoteChar) {
0184                 state = STATE_IN_HTML_TAG;
0185             }
0186
0187             continue;
0188         } else if (state == STATE_IN_HTML_ENTITY) {
0189             // We are inside encoded HTML entity (like &nbsp;).
0190             // Collect to parsedbuf everything until we see ;
0191             if (ch.isLetterOrNumber()) {
0192                 // get next character of this entity
0193                 parseentity.append(ch);
0194                 continue;
0195             }
0196
0197             // The entity ended
0198             state = STATE_OUTSIDE_TAGS;
0199
0200             // Some shitty HTML does not terminate entities correctly. Screw it.
0201             if (ch != QLatin1Char(';') && ch != QLatin1Char('<')) {
0202                 if (parseentity.isEmpty()) {
0203                     // straight '&' symbol. Add and continue.
0204                     parsedbuf += QLatin1String("&");
0205                 } else {
0206                     qWarning("Index::parseDocument: incorrectly terminated HTML entity '&%s%c', ignoring", qPrintable(parseentity), ch.toLatin1());
0207                 }
0208
0209                 j--; // parse this character again, but in different state
0210                 continue;
0211             }
0212
0213             // Don't we have a space?
0214             if (parseentity.toLower() != QLatin1String("nbsp")) {
0215                 QString entity = entityDecoder.decode(parseentity);
0216
0217                 if (entity.isNull()) {
0218                     // decodeEntity() already printed error message
0219                     // qWarning( "Index::parseDocument: failed to decode entity &%s;", parsedbuf.ascii() );
0220                     continue;
0221                 }
0222
0223                 parsedbuf += entity;
0224                 continue;
0225             } else {
0226                 ch = QLatin1Char(' '); // We got a space, so treat it like it, and not add it to parsebuf
0227             }
0228         }
0229
0230         //
0231         // Now process STATE_OUTSIDE_TAGS
0232         //
0233
0234         // Check for start of HTML tag, and switch to STATE_IN_HTML_TAG if it is
0235         if (ch == QLatin1Char('<')) {
0236             state = STATE_IN_HTML_TAG;
0237             goto tokenize_buf;
0238         }
0239
0240         // Check for start of HTML entity
0241         if (ch == QLatin1Char('&')) {
0242             state = STATE_IN_HTML_ENTITY;
0243             parseentity = QString();
0244             continue;
0245         }
0246
0247         // Replace quote by ' - quotes are used in search window to set the phrase
0248         if (ch == QLatin1Char('"')) {
0249             ch = QLatin1Char('\'');
0250         }
0251
0252         // Ok, we have a valid character outside HTML tags, and probably some in buffer already.
0253         // If it is char or letter, add it and continue
0254         if (ch.isLetterOrNumber() || m_charsword.indexOf(ch) != -1) {
0255             parsedbuf.append(ch);
0256             continue;
0257         }
0258
0259         // If it is a split char, add the word to the dictionary, and then add the char itself.
0260         if (m_charssplit.indexOf(ch) != -1) {
0261             if (!parsedbuf.isEmpty()) {
0262                 tokenlist.push_back(parsedbuf.toLower());
0263             }
0264
0265             tokenlist.push_back(ch.toLower());
0266             parsedbuf = QString();
0267             continue;
0268         }
0269
0270     tokenize_buf:
0271         // Just add the word; it is most likely a space or terminated by tokenizer.
0272         if (!parsedbuf.isEmpty()) {
0273             tokenlist.push_back(parsedbuf.toLower());
0274             parsedbuf = QString();
0275         }
0276     }
0277
0278     // Add the last word if still here - for broken htmls.
0279     if (!parsedbuf.isEmpty()) {
0280         tokenlist.push_back(parsedbuf.toLower());
0281     }
0282
0283     return true;
0284 }
0285
0286 void Index::writeDict(QDataStream &stream)
0287 {
0288     stream << DICT_VERSION;
0289     stream << m_charssplit;
0290     stream << m_charsword;
0291
0292     // Document list
0293     stream << docList;
0294
0295     // Dictionary
0296     for (QHash<QString, Entry *>::ConstIterator it = dict.constBegin(); it != dict.constEnd(); ++it) {
0297         stream << it.key();
0298         stream << (int)it.value()->documents.count();
0299         stream << it.value()->documents;
0300     }
0301 }
0302
0303 bool Index::readDict(QDataStream &stream)
0304 {
0305     dict.clear();
0306     docList.clear();
0307
0308     QString key;
0309     int version, numOfDocs;
0310
0311     stream >> version;
0312
0313     if (version < 2) {
0314         return false;
0315     }
0316
0317     stream >> m_charssplit;
0318     stream >> m_charsword;
0319
0320     // Read the document list
0321     stream >> docList;
0322
0323     while (!stream.atEnd()) {
0324         stream >> key;
0325         stream >> numOfDocs;
0326
0327         QVector<Document> docs(numOfDocs);
0328
0329         stream >> docs;
0330         dict.insert(key, new Entry(docs));
0331     }
0332
0333     return dict.size() > 0;
0334 }
0335
0336 QList<QUrl> Index::query(const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords, EBook *chmFile)
0337 {
0338     QList<Term> termList;
0339
0340     for (const auto &term : terms) {
0341         Entry *e = nullptr;
0342
0343         if (dict[term]) {
0344             e = dict[term];
0345             termList.append(Term(term, e->documents.count(), e->documents));
0346         } else {
0347             return QList<QUrl>();
0348         }
0349     }
0350
0351     if (termList.isEmpty()) {
0352         return QList<QUrl>();
0353     }
0354
0355     std::sort(termList.begin(), termList.end());
0356
0357     QVector<Document> minDocs = termList.takeFirst().documents;
0358     for (const Term &t : std::as_const(termList)) {
0359         const QVector<Document> docs = t.documents;
0360         for (QVector<Document>::Iterator minDoc_it = minDocs.begin(); minDoc_it != minDocs.end();) {
0361             bool found = false;
0362             for (QVector<Document>::ConstIterator doc_it = docs.constBegin(); doc_it != docs.constEnd(); ++doc_it) {
0363                 if ((*minDoc_it).docNumber == (*doc_it).docNumber) {
0364                     (*minDoc_it).frequency += (*doc_it).frequency;
0365                     found = true;
0366                     break;
0367                 }
0368             }
0369             if (!found) {
0370                 minDoc_it = minDocs.erase(minDoc_it);
0371             } else {
0372                 ++minDoc_it;
0373             }
0374         }
0375     }
0376
0377     QList<QUrl> results;
0378     std::sort(minDocs.begin(), minDocs.end());
0379     if (termSeq.isEmpty()) {
0380         for (const Document &doc : std::as_const(minDocs)) {
0381             results << docList.at((int)doc.docNumber);
0382         }
0383         return results;
0384     }
0385
0386     QUrl fileName;
0387     for (const Document &doc : std::as_const(minDocs)) {
0388         fileName = docList[(int)doc.docNumber];
0389         if (searchForPhrases(termSeq, seqWords, fileName, chmFile)) {
0390             results << fileName;
0391         }
0392     }
0393
0394     return results;
0395 }
0396
0397 bool Index::searchForPhrases(const QStringList &phrases, const QStringList &words, const QUrl &filename, EBook *chmFile)
0398 {
0399     QStringList parsed_document;
0400
0401     if (!parseDocumentToStringlist(chmFile, filename, parsed_document)) {
0402         return false;
0403     }
0404
0405     miniDict.clear();
0406
0407     // Initialize the dictionary with the words in phrase(s)
0408     for (const QString &word : words) {
0409         miniDict.insert(word, new PosEntry(0));
0410     }
0411
0412     // Fill the dictionary with the words from the document
0413     unsigned int word_offset = 3;
0414     for (QStringList::ConstIterator it = parsed_document.constBegin(); it != parsed_document.constEnd(); it++, word_offset++) {
0415         PosEntry *entry = miniDict[*it];
0416
0417         if (entry) {
0418             entry->positions.append(word_offset);
0419         }
0420     }
0421
0422     // Dump it
0423     /*
0424         QDictIterator<PosEntry> it( miniDict );
0425         for( ; it.current(); ++it )
0426         {
0427             QString text( it.currentKey() );
0428             QValueList<uint> pos = miniDict[text]->positions;
0429             for ( unsigned int i = 1; i < pos.size(); i++ )
0430                 text += " " + QString::number( pos[i] );
0431
0432             qDebug( "%s", text.ascii());
0433         }
0434     */
0435
0436     QList<uint> first_word_positions;
0437
0438     for (QStringList::ConstIterator phrase_it = phrases.constBegin(); phrase_it != phrases.constEnd(); phrase_it++) {
0439         QStringList phrasewords = phrase_it->split(QLatin1Char(' '));
0440         first_word_positions = miniDict[phrasewords[0]]->positions;
0441
0442         for (int j = 1; j < phrasewords.count(); ++j) {
0443             QList<uint> next_word_it = miniDict[phrasewords[j]]->positions;
0444             QList<uint>::iterator dict_it = first_word_positions.begin();
0445
0446             while (dict_it != first_word_positions.end()) {
0447                 if (next_word_it.indexOf(*dict_it + 1) != -1) {
0448                     (*dict_it)++;
0449                     ++dict_it;
0450                 } else {
0451                     dict_it = first_word_positions.erase(dict_it);
0452                 }
0453             }
0454         }
0455     }
0456
0457     return !first_word_positions.isEmpty();
0458 }
0459
0460 };