File indexing completed on 2024-12-15 04:13:43
0001 /* 0002 Kchmviewer - a CHM and EPUB file viewer with broad language support 0003 SPDX-FileCopyrightText: 2004-2014 George Yunaev gyunaev@ulduzsoft.com 0004 0005 SPDX-License-Identifier: GPL-3.0-or-later 0006 */ 0007 0008 #include <QDebug> 0009 #include <QFile> 0010 #include <QVector> 0011 0012 #include "ebook_chm.h" 0013 #include "ebook_chm_encoding.h" 0014 0015 #include "bitfiddle.h" 0016 0017 // Big-enough buffer size for use with various routines. 0018 #define BUF_SIZE 4096 0019 #define COMMON_BUF_LEN 1025 0020 0021 #define TOPICS_ENTRY_LEN 16 0022 #define URLTBL_ENTRY_LEN 12 0023 0024 //#define DEBUGPARSER(A) qDebug A 0025 #define DEBUGPARSER(A) 0026 0027 #define URL_SCHEME_CHM QStringLiteral("ms-its") 0028 0029 EBook_CHM::EBook_CHM() 0030 : EBook() 0031 , m_chmFile(nullptr) 0032 , m_detectedLCID(0) 0033 , m_textCodec(nullptr) 0034 , m_textCodecForSpecialFiles(nullptr) 0035 , m_currentEncoding(QStringLiteral("UTF-8")) 0036 , m_envOptions(QString::fromLatin1(qgetenv("KCHMVIEWEROPTS"))) 0037 { 0038 } 0039 0040 EBook_CHM::~EBook_CHM() 0041 { 0042 close(); 0043 } 0044 0045 void EBook_CHM::close() 0046 { 0047 if (m_chmFile == nullptr) { 0048 return; 0049 } 0050 0051 chm_close(m_chmFile); 0052 0053 m_chmFile = nullptr; 0054 m_filename = m_font = QString(); 0055 0056 m_home.clear(); 0057 m_topicsFile.clear(); 0058 m_indexFile.clear(); 0059 0060 m_textCodec = nullptr; 0061 m_textCodecForSpecialFiles = nullptr; 0062 m_detectedLCID = 0; 0063 m_currentEncoding = QStringLiteral("UTF-8"); 0064 } 0065 0066 QString EBook_CHM::title() const 0067 { 0068 return encodeWithCurrentCodec(m_title); 0069 } 0070 0071 QUrl EBook_CHM::homeUrl() const 0072 { 0073 return pathToUrl(QString::fromUtf8(m_home)); 0074 } 0075 0076 bool EBook_CHM::hasFeature(EBook::Feature code) const 0077 { 0078 switch (code) { 0079 case FEATURE_TOC: 0080 return m_tocAvailable; 0081 0082 case FEATURE_INDEX: 0083 return m_indexAvailable; 0084 0085 case FEATURE_ENCODING: 0086 return true; 0087 } 0088 0089 return false; 0090 } 0091 0092 bool EBook_CHM::getTableOfContents(QList<EBookTocEntry> &toc) const 0093 { 0094 if (parseBinaryTOC(toc)) { 0095 return true; 0096 } 0097 0098 // Parse the plain text TOC 0099 QList<ParsedEntry> parsed; 0100 0101 if (!parseFileAndFillArray(QString::fromUtf8(m_topicsFile), parsed, false)) { 0102 return false; 0103 } 0104 0105 // Find out the root offset, and reduce the indent level to it 0106 // so the toc starts from zero offset. 0107 int root_offset = -1; 0108 0109 // Fill up the real toc 0110 toc.reserve(parsed.size()); 0111 for (const ParsedEntry &e : std::as_const(parsed)) { 0112 if (root_offset == -1) { 0113 root_offset = e.indent; 0114 } 0115 0116 EBookTocEntry entry; 0117 entry.iconid = (EBookTocEntry::Icon)e.iconid; 0118 entry.indent = e.indent - root_offset; 0119 entry.name = e.name; 0120 0121 if (!e.urls.empty()) { 0122 entry.url = e.urls[0]; 0123 } 0124 0125 toc.append(entry); 0126 } 0127 0128 return true; 0129 } 0130 0131 bool EBook_CHM::getIndex(QList<EBookIndexEntry> &index) const 0132 { 0133 // Parse the plain text index 0134 QList<ParsedEntry> parsed; 0135 0136 if (!parseFileAndFillArray(QString::fromUtf8(m_indexFile), parsed, true)) { 0137 return false; 0138 } 0139 0140 // Find out the root offset, and reduce the indent level to it 0141 // so the index starts from zero offset. 0142 int root_offset = 0; 0143 0144 // Fill up the real index 0145 index.reserve(parsed.size()); 0146 0147 // Find the index root offset 0148 const QList<ParsedEntry> &parsedList = parsed; 0149 for (const ParsedEntry &e : parsedList) { 0150 if (e.urls.empty()) { 0151 continue; 0152 } 0153 0154 root_offset = qMin(root_offset, e.indent); 0155 } 0156 0157 // And apply the index 0158 for (const ParsedEntry &e : parsedList) { 0159 if (e.urls.empty()) { 0160 continue; 0161 } 0162 0163 EBookIndexEntry entry; 0164 entry.name = e.name; 0165 entry.urls = e.urls; 0166 entry.seealso = e.seealso; 0167 0168 // If the index array is empty, make sure the first entry is on root offset 0169 if (index.isEmpty()) { 0170 entry.indent = root_offset; 0171 } else { 0172 entry.indent = e.indent - root_offset; 0173 } 0174 0175 index.append(entry); 0176 printf("%d: %s\n", entry.indent, qPrintable(entry.name)); 0177 } 0178 0179 return true; 0180 } 0181 0182 bool EBook_CHM::getFileContentAsString(QString &str, const QUrl &url) const 0183 { 0184 return getTextContent(str, urlToPath(url)); 0185 } 0186 0187 bool EBook_CHM::getFileContentAsBinary(QByteArray &data, const QUrl &url) const 0188 { 0189 return getBinaryContent(data, urlToPath(url)); 0190 } 0191 0192 bool EBook_CHM::getBinaryContent(QByteArray &data, const QString &url) const 0193 { 0194 chmUnitInfo ui; 0195 0196 if (!ResolveObject(url, &ui)) { 0197 return false; 0198 } 0199 0200 data.resize(ui.length); 0201 0202 if (RetrieveObject(&ui, (unsigned char *)data.data(), 0, ui.length)) { 0203 return true; 0204 } 0205 0206 return false; 0207 } 0208 0209 bool EBook_CHM::getTextContent(QString &str, const QString &url, bool internal_encoding) const 0210 { 0211 QByteArray buf; 0212 0213 if (getBinaryContent(buf, url)) { 0214 unsigned int length = buf.size(); 0215 0216 if (length > 0) { 0217 buf.resize(length + 1); 0218 buf[length] = '\0'; 0219 0220 str = internal_encoding ? QString::fromUtf8(buf.constData()) : encodeWithCurrentCodec(buf.constData()); 0221 return true; 0222 } 0223 } 0224 0225 return false; 0226 } 0227 0228 int EBook_CHM::getContentSize(const QString &url) 0229 { 0230 chmUnitInfo ui; 0231 0232 if (!ResolveObject(url, &ui)) { 0233 return -1; 0234 } 0235 0236 return ui.length; 0237 } 0238 0239 bool EBook_CHM::load(const QString &archiveName) 0240 { 0241 QString filename; 0242 0243 // If the file has a file:// prefix, remove it 0244 if (archiveName.startsWith(QLatin1String("file://"))) { 0245 filename = archiveName.mid(7); // strip it 0246 } else { 0247 filename = archiveName; 0248 } 0249 0250 if (m_chmFile) { 0251 close(); 0252 } 0253 0254 #if defined(WIN32) 0255 m_chmFile = chm_open((BSTR)QFile::encodeName(filename).constData()); 0256 #else 0257 m_chmFile = chm_open(QFile::encodeName(filename).constData()); 0258 #endif 0259 0260 if (m_chmFile == nullptr) { 0261 return false; 0262 } 0263 0264 m_filename = filename; 0265 0266 // Reset encoding 0267 m_textCodec = nullptr; 0268 m_textCodecForSpecialFiles = nullptr; 0269 m_currentEncoding = QStringLiteral("UTF-8"); 0270 0271 // Get information from /#WINDOWS and /#SYSTEM files (encoding, title, context file and so) 0272 // and guess the encoding 0273 getInfoFromWindows(); 0274 getInfoFromSystem(); 0275 guessTextEncoding(); 0276 0277 // Check whether the search tables are present 0278 if (ResolveObject(QStringLiteral("/#TOPICS"), &m_chmTOPICS) && ResolveObject(QStringLiteral("/#STRINGS"), &m_chmSTRINGS) && ResolveObject(QStringLiteral("/#URLTBL"), &m_chmURLTBL) && 0279 ResolveObject(QStringLiteral("/#URLSTR"), &m_chmURLSTR)) { 0280 m_lookupTablesValid = true; 0281 fillTopicsUrlMap(); 0282 } else { 0283 m_lookupTablesValid = false; 0284 } 0285 0286 // Some CHM files have toc and index files, but do not set the name properly. 0287 // Some heuristics here. 0288 if (m_topicsFile.isEmpty() && hasFile(QStringLiteral("/toc.hhc"))) { 0289 m_topicsFile = "/toc.hhc"; 0290 } 0291 0292 if (m_indexFile.isEmpty() && hasFile(QStringLiteral("/index.hhk"))) { 0293 m_indexFile = "/index.hhk"; 0294 } 0295 0296 if (!m_topicsFile.isEmpty() || (m_lookupTablesValid && hasFile(QStringLiteral("/#TOCIDX")))) { 0297 m_tocAvailable = true; 0298 } else { 0299 m_tocAvailable = false; 0300 } 0301 0302 if (!m_indexFile.isEmpty() || (m_lookupTablesValid && hasFile(QStringLiteral("/$WWKeywordLinks/BTree")))) { 0303 m_indexAvailable = true; 0304 } else { 0305 m_indexAvailable = false; 0306 } 0307 0308 return true; 0309 } 0310 0311 int EBook_CHM::findStringInQuotes(const QString &tag, int offset, QString &value, bool firstquote, bool decodeentities) const 0312 { 0313 int qbegin = tag.indexOf(QLatin1Char('"'), offset); 0314 0315 if (qbegin == -1) { 0316 qFatal("EBook_CHMImpl::findStringInQuotes: cannot find first quote in <param> tag: '%s'", qPrintable(tag)); 0317 } 0318 0319 int qend = firstquote ? tag.indexOf(QLatin1Char('"'), qbegin + 1) : tag.lastIndexOf(QLatin1Char('"')); 0320 0321 if (qend == -1 || qend <= qbegin) { 0322 qFatal("EBook_CHMImpl::findStringInQuotes: cannot find last quote in <param> tag: '%s'", qPrintable(tag)); 0323 } 0324 0325 // If we do not need to decode HTML entities, just return. 0326 if (decodeentities) { 0327 QString htmlentity = QString(); 0328 bool fill_entity = false; 0329 0330 value.reserve(qend - qbegin); // to avoid multiple memory allocations 0331 0332 for (int i = qbegin + 1; i < qend; i++) { 0333 if (!fill_entity) { 0334 if (tag[i] == QLatin1Char('&')) { // HTML entity starts 0335 fill_entity = true; 0336 } else { 0337 value.append(tag[i]); 0338 } 0339 } else { 0340 if (tag[i] == QLatin1Char(';')) // HTML entity ends 0341 { 0342 // If entity is an ASCII code, just decode it 0343 QString decode = m_htmlEntityDecoder.decode(htmlentity); 0344 0345 if (decode.isNull()) { 0346 break; 0347 } 0348 0349 value.append(decode); 0350 htmlentity = QString(); 0351 fill_entity = false; 0352 } else { 0353 htmlentity.append(tag[i]); 0354 } 0355 } 0356 } 0357 } else { 0358 value = tag.mid(qbegin + 1, qend - qbegin - 1); 0359 } 0360 0361 return qend + 1; 0362 } 0363 0364 bool EBook_CHM::parseFileAndFillArray(const QString &file, QList<ParsedEntry> &data, bool asIndex) const 0365 { 0366 QString src; 0367 const int MAX_NEST_DEPTH = 256; 0368 0369 if (!getTextContent(src, file) || src.isEmpty()) { 0370 return false; 0371 } 0372 0373 /* 0374 // Save the index for debugging purposes 0375 QFile outfile( "parsed.htm" ); 0376 0377 if ( outfile.open( QIODevice::WriteOnly ) ) 0378 { 0379 QTextStream textstream( &outfile ); 0380 textstream << src; 0381 outfile.close(); 0382 } 0383 */ 0384 0385 EBookTocEntry::Icon defaultimagenum = EBookTocEntry::IMAGE_AUTO; 0386 int pos = 0, indent = 0, root_indent_offset = 0; 0387 bool in_object = false, root_indent_offset_set = false; 0388 0389 ParsedEntry entry; 0390 entry.iconid = defaultimagenum; 0391 0392 // Split the HHC file by HTML tags 0393 int stringlen = src.length(); 0394 0395 while (pos < stringlen && (pos = src.indexOf(QLatin1Char('<'), pos)) != -1) { 0396 int i, word_end = 0; 0397 0398 for (i = ++pos; i < stringlen; i++) { 0399 // If a " or ' is found, skip to the next one. 0400 if ((src[i] == QLatin1Char('"') || src[i] == QLatin1Char('\''))) { 0401 // find where quote ends, either by another quote, or by '>' symbol (some people don't know HTML) 0402 int nextpos = src.indexOf(src[i], i + 1); 0403 if (nextpos == -1 && (nextpos = src.indexOf(QLatin1Char('>'), i + 1)) == -1) { 0404 qWarning("EBook_CHMImpl::ParseHhcAndFillTree: corrupted TOC: %s", qPrintable(src.mid(i))); 0405 return false; 0406 } 0407 0408 i = nextpos; 0409 } else if (src[i] == QLatin1Char('>')) { 0410 break; 0411 } else if (!src[i].isLetterOrNumber() && src[i] != QLatin1Char('/') && !word_end) { 0412 word_end = i; 0413 } 0414 } 0415 0416 QString tagword, tag = src.mid(pos, i - pos); 0417 0418 if (word_end) { 0419 tagword = src.mid(pos, word_end - pos).toLower(); 0420 } else { 0421 tagword = tag.toLower(); 0422 } 0423 0424 // DEBUGPARSER(("tag: '%s', tagword: '%s'\n", qPrintable( tag ), qPrintable( tagword ) )); 0425 0426 // <OBJECT type="text/sitemap"> - a topic entry 0427 if (tagword == QLatin1String("object") && tag.indexOf(QLatin1String("text/sitemap"), 0, Qt::CaseInsensitive) != -1) { 0428 in_object = true; 0429 } else if (tagword == QLatin1String("/object") && in_object) { 0430 // a topic entry closed. Add a tree item 0431 if (entry.name.isEmpty() && entry.urls.isEmpty()) { 0432 qWarning("EBook_CHMImpl::ParseAndFillTopicsTree: <object> tag is parsed, but both name and url are empty."); 0433 } else { 0434 // If the name is empty, use the URL as name 0435 if (entry.name.isEmpty()) { 0436 entry.name = entry.urls[0].toString(); 0437 } 0438 0439 if (!root_indent_offset_set) { 0440 root_indent_offset_set = true; 0441 root_indent_offset = indent; 0442 0443 if (root_indent_offset > 1) { 0444 qWarning("CHM has improper index; root indent offset is %d", root_indent_offset); 0445 } 0446 } 0447 0448 // Trim the entry name 0449 entry.name = entry.name.trimmed(); 0450 0451 int real_indent = indent - root_indent_offset; 0452 0453 entry.indent = real_indent; 0454 data.push_back(entry); 0455 } 0456 0457 entry.name = QString(); 0458 entry.urls.clear(); 0459 entry.iconid = defaultimagenum; 0460 entry.seealso.clear(); 0461 in_object = false; 0462 } else if (tagword == QLatin1String("param") && in_object) { 0463 // <param name="Name" value="First Page"> 0464 int offset; // strlen("param ") 0465 const QString name_pattern = QStringLiteral("name="), value_pattern = QStringLiteral("value="); 0466 QString pname, pvalue; 0467 0468 if ((offset = tag.indexOf(name_pattern, 0, Qt::CaseInsensitive)) == -1) { 0469 qFatal("EBook_CHMImpl::ParseAndFillTopicsTree: bad <param> tag '%s': no name=\n", qPrintable(tag)); 0470 } 0471 0472 // offset+5 skips 'name=' 0473 offset = findStringInQuotes(tag, offset + name_pattern.length(), pname, true, false); 0474 pname = pname.toLower(); 0475 0476 if ((offset = tag.indexOf(value_pattern, offset, Qt::CaseInsensitive)) == -1) { 0477 qFatal("EBook_CHMImpl::ParseAndFillTopicsTree: bad <param> tag '%s': no value=\n", qPrintable(tag)); 0478 } 0479 0480 // offset+6 skips 'value=' 0481 findStringInQuotes(tag, offset + value_pattern.length(), pvalue, false, true); 0482 0483 // DEBUGPARSER(("<param>: name '%s', value '%s'", qPrintable( pname ), qPrintable( pvalue ))); 0484 0485 if (pname == QLatin1String("name") || pname == QLatin1String("keyword")) { 0486 // Some help files contain duplicate names, where the second name is empty. Work it around by keeping the first one 0487 if (!pvalue.isEmpty()) { 0488 entry.name = pvalue; 0489 } 0490 } else if (pname == QLatin1String("merge")) { 0491 // MERGE implementation is experimental 0492 QUrl mergeurl = pathToUrl(pvalue); 0493 QString mergecontent; 0494 0495 if (getFileContentAsString(mergecontent, mergeurl) && !mergecontent.isEmpty()) { 0496 qWarning("MERGE is used in index; the implementation is experimental. Please let me know if it works"); 0497 0498 // Merge the read value into the current parsed file. 0499 // To save memory it is done in a kinda hacky way: 0500 src = mergecontent + src.mid(i); 0501 pos = 0; 0502 stringlen = src.length(); 0503 } else { 0504 qWarning("MERGE is used in index but file %s was not found in CHM archive", qPrintable(pvalue)); 0505 } 0506 } else if (pname == QLatin1String("local")) { 0507 // Check for URL duplication 0508 QUrl url = pathToUrl(pvalue); 0509 0510 if (!entry.urls.contains(url)) { 0511 entry.urls.push_back(url); 0512 } 0513 } else if (pname == QLatin1String("see also") && asIndex && entry.name != pvalue) { 0514 entry.urls.push_back(QUrl(QStringLiteral("seealso"))); 0515 entry.seealso = pvalue; 0516 } else if (pname == QLatin1String("imagenumber")) { 0517 bool bok; 0518 int imgnum = pvalue.toInt(&bok); 0519 0520 if (bok && imgnum >= 0 && imgnum < EBookTocEntry::MAX_BUILTIN_ICONS) { 0521 entry.iconid = (EBookTocEntry::Icon)imgnum; 0522 } 0523 } 0524 } else if (tagword == QLatin1String("ul")) // increase indent level 0525 { 0526 // Fix for buggy help files 0527 if (++indent >= MAX_NEST_DEPTH) { 0528 qFatal("EBook_CHMImpl::ParseAndFillTopicsTree: max nest depth (%d) is reached, error in help file", MAX_NEST_DEPTH); 0529 } 0530 0531 DEBUGPARSER(("<ul>: new intent is %d\n", indent - root_indent_offset)); 0532 } else if (tagword == QLatin1String("/ul")) // decrease indent level 0533 { 0534 if (--indent < root_indent_offset) { 0535 indent = root_indent_offset; 0536 } 0537 0538 DEBUGPARSER(("</ul>: new intent is %d\n", indent - root_indent_offset)); 0539 } 0540 0541 pos = i; 0542 } 0543 0544 // Dump our array 0545 // for ( int i = 0; i < data.size(); i++ ) 0546 // qDebug() << data[i].indent << data[i].name << data[i].urls; 0547 0548 return true; 0549 } 0550 0551 bool EBook_CHM::ResolveObject(const QString &fileName, chmUnitInfo *ui) const 0552 { 0553 return m_chmFile != nullptr && ::chm_resolve_object(m_chmFile, qPrintable(fileName), ui) == CHM_RESOLVE_SUCCESS; 0554 } 0555 0556 bool EBook_CHM::hasFile(const QString &fileName) const 0557 { 0558 chmUnitInfo ui; 0559 0560 return m_chmFile != nullptr && ::chm_resolve_object(m_chmFile, qPrintable(fileName), &ui) == CHM_RESOLVE_SUCCESS; 0561 } 0562 0563 size_t EBook_CHM::RetrieveObject(const chmUnitInfo *ui, unsigned char *buffer, LONGUINT64 fileOffset, LONGINT64 bufferSize) const 0564 { 0565 return ::chm_retrieve_object(m_chmFile, const_cast<chmUnitInfo *>(ui), buffer, fileOffset, bufferSize); 0566 } 0567 0568 bool EBook_CHM::getInfoFromWindows() 0569 { 0570 #define WIN_HEADER_LEN 0x08 0571 unsigned char buffer[BUF_SIZE]; 0572 unsigned int factor; 0573 chmUnitInfo ui; 0574 long size = 0; 0575 0576 if (ResolveObject(QStringLiteral("/#WINDOWS"), &ui)) { 0577 if (!RetrieveObject(&ui, buffer, 0, WIN_HEADER_LEN)) { 0578 return false; 0579 } 0580 0581 unsigned int entries = get_int32_le(reinterpret_cast<unsigned int *>(buffer)); 0582 unsigned int entry_size = get_int32_le(reinterpret_cast<unsigned int *>(buffer + 0x04)); 0583 0584 QVector<unsigned char> uptr(entries * entry_size); 0585 unsigned char *raw = (unsigned char *)uptr.data(); 0586 0587 if (!RetrieveObject(&ui, raw, 8, entries * entry_size)) { 0588 return false; 0589 } 0590 0591 if (!ResolveObject(QStringLiteral("/#STRINGS"), &ui)) { 0592 return false; 0593 } 0594 0595 for (unsigned int i = 0; i < entries; ++i) { 0596 unsigned int offset = i * entry_size; 0597 0598 unsigned int off_title = get_int32_le(reinterpret_cast<unsigned int *>(raw + offset + 0x14)); 0599 unsigned int off_home = get_int32_le(reinterpret_cast<unsigned int *>(raw + offset + 0x68)); 0600 unsigned int off_hhc = get_int32_le(reinterpret_cast<unsigned int *>(raw + offset + 0x60)); 0601 unsigned int off_hhk = get_int32_le(reinterpret_cast<unsigned int *>(raw + offset + 0x64)); 0602 0603 factor = off_title / 4096; 0604 0605 if (size == 0) { 0606 size = RetrieveObject(&ui, buffer, factor * 4096, BUF_SIZE); 0607 } 0608 0609 if (size && off_title) { 0610 m_title = QByteArray((const char *)(buffer + off_title % 4096)); 0611 } 0612 0613 if (factor != off_home / 4096) { 0614 factor = off_home / 4096; 0615 size = RetrieveObject(&ui, buffer, factor * 4096, BUF_SIZE); 0616 } 0617 0618 if (size && off_home) { 0619 m_home = QByteArray("/") + QByteArray((const char *)buffer + off_home % 4096); 0620 } 0621 0622 if (factor != off_hhc / 4096) { 0623 factor = off_hhc / 4096; 0624 size = RetrieveObject(&ui, buffer, factor * 4096, BUF_SIZE); 0625 } 0626 0627 if (size && off_hhc) { 0628 m_topicsFile = QByteArray("/") + QByteArray((const char *)buffer + off_hhc % 4096); 0629 } 0630 0631 if (factor != off_hhk / 4096) { 0632 factor = off_hhk / 4096; 0633 size = RetrieveObject(&ui, buffer, factor * 4096, BUF_SIZE); 0634 } 0635 0636 if (size && off_hhk) { 0637 m_indexFile = QByteArray("/") + QByteArray((const char *)buffer + off_hhk % 4096); 0638 } 0639 } 0640 } 0641 return true; 0642 } 0643 0644 bool EBook_CHM::getInfoFromSystem() 0645 { 0646 unsigned char buffer[BUF_SIZE]; 0647 chmUnitInfo ui; 0648 0649 int index = 0; 0650 unsigned char *cursor = nullptr, *p; 0651 unsigned short value = 0; 0652 long size = 0; 0653 0654 // Run the first loop to detect the encoding. We need this, because title could be 0655 // already encoded in user encoding. Same for file names 0656 if (!ResolveObject(QStringLiteral("/#SYSTEM"), &ui)) { 0657 return false; 0658 } 0659 0660 // Can we pull BUFF_SIZE bytes of the #SYSTEM file? 0661 if ((size = RetrieveObject(&ui, buffer, 4, BUF_SIZE)) == 0) { 0662 return false; 0663 } 0664 0665 buffer[size - 1] = 0; 0666 0667 // First loop to detect the encoding 0668 for (index = 0; index < (size - 1 - (long)sizeof(unsigned short));) { 0669 cursor = buffer + index; 0670 value = UINT16ARRAY(cursor); 0671 0672 switch (value) { 0673 case 0: 0674 index += 2; 0675 cursor = buffer + index; 0676 0677 if (m_topicsFile.isEmpty()) { 0678 m_topicsFile = QByteArray("/") + QByteArray((const char *)buffer + index + 2); 0679 } 0680 0681 break; 0682 0683 case 1: 0684 index += 2; 0685 cursor = buffer + index; 0686 0687 if (m_indexFile.isEmpty()) { 0688 m_indexFile = QByteArray("/") + QByteArray((const char *)buffer + index + 2); 0689 } 0690 break; 0691 0692 case 2: 0693 index += 2; 0694 cursor = buffer + index; 0695 0696 if (m_home.isEmpty() || m_home == "/") { 0697 m_home = QByteArray("/") + QByteArray((const char *)buffer + index + 2); 0698 } 0699 break; 0700 0701 case 3: 0702 index += 2; 0703 cursor = buffer + index; 0704 m_title = QByteArray((const char *)(buffer + index + 2)); 0705 break; 0706 0707 case 4: 0708 index += 2; 0709 cursor = buffer + index; 0710 0711 p = buffer + index + 2; 0712 m_detectedLCID = (short)(p[0] | (p[1] << 8)); 0713 0714 break; 0715 0716 case 6: 0717 index += 2; 0718 cursor = buffer + index; 0719 0720 if (m_topicsFile.isEmpty()) { 0721 QString topicAttempt = QStringLiteral("/"); 0722 topicAttempt += QString(QString::fromUtf8((const char *)buffer + index + 2)); 0723 0724 QString tmp = topicAttempt + QStringLiteral(".hhc"); 0725 0726 if (ResolveObject(tmp, &ui)) { 0727 m_topicsFile = qPrintable(tmp); 0728 } 0729 0730 tmp = topicAttempt + QStringLiteral(".hhk"); 0731 0732 if (ResolveObject(tmp, &ui)) { 0733 m_indexFile = qPrintable(tmp); 0734 } 0735 } 0736 break; 0737 0738 case 16: 0739 index += 2; 0740 cursor = buffer + index; 0741 0742 m_font = QString(QString::fromUtf8((const char *)buffer + index + 2)); 0743 break; 0744 0745 default: 0746 index += 2; 0747 cursor = buffer + index; 0748 } 0749 0750 value = UINT16ARRAY(cursor); 0751 index += value + 2; 0752 } 0753 0754 return true; 0755 } 0756 0757 QString EBook_CHM::getTopicByUrl(const QUrl &url) 0758 { 0759 QMap<QUrl, QString>::const_iterator it = m_url2topics.constFind(url); 0760 0761 if (it == m_url2topics.constEnd()) { 0762 return QString(); 0763 } 0764 0765 return it.value(); 0766 } 0767 0768 static int chm_enumerator_callback(struct chmFile *, struct chmUnitInfo *ui, void *context) 0769 { 0770 EBook_CHM tmp; 0771 ((QList<QUrl> *)context)->push_back(tmp.pathToUrl(QString::fromUtf8(ui->path))); 0772 return CHM_ENUMERATOR_CONTINUE; 0773 } 0774 0775 bool EBook_CHM::enumerateFiles(QList<QUrl> &files) 0776 { 0777 files.clear(); 0778 return chm_enumerate(m_chmFile, CHM_ENUMERATE_ALL, chm_enumerator_callback, &files); 0779 } 0780 0781 QString EBook_CHM::currentEncoding() const 0782 { 0783 return m_currentEncoding; 0784 } 0785 0786 bool EBook_CHM::setCurrentEncoding(const char *encoding) 0787 { 0788 m_currentEncoding = QString::fromUtf8(encoding); 0789 return changeFileEncoding(m_currentEncoding); 0790 } 0791 0792 bool EBook_CHM::isSupportedUrl(const QUrl &url) 0793 { 0794 return url.scheme() == URL_SCHEME_CHM; 0795 } 0796 0797 bool EBook_CHM::guessTextEncoding() 0798 { 0799 if (!m_detectedLCID) { 0800 qWarning("Could not detect LCID"); 0801 return false; 0802 } 0803 0804 QString enc = Ebook_CHM_Encoding::guessByLCID(m_detectedLCID); 0805 0806 if (changeFileEncoding(enc)) { 0807 m_currentEncoding = enc; 0808 return true; 0809 } 0810 0811 return false; 0812 } 0813 0814 bool EBook_CHM::changeFileEncoding(const QString &qtencoding) 0815 { 0816 // Encoding could be either simple Qt codepage, or set like CP1251/KOI8, which allows to 0817 // set up encodings separately for text (first) and internal files (second) 0818 int p = qtencoding.indexOf(QLatin1Char('/')); 0819 0820 if (p != -1) { 0821 QString global = qtencoding.left(p); 0822 QString special = qtencoding.mid(p + 1); 0823 0824 m_textCodec = QTextCodec::codecForName(global.toUtf8()); 0825 0826 if (!m_textCodec) { 0827 qWarning("Could not set up Text Codec for encoding '%s'", qPrintable(global)); 0828 return false; 0829 } 0830 0831 m_textCodecForSpecialFiles = QTextCodec::codecForName(special.toUtf8()); 0832 0833 if (!m_textCodecForSpecialFiles) { 0834 qWarning("Could not set up Text Codec for encoding '%s'", qPrintable(special)); 0835 return false; 0836 } 0837 } else { 0838 m_textCodecForSpecialFiles = m_textCodec = QTextCodec::codecForName(qtencoding.toUtf8()); 0839 0840 if (!m_textCodec) { 0841 qWarning("Could not set up Text Codec for encoding '%s'", qPrintable(qtencoding)); 0842 return false; 0843 } 0844 } 0845 0846 m_htmlEntityDecoder.changeEncoding(m_textCodec); 0847 return true; 0848 } 0849 0850 void EBook_CHM::fillTopicsUrlMap() 0851 { 0852 if (!m_lookupTablesValid) { 0853 return; 0854 } 0855 0856 // Read those tables 0857 QVector<unsigned char> topics(m_chmTOPICS.length), urltbl(m_chmURLTBL.length), urlstr(m_chmURLSTR.length), strings(m_chmSTRINGS.length); 0858 0859 if (!RetrieveObject(&m_chmTOPICS, (unsigned char *)topics.data(), 0, m_chmTOPICS.length) || !RetrieveObject(&m_chmURLTBL, (unsigned char *)urltbl.data(), 0, m_chmURLTBL.length) || 0860 !RetrieveObject(&m_chmURLSTR, (unsigned char *)urlstr.data(), 0, m_chmURLSTR.length) || !RetrieveObject(&m_chmSTRINGS, (unsigned char *)strings.data(), 0, m_chmSTRINGS.length)) { 0861 return; 0862 } 0863 0864 for (LONGUINT64 i = 0; i < m_chmTOPICS.length; i += TOPICS_ENTRY_LEN) { 0865 unsigned int off_title = get_int32_le(reinterpret_cast<unsigned int *>(topics.data() + i + 4)); 0866 unsigned int off_url = get_int32_le(reinterpret_cast<unsigned int *>(topics.data() + i + 8)); 0867 off_url = get_int32_le(reinterpret_cast<unsigned int *>(urltbl.data() + off_url + 8)) + 8; 0868 0869 QUrl url = pathToUrl(QString::fromUtf8((const char *)urlstr.data() + off_url)); 0870 0871 if (off_title < (unsigned int)strings.size()) { 0872 m_url2topics[url] = encodeWithCurrentCodec((const char *)strings.data() + off_title); 0873 } else { 0874 m_url2topics[url] = QStringLiteral("Untitled"); 0875 } 0876 } 0877 } 0878 0879 bool EBook_CHM::parseBinaryTOC(QList<EBookTocEntry> &toc) const 0880 { 0881 if (!m_lookupTablesValid) { 0882 return false; 0883 } 0884 0885 QByteArray tocidx, topics, urltbl, urlstr, strings; 0886 0887 // Read the index tables 0888 if (!getBinaryContent(tocidx, QStringLiteral("/#TOCIDX")) || !getBinaryContent(topics, QStringLiteral("/#TOPICS")) || !getBinaryContent(urltbl, QStringLiteral("/#URLTBL")) || !getBinaryContent(urlstr, QStringLiteral("/#URLSTR")) || 0889 !getBinaryContent(strings, QStringLiteral("/#STRINGS"))) { 0890 return false; 0891 } 0892 0893 // Shamelessly stolen from xchm 0894 if (!RecurseLoadBTOC(tocidx, topics, urltbl, urlstr, strings, UINT32ARRAY(tocidx.data()), toc, 0)) { 0895 qWarning("Failed to parse binary TOC, fallback to text-based TOC"); 0896 toc.clear(); 0897 return false; 0898 } 0899 0900 return true; 0901 } 0902 0903 // 0904 // This piece of code was based on the one in xchm written by Razvan Cojocaru <razvanco@gmx.net> 0905 // 0906 bool EBook_CHM::RecurseLoadBTOC(const QByteArray &tocidx, const QByteArray &topics, const QByteArray &urltbl, const QByteArray &urlstr, const QByteArray &strings, int offset, QList<EBookTocEntry> &entries, int level) const 0907 { 0908 while (offset) { 0909 // If this is end of TOCIDX, return. 0910 if (tocidx.size() < offset + 20) { 0911 return true; 0912 } 0913 0914 unsigned int flags = UINT32ARRAY(tocidx.data() + offset + 4); 0915 int index = UINT32ARRAY(tocidx.data() + offset + 8); 0916 0917 if ((flags & 0x04) || (flags & 0x08)) { 0918 QString name, value; 0919 0920 if ((flags & 0x08) == 0) { 0921 if (strings.size() < index + 1) { 0922 qWarning("EBook_CHM::RecurseLoadBTOC: invalid name index (%d) for book TOC entry!", index); 0923 return false; 0924 } 0925 0926 name = encodeWithCurrentCodec(strings.data() + index); 0927 } else { 0928 if (topics.size() < (index * 16) + 12) { 0929 qWarning("EBook_CHM::RecurseLoadBTOC: invalid name index (%d) for local TOC entry!", index); 0930 return false; 0931 } 0932 0933 int tocoffset = (int)UINT32ARRAY(topics.data() + (index * 16) + 4); 0934 0935 if (strings.size() < tocoffset + 1) { 0936 qWarning("EBook_CHM::RecurseLoadBTOC: invalid name tocoffset (%d) for TOC entry!", tocoffset); 0937 return false; 0938 } 0939 0940 if (tocoffset < 0) { 0941 name.clear(); 0942 } else { 0943 name = encodeWithCurrentCodec(strings.data() + tocoffset); 0944 } 0945 0946 // #URLTBL index 0947 tocoffset = (int)UINT32ARRAY(topics.data() + (index * 16) + 8); 0948 0949 if (tocoffset < 0 || urltbl.size() < tocoffset + 12) { 0950 qWarning("EBook_CHM::RecurseLoadBTOC: invalid url index (%d) for TOC entry!", tocoffset); 0951 return false; 0952 } 0953 0954 tocoffset = (int)UINT32ARRAY(urltbl.data() + tocoffset + 8); 0955 0956 if (tocoffset < 0 || urlstr.size() < tocoffset) { 0957 qWarning("EBook_CHM::RecurseLoadBTOC: invalid url offset (%d) for TOC entry!", tocoffset); 0958 return false; 0959 } 0960 0961 value = encodeWithCurrentCodec(urlstr.data() + tocoffset + 8); 0962 } 0963 0964 EBookTocEntry entry; 0965 entry.name = name.trimmed(); 0966 0967 if (!entry.name.isEmpty()) { 0968 if (!value.isEmpty()) { 0969 entry.url = pathToUrl(value); 0970 } 0971 0972 entry.iconid = EBookTocEntry::IMAGE_AUTO; 0973 entry.indent = level; 0974 entries.push_back(entry); 0975 } 0976 } 0977 0978 if (flags & 0x04) { 0979 // book 0980 if (tocidx.size() < offset + 24) { 0981 qWarning("EBook_CHM::RecurseLoadBTOC: invalid child entry offset (%d)", offset); 0982 return false; 0983 } 0984 0985 unsigned int childoffset = UINT32ARRAY(tocidx.data() + offset + 20); 0986 0987 if (childoffset) { 0988 if (!RecurseLoadBTOC(tocidx, topics, urltbl, urlstr, strings, childoffset, entries, level + 1)) { 0989 return false; 0990 } 0991 } 0992 } 0993 0994 offset = UINT32ARRAY(tocidx.data() + offset + 0x10); 0995 } 0996 0997 return true; 0998 } 0999 1000 bool EBook_CHM::hasOption(const QString &name) const 1001 { 1002 if (!m_envOptions.isEmpty() && m_envOptions.contains(name)) { 1003 return true; 1004 } 1005 1006 return false; 1007 } 1008 1009 QUrl EBook_CHM::pathToUrl(const QString &link) const 1010 { 1011 if (link.startsWith(QLatin1String("http://")) || link.startsWith(QLatin1String("https://"))) { 1012 return QUrl(link); 1013 } 1014 1015 QUrl url; 1016 url.setScheme(URL_SCHEME_CHM); 1017 url.setHost(URL_SCHEME_CHM); 1018 1019 // Does the link contain the fragment as well? 1020 int off = link.indexOf(QLatin1Char('#')); 1021 QString path; 1022 1023 if (off != -1) { 1024 path = link.left(off); 1025 url.setFragment(link.mid(off + 1)); 1026 } else { 1027 path = link; 1028 } 1029 1030 if (!path.startsWith(QLatin1Char('/'))) { 1031 path.prepend(QLatin1Char('/')); 1032 } 1033 1034 url.setPath(QUrl::fromPercentEncoding(path.toUtf8())); 1035 return url; 1036 } 1037 1038 QString EBook_CHM::urlToPath(const QUrl &link) const 1039 { 1040 if (link.scheme() == URL_SCHEME_CHM) { 1041 if (link.path() == QLatin1String("/") || link.path().isEmpty()) { 1042 return QString::fromUtf8(m_home); 1043 } 1044 1045 return link.path(); 1046 } 1047 1048 return QLatin1String(""); 1049 } 1050 1051 EBook_CHM::ParsedEntry::ParsedEntry() 1052 { 1053 iconid = 0; 1054 indent = 0; 1055 }