// File indexing completed on 2024-06-16 04:20:01
0001 /* 0002 Kchmviewer - a CHM and EPUB file viewer with broad language support 0003 SPDX-FileCopyrightText: 2004-2014 George Yunaev gyunaev@ulduzsoft.com 0004 0005 SPDX-License-Identifier: GPL-3.0-or-later 0006 */ 0007 0008 #ifndef EBOOK_CHM_H 0009 #define EBOOK_CHM_H 0010 0011 #include <QMap> 0012 #include <QTextCodec> 0013 0014 // Enable Unicode use in libchm 0015 #if defined(WIN32) 0016 #define PPC_BSTR 0017 #endif 0018 #include <chm_lib.h> 0019 0020 #include "ebook.h" 0021 #include "helper_entitydecoder.h" 0022 0023 class EBook_CHM : public EBook 0024 { 0025 public: 0026 EBook_CHM(); 0027 ~EBook_CHM() override; 0028 0029 /*! 0030 * \brief Attempts to load chm file. 0031 * \param archiveName filename. 0032 * \return EBook object on success, NULL on failure. 0033 * 0034 * Loads a CHM file. For CHM files it could internally load more than one file, 0035 * if files linked to this one are present locally (like MSDN). 0036 * \ingroup init 0037 */ 0038 bool load(const QString &archiveName) override; 0039 0040 /*! 0041 * \brief Closes all the files, and frees the appropriate data. 0042 * \ingroup init 0043 */ 0044 void close() override; 0045 0046 /*! 0047 * \brief Gets the title name of the opened ebook. 0048 * \return The name of the opened document, or an empty string if no ebook has been loaded. 0049 * \ingroup information 0050 */ 0051 QString title() const override; 0052 0053 /*! 0054 * \brief Gets the default URL of the e-book which should be opened when the book it first open 0055 * 0056 * \return The home page name, with a '/' added in front and relative to 0057 * the root of the archive filesystem. If no book has been opened, returns "/". 0058 * \ingroup information 0059 */ 0060 QUrl homeUrl() const override; 0061 0062 /*! 0063 * \brief Checks whether the specific feature is present in this file. 0064 * \return true if it is available; false otherwise. 0065 * \ingroup information 0066 */ 0067 bool hasFeature(Feature code) const override; 0068 0069 /*! 
0070 * \brief Parses and fills up the Table of Contents (TOC) 0071 * \param topics A pointer to the container which will store the parsed results. 0072 * Will be cleaned before parsing. 0073 * \return true if the tree is present and parsed successfully, false otherwise. 0074 * The parser is built to be error-prone, however it still can abort with qFatal() 0075 * by really buggy files; please report a bug if the file is opened ok under Windows. 0076 * \ingroup fileparsing 0077 */ 0078 bool getTableOfContents(QList<EBookTocEntry> &toc) const override; 0079 0080 /*! 0081 * \brief Parses the index table 0082 * \param indexes A pointer to the container which will store the parsed results. 0083 * Will be cleaned before parsing. 0084 * \return true if the tree is present and parsed successfully, false otherwise. 0085 * The parser is built to be error-prone, however it still can abort with qFatal() 0086 * by really buggy chm file; so far it never happened on indexes. 0087 * \ingroup fileparsing 0088 */ 0089 bool getIndex(QList<EBookIndexEntry> &index) const override; 0090 0091 /*! 0092 * \brief Retrieves the content associated with the url from the current ebook as QString. 0093 * \param str A string where the retreived content should be stored. 0094 * \param url An URL in chm file to retreive content from. Must be absolute. 0095 * \return true if the content is successfully received; false otherwise. Note content may be an empty string. 0096 * 0097 * This function retreives the file content (mostly for HTML pages) from the ebook. Because the content 0098 * in chm file might not be stored in Unicode, it will be recoded according to current encoding. 0099 * Do not use for binary data. 0100 * 0101 * \sa setCurrentEncoding() currentEncoding() getFileContentAsBinary() 0102 * \ingroup dataretrieve 0103 */ 0104 bool getFileContentAsString(QString &str, const QUrl &url) const override; 0105 0106 /*! 0107 * \brief Retrieves the content from url in current chm file to QByteArray. 
0108 * \param data A data array where the retreived content should be stored. 0109 * \param url An URL in chm file to retreive content from. Must be absolute. 0110 * \return true if the content is successfully received; false otherwise. 0111 * 0112 * This function retreives the file content from the chm archive opened by load() 0113 * function. The content is not encoded. 0114 * 0115 * \sa getFileContentAsString() 0116 * \ingroup dataretrieve 0117 */ 0118 bool getFileContentAsBinary(QByteArray &data, const QUrl &url) const override; 0119 0120 /*! 0121 * \brief Retrieves the content size. 0122 * \param url An URL in ebook file to retreive content from. Must be absolute. 0123 * \return the size; -1 in case of error. 0124 * 0125 * \ingroup dataretrieve 0126 */ 0127 virtual int getContentSize(const QString &url); 0128 0129 /*! 0130 * \brief Obtains the list of all the files (URLs) in current ebook archive. This is used in search 0131 * and to dump the e-book content. 0132 * \param files An array to store list of URLs (file names) present in chm archive. 0133 * \return true if the enumeration succeed; false otherwise (I could hardly imagine a reason). 0134 * 0135 * \ingroup dataretrieve 0136 */ 0137 bool enumerateFiles(QList<QUrl> &files) override; 0138 0139 /*! 0140 * \brief Gets the Title of the page referenced by url. 0141 * \param url An URL in ebook file to get title from. Must be absolute. 0142 * \return The title, or QString() if the URL cannot be found or not a HTML page. 0143 * 0144 * \ingroup dataretrieve 0145 */ 0146 QString getTopicByUrl(const QUrl &url) override; 0147 0148 /*! 0149 * \brief Gets the current ebook encoding (set or autodetected) as qtcodec 0150 * \return The current encoding. 0151 * 0152 * \ingroup encoding 0153 */ 0154 QString currentEncoding() const override; 0155 0156 /*! 0157 * \brief Sets the ebook encoding to use for TOC and content 0158 * \param encoding An encoding to use. 
0159 * 0160 * \ingroup encoding 0161 */ 0162 bool setCurrentEncoding(const char *encoding) override; 0163 0164 /*! 0165 * \brief Checks if this kind of URL is supported by the ebook format (i.e. could be passed to ebook functions) 0166 * \param url The url to check 0167 */ 0168 bool isSupportedUrl(const QUrl &url) override; 0169 0170 // Converts the string to the ebook-specific URL format 0171 QUrl pathToUrl(const QString &link) const override; 0172 0173 // Extracts the path component from the URL 0174 QString urlToPath(const QUrl &link) const override; 0175 0176 private: 0177 // Used in local parser 0178 class ParsedEntry 0179 { 0180 public: 0181 ParsedEntry(); 0182 0183 QString name; 0184 QList<QUrl> urls; 0185 int iconid; 0186 int indent; 0187 QString seealso; 0188 }; 0189 0190 //! Looks up fileName in the archive. 0191 bool hasFile(const QString &fileName) const; 0192 0193 //! Looks up fileName in the archive. 0194 bool ResolveObject(const QString &fileName, chmUnitInfo *ui) const; 0195 0196 //! Retrieves an uncompressed chunk of a file in the .chm. 0197 size_t RetrieveObject(const chmUnitInfo *ui, unsigned char *buffer, LONGUINT64 fileOffset, LONGINT64 bufferSize) const; 0198 0199 //! Encode the string with the currently selected text codec, if possible. Or return as-is, if not. 0200 inline QString encodeWithCurrentCodec(const QByteArray &str) const 0201 { 0202 return (m_textCodec ? m_textCodec->toUnicode(str.constData()) : QString::fromUtf8(str)); 0203 } 0204 0205 //! Encode the string with the currently selected text codec, if possible. Or return as-is, if not. 0206 inline QString encodeWithCurrentCodec(const char *str) const 0207 { 0208 return (m_textCodec ? m_textCodec->toUnicode(str) : QString::fromUtf8(str)); 0209 } 0210 0211 //! Encode the string from internal files with the currently selected text codec, if possible. 0212 //! Or return as-is, if not. 
0213 inline QString encodeInternalWithCurrentCodec(const QString &str) const 0214 { 0215 return (m_textCodecForSpecialFiles ? m_textCodecForSpecialFiles->toUnicode(qPrintable(str)) : str); 0216 } 0217 0218 //! Encode the string from internal files with the currently selected text codec, if possible. 0219 //! Or return as-is, if not. 0220 inline QString encodeInternalWithCurrentCodec(const char *str) const 0221 { 0222 return (m_textCodecForSpecialFiles ? m_textCodecForSpecialFiles->toUnicode(str) : QString::fromUtf8(str)); 0223 } 0224 0225 //! Helper. Translates from Win32 encodings to generic wxWidgets ones. 0226 const char *GetFontEncFromCharSet(const QString &font) const; 0227 0228 //! Parse the HHC or HHS file, and fill the context (asIndex is false) or index (asIndex is true) array. 0229 bool parseFileAndFillArray(const QString &file, QList<ParsedEntry> &data, bool asIndex) const; 0230 0231 bool getBinaryContent(QByteArray &data, const QString &url) const; 0232 bool getTextContent(QString &str, const QString &url, bool internal_encoding = false) const; 0233 0234 /*! 0235 * Parse binary TOC 0236 */ 0237 bool parseBinaryTOC(QList<EBookTocEntry> &toc) const; 0238 0239 //! btree string parser 0240 QString getBtreeString(const QByteArray &btidx, unsigned long *offset, unsigned short *spaceLeft) const; 0241 0242 /*! 0243 * Recursively parse and fill binary TOC 0244 */ 0245 bool RecurseLoadBTOC(const QByteArray &tocidx, const QByteArray &topics, const QByteArray &urltbl, const QByteArray &urlstr, const QByteArray &strings, int offset, QList<EBookTocEntry> &entries, int level) const; 0246 0247 /*! 
0248 * Helper procedure in TOC parsing, decodes the string between the quotes (first or last) with decoding HTML 0249 * entities like í 0250 */ 0251 int findStringInQuotes(const QString &tag, int offset, QString &value, bool firstquote, bool decodeentities) const; 0252 bool getInfoFromWindows(); 0253 bool getInfoFromSystem(); 0254 bool changeFileEncoding(const QString &qtencoding); 0255 bool guessTextEncoding(); 0256 void fillTopicsUrlMap(); 0257 bool hasOption(const QString &name) const; 0258 0259 // Members 0260 0261 //! Pointer to the chmlib structure 0262 chmFile *m_chmFile; 0263 0264 //! Opened file name 0265 QString m_filename; 0266 0267 //! Home url, got from CHM file 0268 QByteArray m_home; 0269 0270 //! Context tree filename. Got from CHM file 0271 QByteArray m_topicsFile; 0272 0273 //! Index filename. Got from CHM file 0274 QByteArray m_indexFile; 0275 0276 //! Chm Title. Got from CHM file 0277 QByteArray m_title; 0278 0279 // Localization stuff 0280 //! LCID from CHM file, used in encoding detection 0281 short m_detectedLCID; 0282 0283 //! font charset from CHM file, used in encoding detection 0284 QString m_font; 0285 0286 //! Chosen text codec 0287 QTextCodec *m_textCodec; 0288 QTextCodec *m_textCodecForSpecialFiles; 0289 0290 //! Current encoding 0291 QString m_currentEncoding; 0292 0293 //! TRUE if /#TOPICS, /#STRINGS, /#URLTBL and /#URLSTR are resolved, and the members below are valid 0294 bool m_lookupTablesValid; 0295 0296 //! pointer to /#TOPICS 0297 chmUnitInfo m_chmTOPICS; 0298 0299 //! pointer to /#STRINGS 0300 chmUnitInfo m_chmSTRINGS; 0301 0302 //! pointer to /#URLTBL 0303 chmUnitInfo m_chmURLTBL; 0304 0305 //! pointer to /#URLSTR 0306 chmUnitInfo m_chmURLSTR; 0307 0308 //! Indicates whether TOC, either binary or text, is available. 0309 bool m_tocAvailable; 0310 0311 //! Indicates whether index, either binary or text, is available. 0312 bool m_indexAvailable; 0313 0314 //! Map url->topic 0315 QMap<QUrl, QString> m_url2topics; 0316 0317 //! 
KCHMViewer debug options from environment 0318 QString m_envOptions; 0319 0320 //! HTML entity decoder 0321 HelperEntityDecoder m_htmlEntityDecoder; 0322 }; 0323 0324 #endif // EBOOK_CHM_H