File indexing completed on 2024-06-16 04:20:01

0001 /*
0002     Kchmviewer - a CHM and EPUB file viewer with broad language support
0003     SPDX-FileCopyrightText: 2004-2014 George Yunaev gyunaev@ulduzsoft.com
0004 
0005     SPDX-License-Identifier: GPL-3.0-or-later
0006 */
0007 
0008 #ifndef EBOOK_CHM_H
0009 #define EBOOK_CHM_H
0010 
0011 #include <QMap>
0012 #include <QTextCodec>
0013 
0014 // Enable Unicode use in libchm
0015 #if defined(WIN32)
0016 #define PPC_BSTR
0017 #endif
0018 #include <chm_lib.h>
0019 
0020 #include "ebook.h"
0021 #include "helper_entitydecoder.h"
0022 
0023 class EBook_CHM : public EBook // EBook implementation for CHM archives, accessed through chmlib (chm_lib.h)
0024 {
0025 public:
0026     EBook_CHM();
0027     ~EBook_CHM() override;
0028 
0029     /*!
0030      * \brief Attempts to load a CHM file.
0031      * \param archiveName Name of the file to load.
0032      * \return true on success, false on failure.
0033      *
0034      * Loads a CHM file. For CHM files it could internally load more than one file,
0035      * if files linked to this one are present locally (like MSDN).
0036      * \ingroup init
0037      */
0038     bool load(const QString &archiveName) override;
0039 
0040     /*!
0041      * \brief Closes all the files, and frees the appropriate data.
0042      * \ingroup init
0043      */
0044     void close() override;
0045 
0046     /*!
0047      * \brief Gets the title name of the opened ebook.
0048      * \return The name of the opened document, or an empty string if no ebook has been loaded.
0049      * \ingroup information
0050      */
0051     QString title() const override;
0052 
0053     /*!
0054      * \brief Gets the default URL of the e-book which should be opened when the book is first opened
0055      *
0056      * \return The home page name, with a '/' added in front and relative to
0057      *         the root of the archive filesystem. If no book has been opened, returns "/".
0058      * \ingroup information
0059      */
0060     QUrl homeUrl() const override;
0061 
0062     /*!
0063      * \brief Checks whether the specific feature is present in this file.
0064      * \return true if it is available; false otherwise.
0065      * \ingroup information
0066      */
0067     bool hasFeature(Feature code) const override;
0068 
0069     /*!
0070      * \brief Parses and fills up the Table of Contents (TOC)
0071      * \param toc The container which will store the parsed results.
0072      *            Will be cleared before parsing.
0073      * \return true if the tree is present and parsed successfully, false otherwise.
0074      *         The parser is built to be error-tolerant, however it can still abort with qFatal()
0075      *         on really buggy files; please report a bug if the file opens fine under Windows.
0076      * \ingroup fileparsing
0077      */
0078     bool getTableOfContents(QList<EBookTocEntry> &toc) const override;
0079 
0080     /*!
0081      * \brief Parses the index table
0082      * \param index The container which will store the parsed results.
0083      *              Will be cleared before parsing.
0084      * \return true if the tree is present and parsed successfully, false otherwise.
0085      *         The parser is built to be error-tolerant, however it can still abort with qFatal()
0086      *         on really buggy CHM files; so far it never happened on indexes.
0087      * \ingroup fileparsing
0088      */
0089     bool getIndex(QList<EBookIndexEntry> &index) const override;
0090 
0091     /*!
0092      * \brief Retrieves the content associated with the url from the current ebook as QString.
0093      * \param str A string where the retrieved content should be stored.
0094      * \param url A URL in the CHM file to retrieve content from. Must be absolute.
0095      * \return true if the content is successfully received; false otherwise. Note content may be an empty string.
0096      *
0097      * This function retrieves the file content (mostly for HTML pages) from the ebook. Because the content
0098      * in a CHM file might not be stored in Unicode, it will be recoded according to the current encoding.
0099      * Do not use for binary data.
0100      *
0101      * \sa setCurrentEncoding() currentEncoding() getFileContentAsBinary()
0102      * \ingroup dataretrieve
0103      */
0104     bool getFileContentAsString(QString &str, const QUrl &url) const override;
0105 
0106     /*!
0107      * \brief Retrieves the content from url in the current CHM file to a QByteArray.
0108      * \param data A data array where the retrieved content should be stored.
0109      * \param url A URL in the CHM file to retrieve content from. Must be absolute.
0110      * \return true if the content is successfully received; false otherwise.
0111      *
0112      * This function retrieves the file content from the CHM archive opened by the load()
0113      * function. The content is not encoded.
0114      *
0115      * \sa getFileContentAsString()
0116      * \ingroup dataretrieve
0117      */
0118     bool getFileContentAsBinary(QByteArray &data, const QUrl &url) const override;
0119 
0120     /*!
0121      * \brief Retrieves the content size.
0122      * \param url A URL in the ebook file whose content size is requested. Must be absolute.
0123      * \return the size; -1 in case of error.
0124      *
0125      * \ingroup dataretrieve
0126      */
0127     virtual int getContentSize(const QString &url);
0128 
0129     /*!
0130      * \brief Obtains the list of all the files (URLs) in the current ebook archive. This is used in search
0131      * and to dump the e-book content.
0132      * \param files An array to store the list of URLs (file names) present in the CHM archive.
0133      * \return true if the enumeration succeeded; false otherwise (I could hardly imagine a reason).
0134      *
0135      * \ingroup dataretrieve
0136      */
0137     bool enumerateFiles(QList<QUrl> &files) override;
0138 
0139     /*!
0140      * \brief Gets the title of the page referenced by url.
0141      * \param url A URL in the ebook file to get the title from. Must be absolute.
0142      * \return The title, or QString() if the URL cannot be found or is not an HTML page.
0143      *
0144      * \ingroup dataretrieve
0145      */
0146     QString getTopicByUrl(const QUrl &url) override;
0147 
0148     /*!
0149      * \brief Gets the current ebook encoding (set or autodetected) as a Qt codec name
0150      * \return The current encoding.
0151      *
0152      * \ingroup encoding
0153      */
0154     QString currentEncoding() const override;
0155 
0156     /*!
0157      * \brief Sets the ebook encoding to use for TOC and content
0158      * \param encoding An encoding to use.
0159      * \return NOTE(review): the meaning of the bool result is not visible in this header; presumably true on success - confirm.
0160      * \ingroup encoding
0161      */
0162     bool setCurrentEncoding(const char *encoding) override;
0163 
0164     /*!
0165      * \brief Checks if this kind of URL is supported by the ebook format (i.e. could be passed to ebook functions)
0166      * \param url The URL to check
0167      */
0168     bool isSupportedUrl(const QUrl &url) override;
0169 
0170     // Converts the string to the ebook-specific URL format
0171     QUrl pathToUrl(const QString &link) const override;
0172 
0173     // Extracts the path component from the URL
0174     QString urlToPath(const QUrl &link) const override;
0175 
0176 private:
0177     // Entry record used by the local parser (see parseFileAndFillArray())
0178     class ParsedEntry
0179     {
0180     public:
0181         ParsedEntry();
0182 
0183         QString name;
0184         QList<QUrl> urls;
0185         int iconid;
0186         int indent;
0187         QString seealso;
0188     };
0189 
0190     //! Looks up fileName in the archive; the bool result presumably reports whether it was found.
0191     bool hasFile(const QString &fileName) const;
0192 
0193     //! Looks up fileName in the archive; ui receives the chmUnitInfo (presumably filled only on success - confirm).
0194     bool ResolveObject(const QString &fileName, chmUnitInfo *ui) const;
0195 
0196     //! Retrieves an uncompressed chunk of a file in the .chm.
0197     size_t RetrieveObject(const chmUnitInfo *ui, unsigned char *buffer, LONGUINT64 fileOffset, LONGINT64 bufferSize) const;
0198 
0199     //! Converts str to Unicode with the currently selected text codec; decodes it as UTF-8 when no codec is set.
0200     inline QString encodeWithCurrentCodec(const QByteArray &str) const
0201     {
0202         return (m_textCodec ? m_textCodec->toUnicode(str.constData()) : QString::fromUtf8(str));
0203     }
0204 
0205     //! Converts str to Unicode with the currently selected text codec; decodes it as UTF-8 when no codec is set.
0206     inline QString encodeWithCurrentCodec(const char *str) const
0207     {
0208         return (m_textCodec ? m_textCodec->toUnicode(str) : QString::fromUtf8(str));
0209     }
0210 
0211     //! Converts a string taken from internal files with the special-files codec (m_textCodecForSpecialFiles), if set.
0212     //! Otherwise returns the string unchanged.
0213     inline QString encodeInternalWithCurrentCodec(const QString &str) const
0214     {
0215         return (m_textCodecForSpecialFiles ? m_textCodecForSpecialFiles->toUnicode(qPrintable(str)) : str);
0216     }
0217 
0218     //! Converts a string taken from internal files with the special-files codec (m_textCodecForSpecialFiles), if set.
0219     //! Otherwise decodes it as UTF-8.
0220     inline QString encodeInternalWithCurrentCodec(const char *str) const
0221     {
0222         return (m_textCodecForSpecialFiles ? m_textCodecForSpecialFiles->toUnicode(str) : QString::fromUtf8(str));
0223     }
0224 
0225     //! Helper. Translates from Win32 encodings to generic wxWidgets ones. NOTE(review): the wxWidgets mention looks stale in this Qt-based class - confirm.
0226     const char *GetFontEncFromCharSet(const QString &font) const;
0227 
0228     //! Parse the HHC or HHS file, and fill the context (asIndex is false) or index (asIndex is true) array.
0229     bool parseFileAndFillArray(const QString &file, QList<ParsedEntry> &data, bool asIndex) const;
0230 
0231     bool getBinaryContent(QByteArray &data, const QString &url) const;
0232     bool getTextContent(QString &str, const QString &url, bool internal_encoding = false) const;
0233 
0234     /*!
0235      * Parse binary TOC
0236      */
0237     bool parseBinaryTOC(QList<EBookTocEntry> &toc) const;
0238 
0239     //! btree string parser
0240     QString getBtreeString(const QByteArray &btidx, unsigned long *offset, unsigned short *spaceLeft) const;
0241 
0242     /*!
0243      * Recursively parse and fill binary TOC
0244      */
0245     bool RecurseLoadBTOC(const QByteArray &tocidx, const QByteArray &topics, const QByteArray &urltbl, const QByteArray &urlstr, const QByteArray &strings, int offset, QList<EBookTocEntry> &entries, int level) const;
0246 
0247     /*!
0248      * Helper procedure in TOC parsing; decodes the string between the quotes (first or last), including
0249      * decoding of HTML entities like &iacute;
0250      */
0251     int findStringInQuotes(const QString &tag, int offset, QString &value, bool firstquote, bool decodeentities) const;
0252     bool getInfoFromWindows();
0253     bool getInfoFromSystem();
0254     bool changeFileEncoding(const QString &qtencoding);
0255     bool guessTextEncoding();
0256     void fillTopicsUrlMap();
0257     bool hasOption(const QString &name) const;
0258 
0259     // Members
0260 
0261     //! Pointer to the chmlib structure
0262     chmFile *m_chmFile;
0263 
0264     //! Opened file name
0265     QString m_filename;
0266 
0267     //! Home url, got from CHM file
0268     QByteArray m_home;
0269 
0270     //! Context tree filename. Got from CHM file
0271     QByteArray m_topicsFile;
0272 
0273     //! Index filename. Got from CHM file
0274     QByteArray m_indexFile;
0275 
0276     //! Chm Title. Got from CHM file
0277     QByteArray m_title;
0278 
0279     // Localization stuff
0280     //! LCID from CHM file, used in encoding detection
0281     short m_detectedLCID;
0282 
0283     //! Font charset from CHM file, used in encoding detection
0284     QString m_font;
0285 
0286     //! Chosen text codec (used by encodeWithCurrentCodec())
0287     QTextCodec *m_textCodec;
0288     QTextCodec *m_textCodecForSpecialFiles; //!< Codec used by encodeInternalWithCurrentCodec()
0289 
0290     //! Current encoding
0291     QString m_currentEncoding;
0292 
0293     //! true if /#TOPICS, /#STRINGS, /#URLTBL and /#URLSTR are resolved, and the members below are valid
0294     bool m_lookupTablesValid;
0295 
0296     //! Unit info for /#TOPICS
0297     chmUnitInfo m_chmTOPICS;
0298 
0299     //! Unit info for /#STRINGS
0300     chmUnitInfo m_chmSTRINGS;
0301 
0302     //! Unit info for /#URLTBL
0303     chmUnitInfo m_chmURLTBL;
0304 
0305     //! Unit info for /#URLSTR
0306     chmUnitInfo m_chmURLSTR;
0307 
0308     //! Indicates whether TOC, either binary or text, is available.
0309     bool m_tocAvailable;
0310 
0311     //! Indicates whether index, either binary or text, is available.
0312     bool m_indexAvailable;
0313 
0314     //! Map url->topic
0315     QMap<QUrl, QString> m_url2topics;
0316 
0317     //! KCHMViewer debug options from environment
0318     QString m_envOptions;
0319 
0320     //! HTML entity decoder
0321     HelperEntityDecoder m_htmlEntityDecoder;
0322 };
0323 
0324 #endif // EBOOK_CHM_H