File indexing completed on 2024-04-28 11:38:30

0001 /*
0002     This file is part of the KDE libraries
0003 
0004     Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
0005     Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
0006 
0007     This library is free software; you can redistribute it and/or
0008     modify it under the terms of the GNU Library General Public
0009     License as published by the Free Software Foundation; either
0010     version 2 of the License, or (at your option) any later version.
0011 
0012     This library is distributed in the hope that it will be useful,
0013     but WITHOUT ANY WARRANTY; without even the implied warranty of
0014     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0015     Library General Public License for more details.
0016 
0017     You should have received a copy of the GNU Library General Public License
0018     along with this library; see the file COPYING.LIB.  If not, write to
0019     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0020     Boston, MA 02110-1301, USA.
0021 
0022 */
0023 #ifndef KENCODINGDETECTOR_H
0024 #define KENCODINGDETECTOR_H
0025 
0026 #include <QString>
0027 
0028 class QTextCodec;
0029 class QTextDecoder;
0030 class KEncodingDetectorPrivate;
0031 
0032 /**
0033  * @short Provides encoding detection capabilities.
0034  *
0035  * Searches for an encoding declaration inside raw data -- meta and xml tags.
0036  * If it cannot find one, it uses heuristics for the specified language.
0037  *
0038  * If it finds unicode BOM marks, it changes the encoding regardless of what the user has specified.
0039  *
0040  * Intended lifetime of the object: one instance per document.
0041  *
0042  * Typical use:
0043  * \code
0044  * QByteArray data;
0045  * ...
0046  * KEncodingDetector detector;
0047  * detector.setAutoDetectLanguage(KEncodingDetector::Cyrillic);
0048  * QString out=detector.decode(data);
0049  * \endcode
0050  *
0051  *
0052  * Do not mix decode() with decodeWithBuffering()
0053  *
0054  * In short: guesses the encoding of a char array.
0055  *
0056  */
0057 class KEncodingDetector
0058 {
0059 public:
         /**
          * Tag passed to the constructor and to setEncoding(), and reported by
          * encodingChoiceSource().
          *
          * NOTE(review): judging by the enumerator names this records where the
          * current encoding choice came from; whether the order of the values
          * also expresses a priority is not visible in this header -- confirm
          * against the implementation.
          */
0060     enum EncodingChoiceSource {
0061         DefaultEncoding,
0062         AutoDetectedEncoding,
0063         BOM,
0064         EncodingFromXMLHeader,
0065         EncodingFromMetaTag,
0066         EncodingFromHTTPHeader,
0067         UserChosenEncoding
0068     };
0069 
         /**
          * Script (language group) selector used by setAutoDetectLanguage(),
          * autoDetectLanguage() and the static script helpers.
          *
          * NOTE(review): presumably selects which detection heuristics are applied;
          * the exact meaning of None and SemiautomaticDetection is not shown
          * in this header -- confirm against the implementation.
          */
0070     enum AutoDetectScript {
0071         None,
0072         SemiautomaticDetection,
0073         Arabic,
0074         Baltic,
0075         CentralEuropean,
0076         ChineseSimplified,
0077         ChineseTraditional,
0078         Cyrillic,
0079         Greek,
0080         Hebrew,
0081         Japanese,
0082         Korean,
0083         NorthernSaami,
0084         SouthEasternEurope,
0085         Thai,
0086         Turkish,
0087         Unicode,
0088         WesternEuropean
0089     };
0090 
0091     /**
0092      * Default codec is latin1 (as html spec says), EncodingChoiceSource is DefaultEncoding, AutoDetectScript is SemiautomaticDetection
0093      */
0094     KEncodingDetector();
0095 
0096     /**
0097      * Allows setting the default codec, EncodingChoiceSource and AutoDetectScript (which defaults to None here)
0098      */
0099     KEncodingDetector(QTextCodec *codec, EncodingChoiceSource source, AutoDetectScript script = None);
0100     ~KEncodingDetector();
0101 
0102     //const QTextCodec* codec() const;
0103 
0104     /**
0105     * @returns true if the specified encoding was recognized
0106     */
0107     bool setEncoding(const char *encoding, EncodingChoiceSource type);
0108 
0109     /**
0110     * Convenience method.
0111     * @returns mime name of detected encoding
0112     */
0113     const char *encoding() const;
0114 
         /**
          * NOTE(review): undocumented in the original header; presumably reports
          * whether the current encoding is a visually ordered one -- confirm against
          * the implementation.
          */
0115     bool visuallyOrdered() const;
0116 
0117 //     void setAutoDetectLanguage( const QString& );
0118 //     const QString& autoDetectLanguage() const;
0119 
         /**
          * Selects the script whose heuristics are used when no encoding declaration
          * is found in the data (see the class description and its usage example).
          */
0120     void setAutoDetectLanguage(AutoDetectScript);
         /**
          * @returns the AutoDetectScript currently in use -- presumably the value given
          * to the constructor or to setAutoDetectLanguage(); confirm against the implementation.
          */
0121     AutoDetectScript autoDetectLanguage() const;
0122 
         /**
          * @returns the EncodingChoiceSource recorded for the current encoding
          * (NOTE(review): how detection updates it is not visible in this header).
          */
0123     EncodingChoiceSource encodingChoiceSource() const;
0124 
0125     /**
0126     * The main class method
0127     *
0128     * Calls protected analyze() only the first time in the whole life of the object
0129     *
0130     * Replaces all null chars with spaces.
0131     */
0132     QString decode(const char *data, int len);
         /** Overload of decode() that takes the raw data as a QByteArray instead of pointer + length. */
0133     QString decode(const QByteArray &data);
0134 
0135     //* You don't need to call analyze() if you use this method.
0136     /**
0137     * Convenience method that uses buffering. It waits for full html head to be buffered
0138     * (i.e. calls analyze every time until it returns true).
0139     *
0140     * Replaces all null chars with spaces.
0141     *
0142     * @returns Decoded data, or empty string, if there was not enough data for accurate detection
0143     * @see flush()
0144     */
0145     QString decodeWithBuffering(const char *data, int len);
0146 
0147     /**
0148      * This method checks whether invalid characters were found
0149      * during a decoding operation.
0150      *
0151      * Note that this bit is never reset once invalid characters have been found.
0152      * To force a reset, either change the encoding using setEncoding() or call
0153      * resetDecoder()
0154      *
0155      * @returns a boolean reflecting said state.
0156      * @since 4.3
0157      * @see resetDecoder() setEncoding()
0158      */
0159     bool decodedInvalidCharacters() const;
0160 
0161     /**
0162      * Resets the decoder. Any stateful decoding information (such as resulting from previous calls
0163      * to decodeWithBuffering()) will be lost.
0164      * Also resets the state of decodedInvalidCharacters() as a side effect.
0165      *
0166      * @since 4.3
0167      * @see decodeWithBuffering() decodedInvalidCharacters()
0168      *
0169      */
0170     void resetDecoder();
0171 
0172     /**
0173     * Convenience method to be used with decodeWithBuffering(). Flushes buffer.
0174     * @see decodeWithBuffering()
0175     */
0176     QString flush();
0177 
0178     /**
0179      * Takes the lang name _after_ it was i18n()'ed
0180      */
0181     static AutoDetectScript scriptForName(const QString &lang);
         /**
          * NOTE(review): undocumented in the original header; presumably the inverse of
          * scriptForName(), i.e. returns the display name for a script -- confirm.
          */
0182     static QString nameForScript(AutoDetectScript);
         /**
          * NOTE(review): undocumented in the original header; presumably tells whether
          * detection heuristics exist for the given script -- confirm.
          */
0183     static bool hasAutoDetectionForScript(AutoDetectScript);
0184 
0185 protected:
0186     /**
0187      * This nice method will kill all 0 bytes (or double bytes)
0188      * and remember if this was a binary or not ;)
0189      *
0190      * Note that @p data is non-const: the buffer is modified in place.
0191      * NOTE(review): the meaning of the bool result is not stated here -- confirm.
0192      */
0190     bool processNull(char *data, int length);
0191 
0192     /**
0193      * Check if we are really utf8. Taken from kate
0194      *
0195      * @returns true if current encoding is utf8 and the text cannot be in this encoding
0196      *
0197      * Please somebody read https://en.wikipedia.org/wiki/UTF-8 and check this code...
0198      */
0199     bool errorsIfUtf8(const char *data, int length);
0200 
0201     /**
0202     * Analyze text data.
0203     * @returns true if there was enough data for accurate detection
0204     */
0205     bool analyze(const char *data, int len);
0206 
0207     /**
0208     * @returns QTextDecoder for detected encoding
0209     */
0210     QTextDecoder *decoder();
0211 
0212 private:
         // Pointer to the forward-declared private data class; the pointer itself is const.
         // NOTE(review): the class holds this raw pointer and declares a destructor but no
         // copy operations -- presumably instances are not meant to be copied; confirm
         // ownership of d against the implementation.
0213     KEncodingDetectorPrivate *const d;
0214 };
0215 
0216 #endif