File indexing completed on 2023-09-24 04:06:34
0001 /* 0002 This file is part of the KDE libraries 0003 0004 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 0005 Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net) 0006 0007 This library is free software; you can redistribute it and/or 0008 modify it under the terms of the GNU Library General Public 0009 License as published by the Free Software Foundation; either 0010 version 2 of the License, or (at your option) any later version. 0011 0012 This library is distributed in the hope that it will be useful, 0013 but WITHOUT ANY WARRANTY; without even the implied warranty of 0014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0015 Library General Public License for more details. 0016 0017 You should have received a copy of the GNU Library General Public License 0018 along with this library; see the file COPYING.LIB. If not, write to 0019 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 0020 Boston, MA 02110-1301, USA. 0021 0022 */ 0023 #ifndef KENCODINGDETECTOR_H 0024 #define KENCODINGDETECTOR_H 0025 0026 #include <QString> 0027 0028 class QTextCodec; 0029 class QTextDecoder; 0030 class KEncodingDetectorPrivate; 0031 0032 /** 0033 * @short Provides encoding detection capabilities. 0034 * 0035 * Searches for encoding declaration inside raw data -- meta and xml tags. 0036 * In the case it can't find it, uses heuristics for specified language. 0037 * 0038 * If it finds unicode BOM marks, it changes encoding regardless of what the user has told 0039 * 0040 * Intended lifetime of the object: one instance per document. 0041 * 0042 * Typical use: 0043 * \code 0044 * QByteArray data; 0045 * ... 0046 * KEncodingDetector detector; 0047 * detector.setAutoDetectLanguage(KEncodingDetector::Cyrillic); 0048 * QString out=detector.decode(data); 0049 * \endcode 0050 * 0051 * 0052 * Do not mix decode() with decodeWithBuffering() 0053 * 0054 * @short Guess encoding of char array 0055 * 0056 */ 0057 class KEncodingDetector 0058 { 0059 public: 0060 enum EncodingChoiceSource { 0061 DefaultEncoding, 0062 AutoDetectedEncoding, 0063 BOM, 0064 EncodingFromXMLHeader, 0065 EncodingFromMetaTag, 0066 EncodingFromHTTPHeader, 0067 UserChosenEncoding 0068 }; 0069 0070 enum AutoDetectScript { 0071 None, 0072 SemiautomaticDetection, 0073 Arabic, 0074 Baltic, 0075 CentralEuropean, 0076 ChineseSimplified, 0077 ChineseTraditional, 0078 Cyrillic, 0079 Greek, 0080 Hebrew, 0081 Japanese, 0082 Korean, 0083 NorthernSaami, 0084 SouthEasternEurope, 0085 Thai, 0086 Turkish, 0087 Unicode, 0088 WesternEuropean 0089 }; 0090 0091 /** 0092 * Default codec is latin1 (as html spec says), EncodingChoiceSource is default, AutoDetectScript=Semiautomatic 0093 */ 0094 KEncodingDetector(); 0095 0096 /** 0097 * Allows to set Default codec, EncodingChoiceSource, AutoDetectScript 0098 */ 0099 KEncodingDetector(QTextCodec *codec, EncodingChoiceSource source, AutoDetectScript script = None); 0100 ~KEncodingDetector(); 0101 0102 //const QTextCodec* codec() const; 0103 0104 /** 0105 * @returns true if specified encoding was recognized 0106 */ 0107 bool setEncoding(const char *encoding, EncodingChoiceSource type); 0108 0109 /** 0110 * Convenience method. 0111 * @returns mime name of detected encoding 0112 */ 0113 const char *encoding() const; 0114 0115 bool visuallyOrdered() const; 0116 0117 // void setAutoDetectLanguage( const QString& ); 0118 // const QString& autoDetectLanguage() const; 0119 0120 void setAutoDetectLanguage(AutoDetectScript); 0121 AutoDetectScript autoDetectLanguage() const; 0122 0123 EncodingChoiceSource encodingChoiceSource() const; 0124 0125 /** 0126 * The main class method 0127 * 0128 * Calls protected analyze() only the first time of the whole object life 0129 * 0130 * Replaces all null chars with spaces. 0131 */ 0132 QString decode(const char *data, int len); 0133 QString decode(const QByteArray &data); 0134 0135 //* You don't need to call analyze() if you use this method. 0136 /** 0137 * Convenience method that uses buffering. It waits for full html head to be buffered 0138 * (i.e. calls analyze every time until it returns true). 0139 * 0140 * Replaces all null chars with spaces. 0141 * 0142 * @returns Decoded data, or empty string, if there was not enough data for accurate detection 0143 * @see flush() 0144 */ 0145 QString decodeWithBuffering(const char *data, int len); 0146 0147 /** 0148 * This method checks whether invalid characters were found 0149 * during a decoding operation. 0150 * 0151 * Note that this bit is never reset once invalid characters have been found. 0152 * To force a reset, either change the encoding using setEncoding() or call 0153 * resetDecoder() 0154 * 0155 * @returns a boolean reflecting said state. 0156 * @since 4.3 0157 * @see resetDecoder() setEncoding() 0158 */ 0159 bool decodedInvalidCharacters() const; 0160 0161 /** 0162 * Resets the decoder. Any stateful decoding information (such as resulting from previous calls 0163 * to decodeWithBuffering()) will be lost. 0164 * Will Reset the state of decodedInvalidCharacters() as a side effect. 0165 * 0166 * @since 4.3 0167 * @see decodeWithBuffering() decodedInvalidCharacters() 0168 * 0169 */ 0170 void resetDecoder(); 0171 0172 /** 0173 * Convenience method to be used with decodeForHtml. Flushes buffer. 0174 * @see decodeForHtml() 0175 */ 0176 QString flush(); 0177 0178 /** 0179 * Takes lang name _after_ it were i18n()'ed 0180 */ 0181 static AutoDetectScript scriptForName(const QString &lang); 0182 static QString nameForScript(AutoDetectScript); 0183 static bool hasAutoDetectionForScript(AutoDetectScript); 0184 0185 protected: 0186 /** 0187 * This nice method will kill all 0 bytes (or double bytes) 0188 * and remember if this was a binary or not ;) 0189 */ 0190 bool processNull(char *data, int length); 0191 0192 /** 0193 * Check if we are really utf8. Taken from kate 0194 * 0195 * @returns true if current encoding is utf8 and the text cannot be in this encoding 0196 * 0197 * Please somebody read https://en.wikipedia.org/wiki/UTF-8 and check this code... 0198 */ 0199 bool errorsIfUtf8(const char *data, int length); 0200 0201 /** 0202 * Analyze text data. 0203 * @returns true if there was enough data for accurate detection 0204 */ 0205 bool analyze(const char *data, int len); 0206 0207 /** 0208 * @returns QTextDecoder for detected encoding 0209 */ 0210 QTextDecoder *decoder(); 0211 0212 private: 0213 KEncodingDetectorPrivate *const d; 0214 }; 0215 0216 #endif