File indexing completed on 2024-10-06 03:37:44
0001 /* 0002 This file is part of the KDE libraries 0003 0004 SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com> 0005 0006 SPDX-License-Identifier: LGPL-2.0-or-later 0007 */ 0008 #ifndef KENCODINGPROBER_H 0009 #define KENCODINGPROBER_H 0010 0011 // enable debug of private probers 0012 // #define DEBUG_PROBE 0013 0014 #include <kcodecs_export.h> 0015 0016 #ifdef DEBUG_PROBE 0017 #include <QDebug> 0018 #endif 0019 0020 #include <QCoreApplication> 0021 #include <QString> 0022 #include <memory> 0023 0024 class KEncodingProberPrivate; 0025 0026 /** 0027 * @class KEncodingProber kencodingprober.h KEncodingProber 0028 * 0029 * @short Provides encoding detection(probe) capabilities. 0030 * 0031 * Probe the encoding of raw data only. 0032 * In the case it can't find it, return the most possible encoding it guessed. 0033 * 0034 * Always do Unicode probe regardless the ProberType 0035 * 0036 * Feed data to it several times with feed() until ProberState changes to FoundIt/NotMe, 0037 * or confidence() returns a value you find acceptable. 0038 * 0039 * Intended lifetime of the object: one instance per ProberType. 0040 * 0041 * Typical use: 0042 * \code 0043 * QByteArray data, moredata; 0044 * ... 0045 * KEncodingProber prober(KEncodingProber::Chinese); 0046 * prober.feed(data); 0047 * prober.feed(moredata); 0048 * if (prober.confidence() > 0.6) 0049 * encoding = prober.encoding(); 0050 * \endcode 0051 * 0052 * At least 256 characters are needed to change the ProberState from Probing to FoundIt. 0053 * If you don't have so many characters to probe, 0054 * decide whether to accept the encoding it guessed so far according to the Confidence by yourself. 0055 * 0056 * @short Guess encoding of char array 0057 * 0058 */ 0059 class KCODECS_EXPORT KEncodingProber 0060 { 0061 Q_DECLARE_TR_FUNCTIONS(KEncodingProber) 0062 0063 public: 0064 enum ProberState { 0065 FoundIt, /**< Sure find the encoding */ 0066 NotMe, /**< Sure not included in current ProberType's all supported encodings */ 0067 Probing, /**< Need more data to make a decision */ 0068 }; 0069 0070 enum ProberType { 0071 None, 0072 Universal, 0073 Arabic, 0074 Baltic, 0075 CentralEuropean, 0076 ChineseSimplified, 0077 ChineseTraditional, 0078 Cyrillic, 0079 Greek, 0080 Hebrew, 0081 Japanese, 0082 Korean, 0083 NorthernSaami, 0084 Other, 0085 SouthEasternEurope, 0086 Thai, 0087 Turkish, 0088 Unicode, 0089 WesternEuropean, 0090 }; 0091 0092 /** 0093 * Default ProberType is Universal(detect all possible encodings) 0094 */ 0095 KEncodingProber(ProberType proberType = Universal); 0096 0097 ~KEncodingProber(); 0098 0099 KEncodingProber(const KEncodingProber &) = delete; 0100 KEncodingProber &operator=(const KEncodingProber &) = delete; 0101 0102 /** 0103 * reset the prober's internal state and data. 0104 */ 0105 void reset(); 0106 0107 /** 0108 * The main class method 0109 * 0110 * feed data to the prober 0111 * 0112 * @returns the ProberState after probing the fed data. 0113 */ 0114 ProberState feed(QByteArrayView data); 0115 // for API compatibility 0116 inline ProberState feed(const char *data, qsizetype len) 0117 { 0118 return feed({data, len}); 0119 } 0120 0121 /** 0122 * @returns the prober's current ProberState 0123 * 0124 */ 0125 ProberState state() const; 0126 0127 /** 0128 * @returns a QByteArray with the name of the best encoding it has guessed so far 0129 * @since 4.2.2 0130 */ 0131 QByteArray encoding() const; 0132 0133 /** 0134 * @returns the confidence(sureness) of encoding it guessed so far (0.0 ~ 0.99), not very reliable for single byte encodings 0135 */ 0136 float confidence() const; 0137 0138 ProberType proberType() const; 0139 0140 /** 0141 * change current prober's ProberType and reset the prober 0142 */ 0143 void setProberType(ProberType proberType); 0144 0145 /** 0146 * @return the ProberType for lang (e.g. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified 0147 */ 0148 static ProberType proberTypeForName(const QString &lang); 0149 0150 /** 0151 * map ProberType to language string 0152 */ 0153 static QString nameForProberType(ProberType proberType); 0154 0155 private: 0156 std::unique_ptr<KEncodingProberPrivate> const d; 0157 }; 0158 0159 #endif