// File indexing completed on 2024-09-15 11:55:02
0001 /* 0002 This file is part of the KDE libraries 0003 0004 SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com> 0005 0006 SPDX-License-Identifier: LGPL-2.0-or-later 0007 */ 0008 #ifndef KENCODINGPROBER_H 0009 #define KENCODINGPROBER_H 0010 0011 // enable debug of private probers 0012 // #define DEBUG_PROBE 0013 0014 #include <kcodecs_export.h> 0015 0016 #ifdef DEBUG_PROBE 0017 #include <QDebug> 0018 #endif 0019 0020 #include <QCoreApplication> 0021 #include <QString> 0022 #include <memory> 0023 0024 class KEncodingProberPrivate; 0025 0026 /** 0027 * @class KEncodingProber kencodingprober.h KEncodingProber 0028 * 0029 * @short Provides encoding detection(probe) capabilities. 0030 * 0031 * Probe the encoding of raw data only. 0032 * In the case it can't find it, return the most possible encoding it guessed. 0033 * 0034 * Always do Unicode probe regardless the ProberType 0035 * 0036 * Feed data to it several times with feed() until ProberState changes to FoundIt/NotMe, 0037 * or confidence() returns a value you find acceptable. 0038 * 0039 * Intended lifetime of the object: one instance per ProberType. 0040 * 0041 * Typical use: 0042 * \code 0043 * QByteArray data, moredata; 0044 * ... 0045 * KEncodingProber prober(KEncodingProber::Chinese); 0046 * prober.feed(data); 0047 * prober.feed(moredata); 0048 * if (prober.confidence() > 0.6) 0049 * encoding = prober.encoding(); 0050 * \endcode 0051 * 0052 * At least 256 characters are needed to change the ProberState from Probing to FoundIt. 0053 * If you don't have so many characters to probe, 0054 * decide whether to accept the encoding it guessed so far according to the Confidence by yourself. 
0055 * 0056 * @short Guess encoding of char array 0057 * 0058 */ 0059 class KCODECS_EXPORT KEncodingProber 0060 { 0061 Q_DECLARE_TR_FUNCTIONS(KEncodingProber) 0062 0063 public: 0064 enum ProberState { 0065 FoundIt, /**< Sure find the encoding */ 0066 NotMe, /**< Sure not included in current ProberType's all supported encodings */ 0067 Probing, /**< Need more data to make a decision */ 0068 }; 0069 0070 enum ProberType { 0071 None, 0072 Universal, 0073 Arabic, 0074 Baltic, 0075 CentralEuropean, 0076 ChineseSimplified, 0077 ChineseTraditional, 0078 Cyrillic, 0079 Greek, 0080 Hebrew, 0081 Japanese, 0082 Korean, 0083 NorthernSaami, 0084 Other, 0085 SouthEasternEurope, 0086 Thai, 0087 Turkish, 0088 Unicode, 0089 WesternEuropean, 0090 }; 0091 0092 /** 0093 * Default ProberType is Universal(detect all possible encodings) 0094 */ 0095 KEncodingProber(ProberType proberType = Universal); 0096 0097 ~KEncodingProber(); 0098 0099 KEncodingProber(const KEncodingProber &) = delete; 0100 KEncodingProber &operator=(const KEncodingProber &) = delete; 0101 0102 /** 0103 * reset the prober's internal state and data. 0104 */ 0105 void reset(); 0106 0107 /** 0108 * The main class method 0109 * 0110 * feed data to the prober 0111 * 0112 * @returns the ProberState after probing the fed data. 
0113 */ 0114 ProberState feed(const QByteArray &data); 0115 ProberState feed(const char *data, int len); 0116 0117 /** 0118 * @returns the prober's current ProberState 0119 * 0120 */ 0121 ProberState state() const; 0122 0123 /** 0124 * @returns a QByteArray with the name of the best encoding it has guessed so far 0125 * @since 4.2.2 0126 */ 0127 QByteArray encoding() const; 0128 0129 /** 0130 * @returns the confidence(sureness) of encoding it guessed so far (0.0 ~ 0.99), not very reliable for single byte encodings 0131 */ 0132 float confidence() const; 0133 0134 ProberType proberType() const; 0135 0136 /** 0137 * change current prober's ProberType and reset the prober 0138 */ 0139 void setProberType(ProberType proberType); 0140 0141 /** 0142 * @return the ProberType for lang (eg. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified 0143 */ 0144 static ProberType proberTypeForName(const QString &lang); 0145 0146 /** 0147 * map ProberType to language string 0148 */ 0149 static QString nameForProberType(ProberType proberType); 0150 0151 private: 0152 std::unique_ptr<KEncodingProberPrivate> const d; 0153 }; 0154 0155 #endif