Warning, file /frameworks/kcodecs/src/kencodingprober.cpp was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).
0001 /* 0002 This file is part of the KDE libraries 0003 0004 SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com> 0005 0006 SPDX-License-Identifier: LGPL-2.0-or-later 0007 */ 0008 0009 #include "kencodingprober.h" 0010 0011 #include "probers/ChineseGroupProber.h" 0012 #include "probers/JapaneseGroupProber.h" 0013 #include "probers/UnicodeGroupProber.h" 0014 #include "probers/nsCharSetProber.h" 0015 #include "probers/nsMBCSGroupProber.h" 0016 #include "probers/nsSBCSGroupProber.h" 0017 #include "probers/nsUniversalDetector.h" 0018 0019 #include <string.h> 0020 0021 class KEncodingProberPrivate 0022 { 0023 public: 0024 KEncodingProberPrivate() 0025 : mProber(nullptr) 0026 , mStart(true) 0027 { 0028 } 0029 ~KEncodingProberPrivate() 0030 { 0031 delete mProber; 0032 } 0033 void setProberType(KEncodingProber::ProberType pType) 0034 { 0035 mProberType = pType; 0036 /* handle multi-byte encodings carefully , because they're hard to detect, 0037 * and have to use some Stastics methods. 0038 * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok, 0039 * because encoding state machine can detect many such encodings. 0040 */ 0041 0042 delete mProber; 0043 0044 switch (mProberType) { 0045 case KEncodingProber::None: 0046 mProber = nullptr; 0047 break; 0048 case KEncodingProber::Arabic: 0049 case KEncodingProber::Baltic: 0050 case KEncodingProber::CentralEuropean: 0051 case KEncodingProber::Cyrillic: 0052 case KEncodingProber::Greek: 0053 case KEncodingProber::Hebrew: 0054 case KEncodingProber::NorthernSaami: 0055 case KEncodingProber::Other: 0056 case KEncodingProber::SouthEasternEurope: 0057 case KEncodingProber::Thai: 0058 case KEncodingProber::Turkish: 0059 case KEncodingProber::WesternEuropean: 0060 mProber = new kencodingprober::nsSBCSGroupProber(); 0061 break; 0062 case KEncodingProber::ChineseSimplified: 0063 case KEncodingProber::ChineseTraditional: 0064 mProber = new kencodingprober::ChineseGroupProber(); 0065 break; 0066 case KEncodingProber::Japanese: 0067 mProber = new kencodingprober::JapaneseGroupProber(); 0068 break; 0069 case KEncodingProber::Korean: 0070 mProber = new kencodingprober::nsMBCSGroupProber(); 0071 break; 0072 case KEncodingProber::Unicode: 0073 mProber = new kencodingprober::UnicodeGroupProber(); 0074 break; 0075 case KEncodingProber::Universal: 0076 mProber = new kencodingprober::nsUniversalDetector(); 0077 break; 0078 default: 0079 mProber = nullptr; 0080 } 0081 } 0082 void unicodeTest(const char *aBuf, int aLen) 0083 { 0084 if (mStart) { 0085 mStart = false; 0086 if (aLen > 3) { 0087 switch (aBuf[0]) { 0088 case '\xEF': 0089 if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) 0090 // EF BB BF UTF-8 encoded BOM 0091 { 0092 mProberState = KEncodingProber::FoundIt; 0093 } 0094 break; 0095 case '\xFE': 0096 if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 0097 // FE FF 00 00 UCS-4, unusual octet order BOM (3412) 0098 { 0099 mProberState = KEncodingProber::FoundIt; 0100 } else if ('\xFF' == aBuf[1]) 0101 // FE FF UTF-16, big endian BOM 0102 { 0103 mProberState = KEncodingProber::FoundIt; 0104 } 0105 break; 0106 case '\x00': 0107 if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3])) 0108 // 00 00 FE FF UTF-32, big-endian BOM 0109 { 0110 mProberState = KEncodingProber::FoundIt; 0111 } else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3])) 0112 // 00 00 FF FE UCS-4, unusual octet order BOM (2143) 0113 { 0114 mProberState = KEncodingProber::FoundIt; 0115 } 0116 break; 0117 case '\xFF': 0118 if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 0119 // FF FE 00 00 UTF-32, little-endian BOM 0120 { 0121 mProberState = KEncodingProber::FoundIt; 0122 } else if ('\xFE' == aBuf[1]) 0123 // FF FE UTF-16, little endian BOM 0124 { 0125 mProberState = KEncodingProber::FoundIt; 0126 } 0127 break; 0128 } // switch 0129 } 0130 } 0131 } 0132 KEncodingProber::ProberType mProberType; 0133 KEncodingProber::ProberState mProberState; 0134 kencodingprober::nsCharSetProber *mProber; 0135 bool mStart; 0136 }; 0137 0138 KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType) 0139 : d(new KEncodingProberPrivate()) 0140 { 0141 setProberType(proberType); 0142 } 0143 0144 KEncodingProber::~KEncodingProber() = default; 0145 0146 void KEncodingProber::reset() 0147 { 0148 d->mProberState = KEncodingProber::Probing; 0149 d->mStart = true; 0150 } 0151 0152 KEncodingProber::ProberState KEncodingProber::feed(const QByteArray &data) 0153 { 0154 return feed(data.data(), data.size()); 0155 } 0156 0157 KEncodingProber::ProberState KEncodingProber::feed(const char *data, int len) 0158 { 0159 if (!d->mProber) { 0160 return d->mProberState; 0161 } 0162 if (d->mProberState == Probing) { 0163 if (d->mStart) { 0164 d->unicodeTest(data, len); 0165 if (d->mProberState == FoundIt) { 0166 return d->mProberState; 0167 } 0168 } 0169 d->mProber->HandleData(data, len); 0170 switch (d->mProber->GetState()) { 0171 case kencodingprober::eNotMe: 0172 d->mProberState = NotMe; 0173 break; 0174 case kencodingprober::eFoundIt: 0175 d->mProberState = FoundIt; 0176 break; 0177 default: 0178 d->mProberState = Probing; 0179 break; 0180 } 0181 } 0182 #ifdef DEBUG_PROBE 0183 d->mProber->DumpStatus(); 0184 #endif 0185 return d->mProberState; 0186 } 0187 0188 KEncodingProber::ProberState KEncodingProber::state() const 0189 { 0190 return d->mProberState; 0191 } 0192 0193 QByteArray KEncodingProber::encoding() const 0194 { 0195 if (!d->mProber) { 0196 return QByteArray("UTF-8"); 0197 } 0198 0199 return QByteArray(d->mProber->GetCharSetName()); 0200 } 0201 0202 float KEncodingProber::confidence() const 0203 { 0204 if (!d->mProber) { 0205 return 0.0; 0206 } 0207 0208 return d->mProber->GetConfidence(); 0209 } 0210 0211 KEncodingProber::ProberType KEncodingProber::proberType() const 0212 { 0213 return d->mProberType; 0214 } 0215 0216 void KEncodingProber::setProberType(KEncodingProber::ProberType proberType) 0217 { 0218 d->setProberType(proberType); 0219 reset(); 0220 } 0221 0222 KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString &lang) 0223 { 0224 if (lang.isEmpty()) { 0225 return KEncodingProber::Universal; 0226 } else if (lang == tr("Disabled", "@item Text character set")) { 0227 return KEncodingProber::None; 0228 } else if (lang == tr("Universal", "@item Text character set")) { 0229 return KEncodingProber::Universal; 0230 } else if (lang == tr("Unicode", "@item Text character set")) { 0231 return KEncodingProber::Unicode; 0232 } else if (lang == tr("Cyrillic", "@item Text character set")) { 0233 return KEncodingProber::Cyrillic; 0234 } else if (lang == tr("Western European", "@item Text character set")) { 0235 return KEncodingProber::WesternEuropean; 0236 } else if (lang == tr("Central European", "@item Text character set")) { 0237 return KEncodingProber::CentralEuropean; 0238 } else if (lang == tr("Greek", "@item Text character set")) { 0239 return KEncodingProber::Greek; 0240 } else if (lang == tr("Hebrew", "@item Text character set")) { 0241 return KEncodingProber::Hebrew; 0242 } else if (lang == tr("Turkish", "@item Text character set")) { 0243 return KEncodingProber::Turkish; 0244 } else if (lang == tr("Japanese", "@item Text character set")) { 0245 return KEncodingProber::Japanese; 0246 } else if (lang == tr("Baltic", "@item Text character set")) { 0247 return KEncodingProber::Baltic; 0248 } else if (lang == tr("Chinese Traditional", "@item Text character set")) { 0249 return KEncodingProber::ChineseTraditional; 0250 } else if (lang == tr("Chinese Simplified", "@item Text character set")) { 0251 return KEncodingProber::ChineseSimplified; 0252 } else if (lang == tr("Korean", "@item Text character set")) { 0253 return KEncodingProber::Korean; 0254 } else if (lang == tr("Thai", "@item Text character set")) { 0255 return KEncodingProber::Thai; 0256 } else if (lang == tr("Arabic", "@item Text character set")) { 0257 return KEncodingProber::Arabic; 0258 } 0259 0260 return KEncodingProber::Universal; 0261 } 0262 0263 QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType) 0264 { 0265 switch (proberType) { 0266 case KEncodingProber::None: 0267 return tr("Disabled", "@item Text character set"); 0268 break; 0269 case KEncodingProber::Universal: 0270 return tr("Universal", "@item Text character set"); 0271 break; 0272 case KEncodingProber::Arabic: 0273 return tr("Arabic", "@item Text character set"); 0274 break; 0275 case KEncodingProber::Baltic: 0276 return tr("Baltic", "@item Text character set"); 0277 break; 0278 case KEncodingProber::CentralEuropean: 0279 return tr("Central European", "@item Text character set"); 0280 break; 0281 case KEncodingProber::Cyrillic: 0282 return tr("Cyrillic", "@item Text character set"); 0283 break; 0284 case KEncodingProber::Greek: 0285 return tr("Greek", "@item Text character set"); 0286 break; 0287 case KEncodingProber::Hebrew: 0288 return tr("Hebrew", "@item Text character set"); 0289 break; 0290 case KEncodingProber::Japanese: 0291 return tr("Japanese", "@item Text character set"); 0292 break; 0293 case KEncodingProber::Turkish: 0294 return tr("Turkish", "@item Text character set"); 0295 break; 0296 case KEncodingProber::WesternEuropean: 0297 return tr("Western European", "@item Text character set"); 0298 break; 0299 case KEncodingProber::ChineseTraditional: 0300 return tr("Chinese Traditional", "@item Text character set"); 0301 break; 0302 case KEncodingProber::ChineseSimplified: 0303 return tr("Chinese Simplified", "@item Text character set"); 0304 break; 0305 case KEncodingProber::Korean: 0306 return tr("Korean", "@item Text character set"); 0307 break; 0308 case KEncodingProber::Thai: 0309 return tr("Thai", "@item Text character set"); 0310 break; 0311 case KEncodingProber::Unicode: 0312 return tr("Unicode", "@item Text character set"); 0313 break; 0314 default: 0315 return QString(); 0316 } 0317 }