File indexing completed on 2024-03-24 03:55:50
0001 /* 0002 This file is part of the KDE libraries 0003 0004 SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com> 0005 0006 SPDX-License-Identifier: LGPL-2.0-or-later 0007 */ 0008 0009 #include "kencodingprober.h" 0010 0011 #include "probers/ChineseGroupProber.h" 0012 #include "probers/JapaneseGroupProber.h" 0013 #include "probers/UnicodeGroupProber.h" 0014 #include "probers/nsCharSetProber.h" 0015 #include "probers/nsMBCSGroupProber.h" 0016 #include "probers/nsSBCSGroupProber.h" 0017 #include "probers/nsUniversalDetector.h" 0018 0019 #include <string.h> 0020 0021 class KEncodingProberPrivate 0022 { 0023 public: 0024 KEncodingProberPrivate() 0025 : mProber(nullptr) 0026 , mStart(true) 0027 { 0028 } 0029 ~KEncodingProberPrivate() 0030 { 0031 delete mProber; 0032 } 0033 void setProberType(KEncodingProber::ProberType pType) 0034 { 0035 mProberType = pType; 0036 /* handle multi-byte encodings carefully , because they're hard to detect, 0037 * and have to use some Stastics methods. 0038 * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok, 0039 * because encoding state machine can detect many such encodings. 0040 */ 0041 0042 delete mProber; 0043 0044 switch (mProberType) { 0045 case KEncodingProber::None: 0046 mProber = nullptr; 0047 break; 0048 case KEncodingProber::Arabic: 0049 case KEncodingProber::Baltic: 0050 case KEncodingProber::CentralEuropean: 0051 case KEncodingProber::Cyrillic: 0052 case KEncodingProber::Greek: 0053 case KEncodingProber::Hebrew: 0054 case KEncodingProber::NorthernSaami: 0055 case KEncodingProber::Other: 0056 case KEncodingProber::SouthEasternEurope: 0057 case KEncodingProber::Thai: 0058 case KEncodingProber::Turkish: 0059 case KEncodingProber::WesternEuropean: 0060 mProber = new kencodingprober::nsSBCSGroupProber(); 0061 break; 0062 case KEncodingProber::ChineseSimplified: 0063 case KEncodingProber::ChineseTraditional: 0064 mProber = new kencodingprober::ChineseGroupProber(); 0065 break; 0066 case KEncodingProber::Japanese: 0067 mProber = new kencodingprober::JapaneseGroupProber(); 0068 break; 0069 case KEncodingProber::Korean: 0070 mProber = new kencodingprober::nsMBCSGroupProber(); 0071 break; 0072 case KEncodingProber::Unicode: 0073 mProber = new kencodingprober::UnicodeGroupProber(); 0074 break; 0075 case KEncodingProber::Universal: 0076 mProber = new kencodingprober::nsUniversalDetector(); 0077 break; 0078 default: 0079 mProber = nullptr; 0080 } 0081 } 0082 void unicodeTest(const char *aBuf, int aLen) 0083 { 0084 if (mStart) { 0085 mStart = false; 0086 if (aLen > 3) { 0087 switch (aBuf[0]) { 0088 case '\xEF': 0089 if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) 0090 // EF BB BF UTF-8 encoded BOM 0091 { 0092 mProberState = KEncodingProber::FoundIt; 0093 } 0094 break; 0095 case '\xFE': 0096 if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 0097 // FE FF 00 00 UCS-4, unusual octet order BOM (3412) 0098 { 0099 mProberState = KEncodingProber::FoundIt; 0100 } else if ('\xFF' == aBuf[1]) 0101 // FE FF UTF-16, big endian BOM 0102 { 0103 mProberState = KEncodingProber::FoundIt; 0104 } 0105 break; 0106 case '\x00': 0107 if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3])) 0108 // 00 00 FE FF UTF-32, big-endian BOM 0109 { 0110 mProberState = KEncodingProber::FoundIt; 0111 } else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3])) 0112 // 00 00 FF FE UCS-4, unusual octet order BOM (2143) 0113 { 0114 mProberState = KEncodingProber::FoundIt; 0115 } 0116 break; 0117 case '\xFF': 0118 if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 0119 // FF FE 00 00 UTF-32, little-endian BOM 0120 { 0121 mProberState = KEncodingProber::FoundIt; 0122 } else if ('\xFE' == aBuf[1]) 0123 // FF FE UTF-16, little endian BOM 0124 { 0125 mProberState = KEncodingProber::FoundIt; 0126 } 0127 break; 0128 } // switch 0129 } 0130 } 0131 } 0132 KEncodingProber::ProberType mProberType; 0133 KEncodingProber::ProberState mProberState; 0134 kencodingprober::nsCharSetProber *mProber; 0135 bool mStart; 0136 }; 0137 0138 KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType) 0139 : d(new KEncodingProberPrivate()) 0140 { 0141 setProberType(proberType); 0142 } 0143 0144 KEncodingProber::~KEncodingProber() = default; 0145 0146 void KEncodingProber::reset() 0147 { 0148 d->mProberState = KEncodingProber::Probing; 0149 d->mStart = true; 0150 } 0151 0152 KEncodingProber::ProberState KEncodingProber::feed(QByteArrayView data) 0153 { 0154 if (!d->mProber) { 0155 return d->mProberState; 0156 } 0157 if (d->mProberState == Probing) { 0158 if (d->mStart) { 0159 d->unicodeTest(data.constData(), data.size()); 0160 if (d->mProberState == FoundIt) { 0161 return d->mProberState; 0162 } 0163 } 0164 d->mProber->HandleData(data.constData(), data.size()); 0165 switch (d->mProber->GetState()) { 0166 case kencodingprober::eNotMe: 0167 d->mProberState = NotMe; 0168 break; 0169 case kencodingprober::eFoundIt: 0170 d->mProberState = FoundIt; 0171 break; 0172 default: 0173 d->mProberState = Probing; 0174 break; 0175 } 0176 } 0177 #ifdef DEBUG_PROBE 0178 d->mProber->DumpStatus(); 0179 #endif 0180 return d->mProberState; 0181 } 0182 0183 KEncodingProber::ProberState KEncodingProber::state() const 0184 { 0185 return d->mProberState; 0186 } 0187 0188 QByteArray KEncodingProber::encoding() const 0189 { 0190 if (!d->mProber) { 0191 return QByteArray("UTF-8"); 0192 } 0193 0194 return QByteArray(d->mProber->GetCharSetName()); 0195 } 0196 0197 float KEncodingProber::confidence() const 0198 { 0199 if (!d->mProber) { 0200 return 0.0; 0201 } 0202 0203 return d->mProber->GetConfidence(); 0204 } 0205 0206 KEncodingProber::ProberType KEncodingProber::proberType() const 0207 { 0208 return d->mProberType; 0209 } 0210 0211 void KEncodingProber::setProberType(KEncodingProber::ProberType proberType) 0212 { 0213 d->setProberType(proberType); 0214 reset(); 0215 } 0216 0217 KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString &lang) 0218 { 0219 if (lang.isEmpty()) { 0220 return KEncodingProber::Universal; 0221 } else if (lang == tr("Disabled", "@item Text character set")) { 0222 return KEncodingProber::None; 0223 } else if (lang == tr("Universal", "@item Text character set")) { 0224 return KEncodingProber::Universal; 0225 } else if (lang == tr("Unicode", "@item Text character set")) { 0226 return KEncodingProber::Unicode; 0227 } else if (lang == tr("Cyrillic", "@item Text character set")) { 0228 return KEncodingProber::Cyrillic; 0229 } else if (lang == tr("Western European", "@item Text character set")) { 0230 return KEncodingProber::WesternEuropean; 0231 } else if (lang == tr("Central European", "@item Text character set")) { 0232 return KEncodingProber::CentralEuropean; 0233 } else if (lang == tr("Greek", "@item Text character set")) { 0234 return KEncodingProber::Greek; 0235 } else if (lang == tr("Hebrew", "@item Text character set")) { 0236 return KEncodingProber::Hebrew; 0237 } else if (lang == tr("Turkish", "@item Text character set")) { 0238 return KEncodingProber::Turkish; 0239 } else if (lang == tr("Japanese", "@item Text character set")) { 0240 return KEncodingProber::Japanese; 0241 } else if (lang == tr("Baltic", "@item Text character set")) { 0242 return KEncodingProber::Baltic; 0243 } else if (lang == tr("Chinese Traditional", "@item Text character set")) { 0244 return KEncodingProber::ChineseTraditional; 0245 } else if (lang == tr("Chinese Simplified", "@item Text character set")) { 0246 return KEncodingProber::ChineseSimplified; 0247 } else if (lang == tr("Korean", "@item Text character set")) { 0248 return KEncodingProber::Korean; 0249 } else if (lang == tr("Thai", "@item Text character set")) { 0250 return KEncodingProber::Thai; 0251 } else if (lang == tr("Arabic", "@item Text character set")) { 0252 return KEncodingProber::Arabic; 0253 } 0254 0255 return KEncodingProber::Universal; 0256 } 0257 0258 QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType) 0259 { 0260 switch (proberType) { 0261 case KEncodingProber::None: 0262 return tr("Disabled", "@item Text character set"); 0263 break; 0264 case KEncodingProber::Universal: 0265 return tr("Universal", "@item Text character set"); 0266 break; 0267 case KEncodingProber::Arabic: 0268 return tr("Arabic", "@item Text character set"); 0269 break; 0270 case KEncodingProber::Baltic: 0271 return tr("Baltic", "@item Text character set"); 0272 break; 0273 case KEncodingProber::CentralEuropean: 0274 return tr("Central European", "@item Text character set"); 0275 break; 0276 case KEncodingProber::Cyrillic: 0277 return tr("Cyrillic", "@item Text character set"); 0278 break; 0279 case KEncodingProber::Greek: 0280 return tr("Greek", "@item Text character set"); 0281 break; 0282 case KEncodingProber::Hebrew: 0283 return tr("Hebrew", "@item Text character set"); 0284 break; 0285 case KEncodingProber::Japanese: 0286 return tr("Japanese", "@item Text character set"); 0287 break; 0288 case KEncodingProber::Turkish: 0289 return tr("Turkish", "@item Text character set"); 0290 break; 0291 case KEncodingProber::WesternEuropean: 0292 return tr("Western European", "@item Text character set"); 0293 break; 0294 case KEncodingProber::ChineseTraditional: 0295 return tr("Chinese Traditional", "@item Text character set"); 0296 break; 0297 case KEncodingProber::ChineseSimplified: 0298 return tr("Chinese Simplified", "@item Text character set"); 0299 break; 0300 case KEncodingProber::Korean: 0301 return tr("Korean", "@item Text character set"); 0302 break; 0303 case KEncodingProber::Thai: 0304 return tr("Thai", "@item Text character set"); 0305 break; 0306 case KEncodingProber::Unicode: 0307 return tr("Unicode", "@item Text character set"); 0308 break; 0309 default: 0310 return QString(); 0311 } 0312 }