File indexing completed on 2024-04-21 14:53:58

0001 /*
0002     This file is part of the KDE libraries
0003 
0004     SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com>
0005 
0006     SPDX-License-Identifier: LGPL-2.0-or-later
0007 */
0008 
0009 #include "kencodingprober.h"
0010 
0011 #include "probers/ChineseGroupProber.h"
0012 #include "probers/JapaneseGroupProber.h"
0013 #include "probers/UnicodeGroupProber.h"
0014 #include "probers/nsCharSetProber.h"
0015 #include "probers/nsMBCSGroupProber.h"
0016 #include "probers/nsSBCSGroupProber.h"
0017 #include "probers/nsUniversalDetector.h"
0018 
0019 #include <string.h>
0020 
0021 class KEncodingProberPrivate
0022 {
0023 public:
0024     KEncodingProberPrivate()
0025         : mProber(nullptr)
0026         , mStart(true)
0027     {
0028     }
0029     ~KEncodingProberPrivate()
0030     {
0031         delete mProber;
0032     }
0033     void setProberType(KEncodingProber::ProberType pType)
0034     {
0035         mProberType = pType;
0036         /* handle multi-byte encodings carefully , because they're hard to detect,
0037          *   and have to use some Stastics methods.
0038          * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok,
0039          *   because encoding state machine can detect many such encodings.
0040          */
0041 
0042         delete mProber;
0043 
0044         switch (mProberType) {
0045         case KEncodingProber::None:
0046             mProber = nullptr;
0047             break;
0048         case KEncodingProber::Arabic:
0049         case KEncodingProber::Baltic:
0050         case KEncodingProber::CentralEuropean:
0051         case KEncodingProber::Cyrillic:
0052         case KEncodingProber::Greek:
0053         case KEncodingProber::Hebrew:
0054         case KEncodingProber::NorthernSaami:
0055         case KEncodingProber::Other:
0056         case KEncodingProber::SouthEasternEurope:
0057         case KEncodingProber::Thai:
0058         case KEncodingProber::Turkish:
0059         case KEncodingProber::WesternEuropean:
0060             mProber = new kencodingprober::nsSBCSGroupProber();
0061             break;
0062         case KEncodingProber::ChineseSimplified:
0063         case KEncodingProber::ChineseTraditional:
0064             mProber = new kencodingprober::ChineseGroupProber();
0065             break;
0066         case KEncodingProber::Japanese:
0067             mProber = new kencodingprober::JapaneseGroupProber();
0068             break;
0069         case KEncodingProber::Korean:
0070             mProber = new kencodingprober::nsMBCSGroupProber();
0071             break;
0072         case KEncodingProber::Unicode:
0073             mProber = new kencodingprober::UnicodeGroupProber();
0074             break;
0075         case KEncodingProber::Universal:
0076             mProber = new kencodingprober::nsUniversalDetector();
0077             break;
0078         default:
0079             mProber = nullptr;
0080         }
0081     }
0082     void unicodeTest(const char *aBuf, int aLen)
0083     {
0084         if (mStart) {
0085             mStart = false;
0086             if (aLen > 3) {
0087                 switch (aBuf[0]) {
0088                 case '\xEF':
0089                     if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
0090                     // EF BB BF  UTF-8 encoded BOM
0091                     {
0092                         mProberState = KEncodingProber::FoundIt;
0093                     }
0094                     break;
0095                 case '\xFE':
0096                     if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
0097                     // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
0098                     {
0099                         mProberState = KEncodingProber::FoundIt;
0100                     } else if ('\xFF' == aBuf[1])
0101                     // FE FF  UTF-16, big endian BOM
0102                     {
0103                         mProberState = KEncodingProber::FoundIt;
0104                     }
0105                     break;
0106                 case '\x00':
0107                     if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
0108                     // 00 00 FE FF  UTF-32, big-endian BOM
0109                     {
0110                         mProberState = KEncodingProber::FoundIt;
0111                     } else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
0112                     // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
0113                     {
0114                         mProberState = KEncodingProber::FoundIt;
0115                     }
0116                     break;
0117                 case '\xFF':
0118                     if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
0119                     // FF FE 00 00  UTF-32, little-endian BOM
0120                     {
0121                         mProberState = KEncodingProber::FoundIt;
0122                     } else if ('\xFE' == aBuf[1])
0123                     // FF FE  UTF-16, little endian BOM
0124                     {
0125                         mProberState = KEncodingProber::FoundIt;
0126                     }
0127                     break;
0128                 } // switch
0129             }
0130         }
0131     }
0132     KEncodingProber::ProberType mProberType;
0133     KEncodingProber::ProberState mProberState;
0134     kencodingprober::nsCharSetProber *mProber;
0135     bool mStart;
0136 };
0137 
0138 KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType)
0139     : d(new KEncodingProberPrivate())
0140 {
0141     setProberType(proberType);
0142 }
0143 
0144 KEncodingProber::~KEncodingProber() = default;
0145 
0146 void KEncodingProber::reset()
0147 {
0148     d->mProberState = KEncodingProber::Probing;
0149     d->mStart = true;
0150 }
0151 
0152 KEncodingProber::ProberState KEncodingProber::feed(const QByteArray &data)
0153 {
0154     return feed(data.data(), data.size());
0155 }
0156 
0157 KEncodingProber::ProberState KEncodingProber::feed(const char *data, int len)
0158 {
0159     if (!d->mProber) {
0160         return d->mProberState;
0161     }
0162     if (d->mProberState == Probing) {
0163         if (d->mStart) {
0164             d->unicodeTest(data, len);
0165             if (d->mProberState == FoundIt) {
0166                 return d->mProberState;
0167             }
0168         }
0169         d->mProber->HandleData(data, len);
0170         switch (d->mProber->GetState()) {
0171         case kencodingprober::eNotMe:
0172             d->mProberState = NotMe;
0173             break;
0174         case kencodingprober::eFoundIt:
0175             d->mProberState = FoundIt;
0176             break;
0177         default:
0178             d->mProberState = Probing;
0179             break;
0180         }
0181     }
0182 #ifdef DEBUG_PROBE
0183     d->mProber->DumpStatus();
0184 #endif
0185     return d->mProberState;
0186 }
0187 
0188 KEncodingProber::ProberState KEncodingProber::state() const
0189 {
0190     return d->mProberState;
0191 }
0192 
0193 QByteArray KEncodingProber::encoding() const
0194 {
0195     if (!d->mProber) {
0196         return QByteArray("UTF-8");
0197     }
0198 
0199     return QByteArray(d->mProber->GetCharSetName());
0200 }
0201 
0202 float KEncodingProber::confidence() const
0203 {
0204     if (!d->mProber) {
0205         return 0.0;
0206     }
0207 
0208     return d->mProber->GetConfidence();
0209 }
0210 
0211 KEncodingProber::ProberType KEncodingProber::proberType() const
0212 {
0213     return d->mProberType;
0214 }
0215 
0216 void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
0217 {
0218     d->setProberType(proberType);
0219     reset();
0220 }
0221 
0222 KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString &lang)
0223 {
0224     if (lang.isEmpty()) {
0225         return KEncodingProber::Universal;
0226     } else if (lang == tr("Disabled", "@item Text character set")) {
0227         return KEncodingProber::None;
0228     } else if (lang == tr("Universal", "@item Text character set")) {
0229         return KEncodingProber::Universal;
0230     } else if (lang == tr("Unicode", "@item Text character set")) {
0231         return KEncodingProber::Unicode;
0232     } else if (lang == tr("Cyrillic", "@item Text character set")) {
0233         return KEncodingProber::Cyrillic;
0234     } else if (lang == tr("Western European", "@item Text character set")) {
0235         return KEncodingProber::WesternEuropean;
0236     } else if (lang == tr("Central European", "@item Text character set")) {
0237         return KEncodingProber::CentralEuropean;
0238     } else if (lang == tr("Greek", "@item Text character set")) {
0239         return KEncodingProber::Greek;
0240     } else if (lang == tr("Hebrew", "@item Text character set")) {
0241         return KEncodingProber::Hebrew;
0242     } else if (lang == tr("Turkish", "@item Text character set")) {
0243         return KEncodingProber::Turkish;
0244     } else if (lang == tr("Japanese", "@item Text character set")) {
0245         return KEncodingProber::Japanese;
0246     } else if (lang == tr("Baltic", "@item Text character set")) {
0247         return KEncodingProber::Baltic;
0248     } else if (lang == tr("Chinese Traditional", "@item Text character set")) {
0249         return KEncodingProber::ChineseTraditional;
0250     } else if (lang == tr("Chinese Simplified", "@item Text character set")) {
0251         return KEncodingProber::ChineseSimplified;
0252     } else if (lang == tr("Korean", "@item Text character set")) {
0253         return KEncodingProber::Korean;
0254     } else if (lang == tr("Thai", "@item Text character set")) {
0255         return KEncodingProber::Thai;
0256     } else if (lang == tr("Arabic", "@item Text character set")) {
0257         return KEncodingProber::Arabic;
0258     }
0259 
0260     return KEncodingProber::Universal;
0261 }
0262 
0263 QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
0264 {
0265     switch (proberType) {
0266     case KEncodingProber::None:
0267         return tr("Disabled", "@item Text character set");
0268         break;
0269     case KEncodingProber::Universal:
0270         return tr("Universal", "@item Text character set");
0271         break;
0272     case KEncodingProber::Arabic:
0273         return tr("Arabic", "@item Text character set");
0274         break;
0275     case KEncodingProber::Baltic:
0276         return tr("Baltic", "@item Text character set");
0277         break;
0278     case KEncodingProber::CentralEuropean:
0279         return tr("Central European", "@item Text character set");
0280         break;
0281     case KEncodingProber::Cyrillic:
0282         return tr("Cyrillic", "@item Text character set");
0283         break;
0284     case KEncodingProber::Greek:
0285         return tr("Greek", "@item Text character set");
0286         break;
0287     case KEncodingProber::Hebrew:
0288         return tr("Hebrew", "@item Text character set");
0289         break;
0290     case KEncodingProber::Japanese:
0291         return tr("Japanese", "@item Text character set");
0292         break;
0293     case KEncodingProber::Turkish:
0294         return tr("Turkish", "@item Text character set");
0295         break;
0296     case KEncodingProber::WesternEuropean:
0297         return tr("Western European", "@item Text character set");
0298         break;
0299     case KEncodingProber::ChineseTraditional:
0300         return tr("Chinese Traditional", "@item Text character set");
0301         break;
0302     case KEncodingProber::ChineseSimplified:
0303         return tr("Chinese Simplified", "@item Text character set");
0304         break;
0305     case KEncodingProber::Korean:
0306         return tr("Korean", "@item Text character set");
0307         break;
0308     case KEncodingProber::Thai:
0309         return tr("Thai", "@item Text character set");
0310         break;
0311     case KEncodingProber::Unicode:
0312         return tr("Unicode", "@item Text character set");
0313         break;
0314     default:
0315         return QString();
0316     }
0317 }