File indexing completed on 2024-04-28 07:41:32

0001 /*
0002     This file is part of the KDE libraries
0003 
0004     SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com>
0005 
0006     SPDX-License-Identifier: LGPL-2.0-or-later
0007 */
0008 
0009 #include "kencodingprober.h"
0010 
0011 #include "probers/ChineseGroupProber.h"
0012 #include "probers/JapaneseGroupProber.h"
0013 #include "probers/UnicodeGroupProber.h"
0014 #include "probers/nsCharSetProber.h"
0015 #include "probers/nsMBCSGroupProber.h"
0016 #include "probers/nsSBCSGroupProber.h"
0017 #include "probers/nsUniversalDetector.h"
0018 
0019 #include <string.h>
0020 
0021 class KEncodingProberPrivate
0022 {
0023 public:
0024     KEncodingProberPrivate()
0025         : mProber(nullptr)
0026         , mStart(true)
0027     {
0028     }
0029     ~KEncodingProberPrivate()
0030     {
0031         delete mProber;
0032     }
0033     void setProberType(KEncodingProber::ProberType pType)
0034     {
0035         mProberType = pType;
0036         /* handle multi-byte encodings carefully , because they're hard to detect,
0037          *   and have to use some Stastics methods.
0038          * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok,
0039          *   because encoding state machine can detect many such encodings.
0040          */
0041 
0042         delete mProber;
0043 
0044         switch (mProberType) {
0045         case KEncodingProber::None:
0046             mProber = nullptr;
0047             break;
0048         case KEncodingProber::Arabic:
0049         case KEncodingProber::Baltic:
0050         case KEncodingProber::CentralEuropean:
0051         case KEncodingProber::Cyrillic:
0052         case KEncodingProber::Greek:
0053         case KEncodingProber::Hebrew:
0054         case KEncodingProber::NorthernSaami:
0055         case KEncodingProber::Other:
0056         case KEncodingProber::SouthEasternEurope:
0057         case KEncodingProber::Thai:
0058         case KEncodingProber::Turkish:
0059         case KEncodingProber::WesternEuropean:
0060             mProber = new kencodingprober::nsSBCSGroupProber();
0061             break;
0062         case KEncodingProber::ChineseSimplified:
0063         case KEncodingProber::ChineseTraditional:
0064             mProber = new kencodingprober::ChineseGroupProber();
0065             break;
0066         case KEncodingProber::Japanese:
0067             mProber = new kencodingprober::JapaneseGroupProber();
0068             break;
0069         case KEncodingProber::Korean:
0070             mProber = new kencodingprober::nsMBCSGroupProber();
0071             break;
0072         case KEncodingProber::Unicode:
0073             mProber = new kencodingprober::UnicodeGroupProber();
0074             break;
0075         case KEncodingProber::Universal:
0076             mProber = new kencodingprober::nsUniversalDetector();
0077             break;
0078         default:
0079             mProber = nullptr;
0080         }
0081     }
0082     void unicodeTest(const char *aBuf, int aLen)
0083     {
0084         if (mStart) {
0085             mStart = false;
0086             if (aLen > 3) {
0087                 switch (aBuf[0]) {
0088                 case '\xEF':
0089                     if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
0090                     // EF BB BF  UTF-8 encoded BOM
0091                     {
0092                         mProberState = KEncodingProber::FoundIt;
0093                     }
0094                     break;
0095                 case '\xFE':
0096                     if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
0097                     // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
0098                     {
0099                         mProberState = KEncodingProber::FoundIt;
0100                     } else if ('\xFF' == aBuf[1])
0101                     // FE FF  UTF-16, big endian BOM
0102                     {
0103                         mProberState = KEncodingProber::FoundIt;
0104                     }
0105                     break;
0106                 case '\x00':
0107                     if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
0108                     // 00 00 FE FF  UTF-32, big-endian BOM
0109                     {
0110                         mProberState = KEncodingProber::FoundIt;
0111                     } else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
0112                     // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
0113                     {
0114                         mProberState = KEncodingProber::FoundIt;
0115                     }
0116                     break;
0117                 case '\xFF':
0118                     if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
0119                     // FF FE 00 00  UTF-32, little-endian BOM
0120                     {
0121                         mProberState = KEncodingProber::FoundIt;
0122                     } else if ('\xFE' == aBuf[1])
0123                     // FF FE  UTF-16, little endian BOM
0124                     {
0125                         mProberState = KEncodingProber::FoundIt;
0126                     }
0127                     break;
0128                 } // switch
0129             }
0130         }
0131     }
0132     KEncodingProber::ProberType mProberType;
0133     KEncodingProber::ProberState mProberState;
0134     kencodingprober::nsCharSetProber *mProber;
0135     bool mStart;
0136 };
0137 
0138 KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType)
0139     : d(new KEncodingProberPrivate())
0140 {
0141     setProberType(proberType);
0142 }
0143 
0144 KEncodingProber::~KEncodingProber() = default;
0145 
0146 void KEncodingProber::reset()
0147 {
0148     d->mProberState = KEncodingProber::Probing;
0149     d->mStart = true;
0150 }
0151 
0152 KEncodingProber::ProberState KEncodingProber::feed(QByteArrayView data)
0153 {
0154     if (!d->mProber) {
0155         return d->mProberState;
0156     }
0157     if (d->mProberState == Probing) {
0158         if (d->mStart) {
0159             d->unicodeTest(data.constData(), data.size());
0160             if (d->mProberState == FoundIt) {
0161                 return d->mProberState;
0162             }
0163         }
0164         d->mProber->HandleData(data.constData(), data.size());
0165         switch (d->mProber->GetState()) {
0166         case kencodingprober::eNotMe:
0167             d->mProberState = NotMe;
0168             break;
0169         case kencodingprober::eFoundIt:
0170             d->mProberState = FoundIt;
0171             break;
0172         default:
0173             d->mProberState = Probing;
0174             break;
0175         }
0176     }
0177 #ifdef DEBUG_PROBE
0178     d->mProber->DumpStatus();
0179 #endif
0180     return d->mProberState;
0181 }
0182 
0183 KEncodingProber::ProberState KEncodingProber::state() const
0184 {
0185     return d->mProberState;
0186 }
0187 
0188 QByteArray KEncodingProber::encoding() const
0189 {
0190     if (!d->mProber) {
0191         return QByteArray("UTF-8");
0192     }
0193 
0194     return QByteArray(d->mProber->GetCharSetName());
0195 }
0196 
0197 float KEncodingProber::confidence() const
0198 {
0199     if (!d->mProber) {
0200         return 0.0;
0201     }
0202 
0203     return d->mProber->GetConfidence();
0204 }
0205 
0206 KEncodingProber::ProberType KEncodingProber::proberType() const
0207 {
0208     return d->mProberType;
0209 }
0210 
0211 void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
0212 {
0213     d->setProberType(proberType);
0214     reset();
0215 }
0216 
0217 KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString &lang)
0218 {
0219     if (lang.isEmpty()) {
0220         return KEncodingProber::Universal;
0221     } else if (lang == tr("Disabled", "@item Text character set")) {
0222         return KEncodingProber::None;
0223     } else if (lang == tr("Universal", "@item Text character set")) {
0224         return KEncodingProber::Universal;
0225     } else if (lang == tr("Unicode", "@item Text character set")) {
0226         return KEncodingProber::Unicode;
0227     } else if (lang == tr("Cyrillic", "@item Text character set")) {
0228         return KEncodingProber::Cyrillic;
0229     } else if (lang == tr("Western European", "@item Text character set")) {
0230         return KEncodingProber::WesternEuropean;
0231     } else if (lang == tr("Central European", "@item Text character set")) {
0232         return KEncodingProber::CentralEuropean;
0233     } else if (lang == tr("Greek", "@item Text character set")) {
0234         return KEncodingProber::Greek;
0235     } else if (lang == tr("Hebrew", "@item Text character set")) {
0236         return KEncodingProber::Hebrew;
0237     } else if (lang == tr("Turkish", "@item Text character set")) {
0238         return KEncodingProber::Turkish;
0239     } else if (lang == tr("Japanese", "@item Text character set")) {
0240         return KEncodingProber::Japanese;
0241     } else if (lang == tr("Baltic", "@item Text character set")) {
0242         return KEncodingProber::Baltic;
0243     } else if (lang == tr("Chinese Traditional", "@item Text character set")) {
0244         return KEncodingProber::ChineseTraditional;
0245     } else if (lang == tr("Chinese Simplified", "@item Text character set")) {
0246         return KEncodingProber::ChineseSimplified;
0247     } else if (lang == tr("Korean", "@item Text character set")) {
0248         return KEncodingProber::Korean;
0249     } else if (lang == tr("Thai", "@item Text character set")) {
0250         return KEncodingProber::Thai;
0251     } else if (lang == tr("Arabic", "@item Text character set")) {
0252         return KEncodingProber::Arabic;
0253     }
0254 
0255     return KEncodingProber::Universal;
0256 }
0257 
0258 QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
0259 {
0260     switch (proberType) {
0261     case KEncodingProber::None:
0262         return tr("Disabled", "@item Text character set");
0263         break;
0264     case KEncodingProber::Universal:
0265         return tr("Universal", "@item Text character set");
0266         break;
0267     case KEncodingProber::Arabic:
0268         return tr("Arabic", "@item Text character set");
0269         break;
0270     case KEncodingProber::Baltic:
0271         return tr("Baltic", "@item Text character set");
0272         break;
0273     case KEncodingProber::CentralEuropean:
0274         return tr("Central European", "@item Text character set");
0275         break;
0276     case KEncodingProber::Cyrillic:
0277         return tr("Cyrillic", "@item Text character set");
0278         break;
0279     case KEncodingProber::Greek:
0280         return tr("Greek", "@item Text character set");
0281         break;
0282     case KEncodingProber::Hebrew:
0283         return tr("Hebrew", "@item Text character set");
0284         break;
0285     case KEncodingProber::Japanese:
0286         return tr("Japanese", "@item Text character set");
0287         break;
0288     case KEncodingProber::Turkish:
0289         return tr("Turkish", "@item Text character set");
0290         break;
0291     case KEncodingProber::WesternEuropean:
0292         return tr("Western European", "@item Text character set");
0293         break;
0294     case KEncodingProber::ChineseTraditional:
0295         return tr("Chinese Traditional", "@item Text character set");
0296         break;
0297     case KEncodingProber::ChineseSimplified:
0298         return tr("Chinese Simplified", "@item Text character set");
0299         break;
0300     case KEncodingProber::Korean:
0301         return tr("Korean", "@item Text character set");
0302         break;
0303     case KEncodingProber::Thai:
0304         return tr("Thai", "@item Text character set");
0305         break;
0306     case KEncodingProber::Unicode:
0307         return tr("Unicode", "@item Text character set");
0308         break;
0309     default:
0310         return QString();
0311     }
0312 }