File indexing completed on 2024-04-28 03:53:04

0001 /*  -*- C++ -*-
0002     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
0003     SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com>
0004 
0005     SPDX-License-Identifier: MIT
0006 */
0007 
0008 #include "nsUniversalDetector.h"
0009 
0010 #include "nsEscCharsetProber.h"
0011 #include "nsLatin1Prober.h"
0012 #include "nsMBCSGroupProber.h"
0013 #include "nsSBCSGroupProber.h"
0014 
0015 namespace kencodingprober
0016 {
0017 nsUniversalDetector::nsUniversalDetector()
0018 {
0019     mDone = false;
0020     mBestGuess = -1; // illegal value as signal
0021     mInTag = false;
0022     mEscCharSetProber = nullptr;
0023 
0024     mStart = true;
0025     mDetectedCharset = nullptr;
0026     mGotData = false;
0027     mInputState = ePureAscii;
0028     mLastChar = '\0';
0029 
0030     unsigned int i;
0031     for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
0032         mCharSetProbers[i] = nullptr;
0033     }
0034 }
0035 
0036 nsUniversalDetector::~nsUniversalDetector()
0037 {
0038     for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
0039         delete mCharSetProbers[i];
0040     }
0041     delete mEscCharSetProber;
0042 }
0043 
0044 void nsUniversalDetector::Reset()
0045 {
0046     mDone = false;
0047     mBestGuess = -1; // illegal value as signal
0048     mInTag = false;
0049 
0050     mStart = true;
0051     mDetectedCharset = nullptr;
0052     mGotData = false;
0053     mInputState = ePureAscii;
0054     mLastChar = '\0';
0055 
0056     if (mEscCharSetProber) {
0057         mEscCharSetProber->Reset();
0058     }
0059 
0060     unsigned int i;
0061     for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
0062         if (mCharSetProbers[i]) {
0063             mCharSetProbers[i]->Reset();
0064         }
0065     }
0066 }
0067 
0068 //---------------------------------------------------------------------
// Confidence thresholds, as single-precision values.
// SHORTCUT_THRESHOLD is not referenced in the part of the file shown here.
#define SHORTCUT_THRESHOLD (static_cast<float>(0.95))
// A prober's guess is only reported when its confidence exceeds this value;
// it is also the confidence returned when no better answer is available.
#define MINIMUM_THRESHOLD (static_cast<float>(0.20))
0071 
0072 nsProbingState nsUniversalDetector::HandleData(const char *aBuf, unsigned int aLen)
0073 {
0074     if (mDone) {
0075         return eFoundIt;
0076     }
0077 
0078     if (aLen > 0) {
0079         mGotData = true;
0080     }
0081 
0082     unsigned int i;
0083     for (i = 0; i < aLen; i++) {
0084         // other than 0xa0, if every other character is ascii, the page is ascii
0085         if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') { // Since many Ascii only page contains NBSP
0086             // we got a non-ascii byte (high-byte)
0087             if (mInputState != eHighbyte) {
0088                 // adjust state
0089                 mInputState = eHighbyte;
0090 
0091                 // kill mEscCharSetProber if it is active
0092                 delete mEscCharSetProber;
0093                 mEscCharSetProber = nullptr;
0094 
0095                 // start multibyte and singlebyte charset prober
0096                 if (nullptr == mCharSetProbers[0]) {
0097                     mCharSetProbers[0] = new nsMBCSGroupProber;
0098                 }
0099                 if (nullptr == mCharSetProbers[1]) {
0100                     mCharSetProbers[1] = new nsSBCSGroupProber;
0101                 }
0102                 if (nullptr == mCharSetProbers[2]) {
0103                     mCharSetProbers[2] = new nsLatin1Prober;
0104                 }
0105             }
0106         } else {
0107             // ok, just pure ascii so far
0108             if (ePureAscii == mInputState && (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) {
0109                 // found escape character or HZ "~{"
0110                 mInputState = eEscAscii;
0111             }
0112 
0113             mLastChar = aBuf[i];
0114         }
0115     }
0116 
0117     nsProbingState st = eDetecting;
0118     switch (mInputState) {
0119     case eEscAscii:
0120         if (nullptr == mEscCharSetProber) {
0121             mEscCharSetProber = new nsEscCharSetProber;
0122         }
0123         st = mEscCharSetProber->HandleData(aBuf, aLen);
0124         if (st == eFoundIt) {
0125             mDone = true;
0126             mDetectedCharset = mEscCharSetProber->GetCharSetName();
0127         }
0128         break;
0129     case eHighbyte:
0130         for (i = 0; i < NUM_OF_CHARSET_PROBERS; ++i) {
0131             st = mCharSetProbers[i]->HandleData(aBuf, aLen);
0132             if (st == eFoundIt) {
0133                 mDone = true;
0134                 mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
0135             }
0136         }
0137         break;
0138 
0139     default: // pure ascii
0140         mDetectedCharset = "UTF-8";
0141     }
0142     return st;
0143 }
0144 
0145 //---------------------------------------------------------------------
0146 const char *nsUniversalDetector::GetCharSetName()
0147 {
0148     if (mDetectedCharset) {
0149         return mDetectedCharset;
0150     }
0151     switch (mInputState) {
0152     case eHighbyte: {
0153         float proberConfidence;
0154         float maxProberConfidence = (float)0.0;
0155         int maxProber = 0;
0156 
0157         for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
0158             proberConfidence = mCharSetProbers[i]->GetConfidence();
0159             if (proberConfidence > maxProberConfidence) {
0160                 maxProberConfidence = proberConfidence;
0161                 maxProber = i;
0162             }
0163         }
0164         // do not report anything because we are not confident of it, that's in fact a negative answer
0165         if (maxProberConfidence > MINIMUM_THRESHOLD) {
0166             return mCharSetProbers[maxProber]->GetCharSetName();
0167         }
0168     }
0169     case eEscAscii:
0170         break;
0171     default: // pure ascii
0172              ;
0173     }
0174     return "UTF-8";
0175 }
0176 
0177 //---------------------------------------------------------------------
0178 float nsUniversalDetector::GetConfidence()
0179 {
0180     if (!mGotData) {
0181         // we haven't got any data yet, return immediately
0182         // caller program sometimes call DataEnd before anything has been sent to detector
0183         return MINIMUM_THRESHOLD;
0184     }
0185     if (mDetectedCharset) {
0186         return 0.99f;
0187     }
0188     switch (mInputState) {
0189     case eHighbyte: {
0190         float proberConfidence;
0191         float maxProberConfidence = (float)0.0;
0192         int maxProber = 0;
0193 
0194         for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
0195             proberConfidence = mCharSetProbers[i]->GetConfidence();
0196             if (proberConfidence > maxProberConfidence) {
0197                 maxProberConfidence = proberConfidence;
0198                 maxProber = i;
0199             }
0200         }
0201         // do not report anything because we are not confident of it, that's in fact a negative answer
0202         if (maxProberConfidence > MINIMUM_THRESHOLD) {
0203             return mCharSetProbers[maxProber]->GetConfidence();
0204         }
0205     }
0206     case eEscAscii:
0207         break;
0208     default: // pure ascii
0209              ;
0210     }
0211     return MINIMUM_THRESHOLD;
0212 }
0213 
0214 nsProbingState nsUniversalDetector::GetState()
0215 {
0216     if (mDone) {
0217         return eFoundIt;
0218     } else {
0219         return eDetecting;
0220     }
0221 }
0222 }