File indexing completed on 2024-05-05 16:05:51
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> 0003 SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com> 0004 0005 SPDX-License-Identifier: MIT 0006 */ 0007 0008 #include "nsUniversalDetector.h" 0009 0010 #include "nsEscCharsetProber.h" 0011 #include "nsLatin1Prober.h" 0012 #include "nsMBCSGroupProber.h" 0013 #include "nsSBCSGroupProber.h" 0014 0015 namespace kencodingprober 0016 { 0017 nsUniversalDetector::nsUniversalDetector() 0018 { 0019 mDone = false; 0020 mBestGuess = -1; // illegal value as signal 0021 mInTag = false; 0022 mEscCharSetProber = nullptr; 0023 0024 mStart = true; 0025 mDetectedCharset = nullptr; 0026 mGotData = false; 0027 mInputState = ePureAscii; 0028 mLastChar = '\0'; 0029 0030 unsigned int i; 0031 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { 0032 mCharSetProbers[i] = nullptr; 0033 } 0034 } 0035 0036 nsUniversalDetector::~nsUniversalDetector() 0037 { 0038 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { 0039 delete mCharSetProbers[i]; 0040 } 0041 delete mEscCharSetProber; 0042 } 0043 0044 void nsUniversalDetector::Reset() 0045 { 0046 mDone = false; 0047 mBestGuess = -1; // illegal value as signal 0048 mInTag = false; 0049 0050 mStart = true; 0051 mDetectedCharset = nullptr; 0052 mGotData = false; 0053 mInputState = ePureAscii; 0054 mLastChar = '\0'; 0055 0056 if (mEscCharSetProber) { 0057 mEscCharSetProber->Reset(); 0058 } 0059 0060 unsigned int i; 0061 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { 0062 if (mCharSetProbers[i]) { 0063 mCharSetProbers[i]->Reset(); 0064 } 0065 } 0066 } 0067 0068 //--------------------------------------------------------------------- 0069 #define SHORTCUT_THRESHOLD (float)0.95 0070 #define MINIMUM_THRESHOLD (float)0.20 0071 0072 nsProbingState nsUniversalDetector::HandleData(const char *aBuf, unsigned int aLen) 0073 { 0074 if (mDone) { 0075 return eFoundIt; 0076 } 0077 0078 if (aLen > 0) { 0079 mGotData = true; 0080 } 0081 0082 unsigned int i; 0083 for (i = 0; i < aLen; i++) { 0084 // other than 0xa0, if every other character is ascii, the page is ascii 0085 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') { // Since many Ascii only page contains NBSP 0086 // we got a non-ascii byte (high-byte) 0087 if (mInputState != eHighbyte) { 0088 // adjust state 0089 mInputState = eHighbyte; 0090 0091 // kill mEscCharSetProber if it is active 0092 delete mEscCharSetProber; 0093 mEscCharSetProber = nullptr; 0094 0095 // start multibyte and singlebyte charset prober 0096 if (nullptr == mCharSetProbers[0]) { 0097 mCharSetProbers[0] = new nsMBCSGroupProber; 0098 } 0099 if (nullptr == mCharSetProbers[1]) { 0100 mCharSetProbers[1] = new nsSBCSGroupProber; 0101 } 0102 if (nullptr == mCharSetProbers[2]) { 0103 mCharSetProbers[2] = new nsLatin1Prober; 0104 } 0105 } 0106 } else { 0107 // ok, just pure ascii so far 0108 if (ePureAscii == mInputState && (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) { 0109 // found escape character or HZ "~{" 0110 mInputState = eEscAscii; 0111 } 0112 0113 mLastChar = aBuf[i]; 0114 } 0115 } 0116 0117 nsProbingState st = eDetecting; 0118 switch (mInputState) { 0119 case eEscAscii: 0120 if (nullptr == mEscCharSetProber) { 0121 mEscCharSetProber = new nsEscCharSetProber; 0122 } 0123 st = mEscCharSetProber->HandleData(aBuf, aLen); 0124 if (st == eFoundIt) { 0125 mDone = true; 0126 mDetectedCharset = mEscCharSetProber->GetCharSetName(); 0127 } 0128 break; 0129 case eHighbyte: 0130 for (i = 0; i < NUM_OF_CHARSET_PROBERS; ++i) { 0131 st = mCharSetProbers[i]->HandleData(aBuf, aLen); 0132 if (st == eFoundIt) { 0133 mDone = true; 0134 mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); 0135 } 0136 } 0137 break; 0138 0139 default: // pure ascii 0140 mDetectedCharset = "UTF-8"; 0141 } 0142 return st; 0143 } 0144 0145 //--------------------------------------------------------------------- 0146 const char *nsUniversalDetector::GetCharSetName() 0147 { 0148 if (mDetectedCharset) { 0149 return mDetectedCharset; 0150 } 0151 switch (mInputState) { 0152 case eHighbyte: { 0153 float proberConfidence; 0154 float maxProberConfidence = (float)0.0; 0155 int maxProber = 0; 0156 0157 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { 0158 proberConfidence = mCharSetProbers[i]->GetConfidence(); 0159 if (proberConfidence > maxProberConfidence) { 0160 maxProberConfidence = proberConfidence; 0161 maxProber = i; 0162 } 0163 } 0164 // do not report anything because we are not confident of it, that's in fact a negative answer 0165 if (maxProberConfidence > MINIMUM_THRESHOLD) { 0166 return mCharSetProbers[maxProber]->GetCharSetName(); 0167 } 0168 } 0169 case eEscAscii: 0170 break; 0171 default: // pure ascii 0172 ; 0173 } 0174 return "UTF-8"; 0175 } 0176 0177 //--------------------------------------------------------------------- 0178 float nsUniversalDetector::GetConfidence() 0179 { 0180 if (!mGotData) { 0181 // we haven't got any data yet, return immediately 0182 // caller program sometimes call DataEnd before anything has been sent to detector 0183 return MINIMUM_THRESHOLD; 0184 } 0185 if (mDetectedCharset) { 0186 return 0.99f; 0187 } 0188 switch (mInputState) { 0189 case eHighbyte: { 0190 float proberConfidence; 0191 float maxProberConfidence = (float)0.0; 0192 int maxProber = 0; 0193 0194 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { 0195 proberConfidence = mCharSetProbers[i]->GetConfidence(); 0196 if (proberConfidence > maxProberConfidence) { 0197 maxProberConfidence = proberConfidence; 0198 maxProber = i; 0199 } 0200 } 0201 // do not report anything because we are not confident of it, that's in fact a negative answer 0202 if (maxProberConfidence > MINIMUM_THRESHOLD) { 0203 return mCharSetProbers[maxProber]->GetConfidence(); 0204 } 0205 } 0206 case eEscAscii: 0207 break; 0208 default: // pure ascii 0209 ; 0210 } 0211 return MINIMUM_THRESHOLD; 0212 } 0213 0214 nsProbingState nsUniversalDetector::GetState() 0215 { 0216 if (mDone) { 0217 return eFoundIt; 0218 } else { 0219 return eDetecting; 0220 } 0221 } 0222 }