File indexing completed on 2024-04-28 03:53:04
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> 0003 0004 SPDX-License-Identifier: MIT 0005 */ 0006 0007 #include "nsSBCSGroupProber.h" 0008 0009 #include "UnicodeGroupProber.h" 0010 #include "nsHebrewProber.h" 0011 #include "nsSBCharSetProber.h" 0012 0013 #include <stdio.h> 0014 #include <stdlib.h> 0015 0016 namespace kencodingprober 0017 { 0018 nsSBCSGroupProber::nsSBCSGroupProber() 0019 { 0020 mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model); 0021 mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel); 0022 mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model); 0023 mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel); 0024 mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model); 0025 mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model); 0026 mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model); 0027 mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model); 0028 mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); 0029 mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); 0030 0031 nsHebrewProber *hebprober = new nsHebrewProber(); 0032 // Notice: Any change in these indexes - 10,11,12 must be reflected 0033 // in the code below as well. 0034 mProbers[10] = hebprober; 0035 mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, false, hebprober); // Logical Hebrew 0036 mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, true, hebprober); // Visual Hebrew 0037 mProbers[13] = new UnicodeGroupProber(); 0038 0039 // Tell the Hebrew prober about the logical and visual probers 0040 if (mProbers[10] && mProbers[11] && mProbers[12]) { // all are not null 0041 hebprober->SetModelProbers(mProbers[11], mProbers[12]); 0042 } else { // One or more is null. avoid any Hebrew probing, null them all 0043 for (unsigned int i = 10; i <= 12; ++i) { 0044 delete mProbers[i]; 0045 mProbers[i] = nullptr; 0046 } 0047 } 0048 0049 // disable latin2 before latin1 is available, otherwise all latin1 0050 // will be detected as latin2 because of their similarity. 0051 // mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel); 0052 // mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel); 0053 0054 Reset(); 0055 } 0056 0057 nsSBCSGroupProber::~nsSBCSGroupProber() 0058 { 0059 for (unsigned int i = 0; i < NUM_OF_SBCS_PROBERS; i++) { 0060 delete mProbers[i]; 0061 } 0062 } 0063 0064 const char *nsSBCSGroupProber::GetCharSetName() 0065 { 0066 // if we have no answer yet 0067 if (mBestGuess == -1) { 0068 GetConfidence(); 0069 // no charset seems positive 0070 if (mBestGuess == -1) 0071 // we will use default. 0072 { 0073 mBestGuess = 0; 0074 } 0075 } 0076 return mProbers[mBestGuess]->GetCharSetName(); 0077 } 0078 0079 void nsSBCSGroupProber::Reset(void) 0080 { 0081 mActiveNum = 0; 0082 for (unsigned int i = 0; i < NUM_OF_SBCS_PROBERS; i++) { 0083 if (mProbers[i]) { // not null 0084 mProbers[i]->Reset(); 0085 mIsActive[i] = true; 0086 ++mActiveNum; 0087 } else { 0088 mIsActive[i] = false; 0089 } 0090 } 0091 mBestGuess = -1; 0092 mState = eDetecting; 0093 } 0094 0095 nsProbingState nsSBCSGroupProber::HandleData(const char *aBuf, unsigned int aLen) 0096 { 0097 nsProbingState st; 0098 unsigned int i; 0099 char *newBuf1 = nullptr; 0100 unsigned int newLen1 = 0; 0101 0102 // apply filter to original buffer, and we got new buffer back 0103 // depend on what script it is, we will feed them the new buffer 0104 // we got after applying proper filter 0105 // this is done without any consideration to KeepEnglishLetters 0106 // of each prober since as of now, there are no probers here which 0107 // recognize languages with English characters. 0108 if (!FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) { 0109 goto done; 0110 } 0111 0112 if (newLen1 == 0) { 0113 goto done; // Nothing to see here, move on. 0114 } 0115 0116 for (i = 0; i < NUM_OF_SBCS_PROBERS; ++i) { 0117 if (!mIsActive[i]) { 0118 continue; 0119 } 0120 st = mProbers[i]->HandleData(newBuf1, newLen1); 0121 if (st == eFoundIt) { 0122 mBestGuess = i; 0123 mState = eFoundIt; 0124 break; 0125 } else if (st == eNotMe) { 0126 mIsActive[i] = false; 0127 mActiveNum--; 0128 if (mActiveNum == 0) { 0129 mState = eNotMe; 0130 break; 0131 } 0132 } 0133 } 0134 0135 done: 0136 free(newBuf1); 0137 0138 return mState; 0139 } 0140 0141 float nsSBCSGroupProber::GetConfidence(void) 0142 { 0143 unsigned int i; 0144 float bestConf = 0.0; 0145 float cf; 0146 0147 switch (mState) { 0148 case eFoundIt: 0149 return (float)0.99; // sure yes 0150 case eNotMe: 0151 return (float)0.01; // sure no 0152 default: 0153 for (i = 0; i < NUM_OF_SBCS_PROBERS; ++i) { 0154 if (!mIsActive[i]) { 0155 continue; 0156 } 0157 cf = mProbers[i]->GetConfidence(); 0158 if (bestConf < cf) { 0159 bestConf = cf; 0160 mBestGuess = i; 0161 } 0162 } 0163 } 0164 return bestConf; 0165 } 0166 0167 #ifdef DEBUG_PROBE 0168 void nsSBCSGroupProber::DumpStatus() 0169 { 0170 unsigned int i; 0171 float cf; 0172 0173 cf = GetConfidence(); 0174 printf(" SBCS Group Prober --------begin status \r\n"); 0175 for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) { 0176 if (!mIsActive[i]) { 0177 printf(" inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName()); 0178 } else { 0179 mProbers[i]->DumpStatus(); 0180 } 0181 } 0182 printf(" SBCS Group found best match [%s] confidence %f.\r\n", mProbers[mBestGuess]->GetCharSetName(), cf); 0183 } 0184 #endif 0185 }