File indexing completed on 2024-04-28 03:53:04

0001 /*  -*- C++ -*-
0002     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
0003 
0004     SPDX-License-Identifier: MIT
0005 */
0006 
0007 #include "nsSBCSGroupProber.h"
0008 
0009 #include "UnicodeGroupProber.h"
0010 #include "nsHebrewProber.h"
0011 #include "nsSBCharSetProber.h"
0012 
0013 #include <stdio.h>
0014 #include <stdlib.h>
0015 
0016 namespace kencodingprober
0017 {
0018 nsSBCSGroupProber::nsSBCSGroupProber()
0019 {
0020     mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model);
0021     mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel);
0022     mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model);
0023     mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
0024     mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model);
0025     mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model);
0026     mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model);
0027     mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
0028     mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
0029     mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
0030 
0031     nsHebrewProber *hebprober = new nsHebrewProber();
0032     // Notice: Any change in these indexes - 10,11,12 must be reflected
0033     // in the code below as well.
0034     mProbers[10] = hebprober;
0035     mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, false, hebprober); // Logical Hebrew
0036     mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, true, hebprober); // Visual Hebrew
0037     mProbers[13] = new UnicodeGroupProber();
0038 
0039     // Tell the Hebrew prober about the logical and visual probers
0040     if (mProbers[10] && mProbers[11] && mProbers[12]) { // all are not null
0041         hebprober->SetModelProbers(mProbers[11], mProbers[12]);
0042     } else { // One or more is null. avoid any Hebrew probing, null them all
0043         for (unsigned int i = 10; i <= 12; ++i) {
0044             delete mProbers[i];
0045             mProbers[i] = nullptr;
0046         }
0047     }
0048 
0049     // disable latin2 before latin1 is available, otherwise all latin1
0050     // will be detected as latin2 because of their similarity.
0051     // mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
0052     // mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
0053 
0054     Reset();
0055 }
0056 
0057 nsSBCSGroupProber::~nsSBCSGroupProber()
0058 {
0059     for (unsigned int i = 0; i < NUM_OF_SBCS_PROBERS; i++) {
0060         delete mProbers[i];
0061     }
0062 }
0063 
0064 const char *nsSBCSGroupProber::GetCharSetName()
0065 {
0066     // if we have no answer yet
0067     if (mBestGuess == -1) {
0068         GetConfidence();
0069         // no charset seems positive
0070         if (mBestGuess == -1)
0071         // we will use default.
0072         {
0073             mBestGuess = 0;
0074         }
0075     }
0076     return mProbers[mBestGuess]->GetCharSetName();
0077 }
0078 
0079 void nsSBCSGroupProber::Reset(void)
0080 {
0081     mActiveNum = 0;
0082     for (unsigned int i = 0; i < NUM_OF_SBCS_PROBERS; i++) {
0083         if (mProbers[i]) { // not null
0084             mProbers[i]->Reset();
0085             mIsActive[i] = true;
0086             ++mActiveNum;
0087         } else {
0088             mIsActive[i] = false;
0089         }
0090     }
0091     mBestGuess = -1;
0092     mState = eDetecting;
0093 }
0094 
0095 nsProbingState nsSBCSGroupProber::HandleData(const char *aBuf, unsigned int aLen)
0096 {
0097     nsProbingState st;
0098     unsigned int i;
0099     char *newBuf1 = nullptr;
0100     unsigned int newLen1 = 0;
0101 
0102     // apply filter to original buffer, and we got new buffer back
0103     // depend on what script it is, we will feed them the new buffer
0104     // we got after applying proper filter
0105     // this is done without any consideration to KeepEnglishLetters
0106     // of each prober since as of now, there are no probers here which
0107     // recognize languages with English characters.
0108     if (!FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
0109         goto done;
0110     }
0111 
0112     if (newLen1 == 0) {
0113         goto done; // Nothing to see here, move on.
0114     }
0115 
0116     for (i = 0; i < NUM_OF_SBCS_PROBERS; ++i) {
0117         if (!mIsActive[i]) {
0118             continue;
0119         }
0120         st = mProbers[i]->HandleData(newBuf1, newLen1);
0121         if (st == eFoundIt) {
0122             mBestGuess = i;
0123             mState = eFoundIt;
0124             break;
0125         } else if (st == eNotMe) {
0126             mIsActive[i] = false;
0127             mActiveNum--;
0128             if (mActiveNum == 0) {
0129                 mState = eNotMe;
0130                 break;
0131             }
0132         }
0133     }
0134 
0135 done:
0136     free(newBuf1);
0137 
0138     return mState;
0139 }
0140 
0141 float nsSBCSGroupProber::GetConfidence(void)
0142 {
0143     unsigned int i;
0144     float bestConf = 0.0;
0145     float cf;
0146 
0147     switch (mState) {
0148     case eFoundIt:
0149         return (float)0.99; // sure yes
0150     case eNotMe:
0151         return (float)0.01; // sure no
0152     default:
0153         for (i = 0; i < NUM_OF_SBCS_PROBERS; ++i) {
0154             if (!mIsActive[i]) {
0155                 continue;
0156             }
0157             cf = mProbers[i]->GetConfidence();
0158             if (bestConf < cf) {
0159                 bestConf = cf;
0160                 mBestGuess = i;
0161             }
0162         }
0163     }
0164     return bestConf;
0165 }
0166 
#ifdef DEBUG_PROBE
// Debug helper: prints the status of every child prober followed by the
// group's current best match and confidence.
void nsSBCSGroupProber::DumpStatus()
{
    // GetConfidence() may also update mBestGuess as a side effect.
    const float cf = GetConfidence();

    printf(" SBCS Group Prober --------begin status \r\n");
    for (unsigned int i = 0; i < NUM_OF_SBCS_PROBERS; ++i) {
        // The constructor may null out slots and Reset() treats null slots
        // as inactive, so a slot must be checked before it is dereferenced.
        if (!mProbers[i]) {
            continue;
        }

        if (!mIsActive[i]) {
            printf("  inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName());
        } else {
            mProbers[i]->DumpStatus();
        }
    }

    // mBestGuess can still be -1 here: GetConfidence() returns early in the
    // eNotMe state and never assigns it when no child reports a confidence
    // above 0. Indexing mProbers with -1 would read out of bounds.
    const char *bestName = "(none)";
    if (mBestGuess >= 0 && mProbers[mBestGuess]) {
        bestName = mProbers[mBestGuess]->GetCharSetName();
    }
    printf(" SBCS Group found best match [%s] confidence %f.\r\n", bestName, cf);
}
#endif
0185 }