File indexing completed on 2024-04-28 03:53:02

0001 /*  -*- C++ -*-
0002     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
0003 
0004     SPDX-License-Identifier: MIT
0005 */
0006 
0007 #include "ChineseGroupProber.h"
0008 
0009 #include "UnicodeGroupProber.h"
0010 #include "nsBig5Prober.h"
0011 #include "nsGB2312Prober.h"
0012 
0013 #include <stdio.h>
0014 #include <stdlib.h>
0015 
0016 namespace kencodingprober
0017 {
0018 #ifdef DEBUG_PROBE
// Labels printed by DumpStatus(); entry i names the prober stored in mProbers[i],
// so the order here must follow the order in which the constructor fills the slots.
static const char *const ProberName[] = {"Unicode", "GB18030", "Big5"};
0024 
0025 #endif
0026 
0027 ChineseGroupProber::ChineseGroupProber()
0028 {
0029     mProbers[0] = new UnicodeGroupProber();
0030     mProbers[1] = new nsGB18030Prober();
0031     mProbers[2] = new nsBig5Prober();
0032     Reset();
0033 }
0034 
0035 ChineseGroupProber::~ChineseGroupProber()
0036 {
0037     for (unsigned int i = 0; i < CN_NUM_OF_PROBERS; i++) {
0038         delete mProbers[i];
0039     }
0040 }
0041 
0042 const char *ChineseGroupProber::GetCharSetName()
0043 {
0044     if (mBestGuess == -1) {
0045         GetConfidence();
0046         if (mBestGuess == -1) {
0047             mBestGuess = 1; // assume it's GB18030
0048         }
0049     }
0050     return mProbers[mBestGuess]->GetCharSetName();
0051 }
0052 
0053 void ChineseGroupProber::Reset(void)
0054 {
0055     mActiveNum = 0;
0056     for (unsigned int i = 0; i < CN_NUM_OF_PROBERS; i++) {
0057         if (mProbers[i]) {
0058             mProbers[i]->Reset();
0059             mIsActive[i] = true;
0060             ++mActiveNum;
0061         } else {
0062             mIsActive[i] = false;
0063         }
0064     }
0065     mBestGuess = -1;
0066     mState = eDetecting;
0067 }
0068 
0069 nsProbingState ChineseGroupProber::HandleData(const char *aBuf, unsigned int aLen)
0070 {
0071     nsProbingState st;
0072     unsigned int i;
0073 
0074     // do filtering to reduce load to probers
0075     char *highbyteBuf;
0076     char *hptr;
0077     bool keepNext = true; // assume previous is not ascii, it will do no harm except add some noise
0078     hptr = highbyteBuf = (char *)malloc(aLen);
0079     if (!hptr) {
0080         return mState;
0081     }
0082     for (i = 0; i < aLen; ++i) {
0083         if (aBuf[i] & 0x80) {
0084             *hptr++ = aBuf[i];
0085             keepNext = true;
0086         } else {
0087             // if previous is highbyte, keep this even it is an ASCII
0088             if (keepNext) {
0089                 *hptr++ = aBuf[i];
0090                 keepNext = false;
0091             }
0092         }
0093     }
0094 
0095     for (i = 0; i < CN_NUM_OF_PROBERS; ++i) {
0096         if (!mIsActive[i]) {
0097             continue;
0098         }
0099         st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf);
0100         if (st == eFoundIt) {
0101             mBestGuess = i;
0102             mState = eFoundIt;
0103             break;
0104         } else if (st == eNotMe) {
0105             mIsActive[i] = false;
0106             --mActiveNum;
0107             if (mActiveNum == 0) {
0108                 mState = eNotMe;
0109                 break;
0110             }
0111         }
0112     }
0113 
0114     free(highbyteBuf);
0115 
0116     return mState;
0117 }
0118 
0119 float ChineseGroupProber::GetConfidence(void)
0120 {
0121     unsigned int i;
0122     float bestConf = 0.0;
0123     float cf;
0124 
0125     switch (mState) {
0126     case eFoundIt:
0127         return (float)0.99;
0128     case eNotMe:
0129         return (float)0.01;
0130     default:
0131         for (i = 0; i < CN_NUM_OF_PROBERS; ++i) {
0132             if (!mIsActive[i]) {
0133                 continue;
0134             }
0135             cf = mProbers[i]->GetConfidence();
0136             if (bestConf < cf) {
0137                 bestConf = cf;
0138                 mBestGuess = i;
0139             }
0140         }
0141     }
0142     return bestConf;
0143 }
0144 
0145 #ifdef DEBUG_PROBE
0146 void ChineseGroupProber::DumpStatus()
0147 {
0148     unsigned int i;
0149     float cf;
0150 
0151     GetConfidence();
0152     for (i = 0; i < CN_NUM_OF_PROBERS; i++) {
0153         if (!mIsActive[i]) {
0154             printf("  Chinese group inactive: [%s] (confidence is too low).\r\n", ProberName[i]);
0155         } else {
0156             cf = mProbers[i]->GetConfidence();
0157             printf("  Chinese group %1.3f: [%s]\r\n", cf, ProberName[i]);
0158         }
0159     }
0160 }
0161 #endif
0162 }