// File indexing completed on 2024-04-28 03:53:02
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> 0003 0004 SPDX-License-Identifier: MIT 0005 */ 0006 0007 #include "ChineseGroupProber.h" 0008 0009 #include "UnicodeGroupProber.h" 0010 #include "nsBig5Prober.h" 0011 #include "nsGB2312Prober.h" 0012 0013 #include <stdio.h> 0014 #include <stdlib.h> 0015 0016 namespace kencodingprober 0017 { 0018 #ifdef DEBUG_PROBE 0019 static const char *const ProberName[] = { 0020 "Unicode", 0021 "GB18030", 0022 "Big5", 0023 }; 0024 0025 #endif 0026 0027 ChineseGroupProber::ChineseGroupProber() 0028 { 0029 mProbers[0] = new UnicodeGroupProber(); 0030 mProbers[1] = new nsGB18030Prober(); 0031 mProbers[2] = new nsBig5Prober(); 0032 Reset(); 0033 } 0034 0035 ChineseGroupProber::~ChineseGroupProber() 0036 { 0037 for (unsigned int i = 0; i < CN_NUM_OF_PROBERS; i++) { 0038 delete mProbers[i]; 0039 } 0040 } 0041 0042 const char *ChineseGroupProber::GetCharSetName() 0043 { 0044 if (mBestGuess == -1) { 0045 GetConfidence(); 0046 if (mBestGuess == -1) { 0047 mBestGuess = 1; // assume it's GB18030 0048 } 0049 } 0050 return mProbers[mBestGuess]->GetCharSetName(); 0051 } 0052 0053 void ChineseGroupProber::Reset(void) 0054 { 0055 mActiveNum = 0; 0056 for (unsigned int i = 0; i < CN_NUM_OF_PROBERS; i++) { 0057 if (mProbers[i]) { 0058 mProbers[i]->Reset(); 0059 mIsActive[i] = true; 0060 ++mActiveNum; 0061 } else { 0062 mIsActive[i] = false; 0063 } 0064 } 0065 mBestGuess = -1; 0066 mState = eDetecting; 0067 } 0068 0069 nsProbingState ChineseGroupProber::HandleData(const char *aBuf, unsigned int aLen) 0070 { 0071 nsProbingState st; 0072 unsigned int i; 0073 0074 // do filtering to reduce load to probers 0075 char *highbyteBuf; 0076 char *hptr; 0077 bool keepNext = true; // assume previous is not ascii, it will do no harm except add some noise 0078 hptr = highbyteBuf = (char *)malloc(aLen); 0079 if (!hptr) { 0080 return mState; 0081 } 0082 for (i = 0; i < aLen; ++i) { 0083 if (aBuf[i] & 
0x80) { 0084 *hptr++ = aBuf[i]; 0085 keepNext = true; 0086 } else { 0087 // if previous is highbyte, keep this even it is an ASCII 0088 if (keepNext) { 0089 *hptr++ = aBuf[i]; 0090 keepNext = false; 0091 } 0092 } 0093 } 0094 0095 for (i = 0; i < CN_NUM_OF_PROBERS; ++i) { 0096 if (!mIsActive[i]) { 0097 continue; 0098 } 0099 st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf); 0100 if (st == eFoundIt) { 0101 mBestGuess = i; 0102 mState = eFoundIt; 0103 break; 0104 } else if (st == eNotMe) { 0105 mIsActive[i] = false; 0106 --mActiveNum; 0107 if (mActiveNum == 0) { 0108 mState = eNotMe; 0109 break; 0110 } 0111 } 0112 } 0113 0114 free(highbyteBuf); 0115 0116 return mState; 0117 } 0118 0119 float ChineseGroupProber::GetConfidence(void) 0120 { 0121 unsigned int i; 0122 float bestConf = 0.0; 0123 float cf; 0124 0125 switch (mState) { 0126 case eFoundIt: 0127 return (float)0.99; 0128 case eNotMe: 0129 return (float)0.01; 0130 default: 0131 for (i = 0; i < CN_NUM_OF_PROBERS; ++i) { 0132 if (!mIsActive[i]) { 0133 continue; 0134 } 0135 cf = mProbers[i]->GetConfidence(); 0136 if (bestConf < cf) { 0137 bestConf = cf; 0138 mBestGuess = i; 0139 } 0140 } 0141 } 0142 return bestConf; 0143 } 0144 0145 #ifdef DEBUG_PROBE 0146 void ChineseGroupProber::DumpStatus() 0147 { 0148 unsigned int i; 0149 float cf; 0150 0151 GetConfidence(); 0152 for (i = 0; i < CN_NUM_OF_PROBERS; i++) { 0153 if (!mIsActive[i]) { 0154 printf(" Chinese group inactive: [%s] (confidence is too low).\r\n", ProberName[i]); 0155 } else { 0156 cf = mProbers[i]->GetConfidence(); 0157 printf(" Chinese group %1.3f: [%s]\r\n", cf, ProberName[i]); 0158 } 0159 } 0160 } 0161 #endif 0162 }