File indexing completed on 2024-04-28 03:53:02
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> 0003 0004 SPDX-License-Identifier: MIT 0005 */ 0006 0007 #include "JapaneseGroupProber.h" 0008 0009 #include <stdio.h> 0010 #include <stdlib.h> 0011 0012 namespace kencodingprober 0013 { 0014 #ifdef DEBUG_PROBE 0015 static const char *const ProberName[] = { 0016 "Unicode", 0017 "GB18030", 0018 "Big5", 0019 }; 0020 0021 #endif 0022 0023 JapaneseGroupProber::JapaneseGroupProber() 0024 { 0025 mProbers[0] = new UnicodeGroupProber(); 0026 mProbers[1] = new nsSJISProber(); 0027 mProbers[2] = new nsEUCJPProber(); 0028 Reset(); 0029 } 0030 0031 JapaneseGroupProber::~JapaneseGroupProber() 0032 { 0033 for (unsigned int i = 0; i < JP_NUM_OF_PROBERS; i++) { 0034 delete mProbers[i]; 0035 } 0036 } 0037 0038 const char *JapaneseGroupProber::GetCharSetName() 0039 { 0040 if (mBestGuess == -1) { 0041 GetConfidence(); 0042 if (mBestGuess == -1) { 0043 mBestGuess = 1; // assume it's GB18030 0044 } 0045 } 0046 return mProbers[mBestGuess]->GetCharSetName(); 0047 } 0048 0049 void JapaneseGroupProber::Reset(void) 0050 { 0051 mActiveNum = 0; 0052 for (unsigned int i = 0; i < JP_NUM_OF_PROBERS; i++) { 0053 if (mProbers[i]) { 0054 mProbers[i]->Reset(); 0055 mIsActive[i] = true; 0056 ++mActiveNum; 0057 } else { 0058 mIsActive[i] = false; 0059 } 0060 } 0061 mBestGuess = -1; 0062 mState = eDetecting; 0063 } 0064 0065 nsProbingState JapaneseGroupProber::HandleData(const char *aBuf, unsigned int aLen) 0066 { 0067 nsProbingState st; 0068 unsigned int i; 0069 0070 // do filtering to reduce load to probers 0071 char *highbyteBuf; 0072 char *hptr; 0073 bool keepNext = true; // assume previous is not ascii, it will do no harm except add some noise 0074 hptr = highbyteBuf = (char *)malloc(aLen); 0075 if (!hptr) { 0076 return mState; 0077 } 0078 for (i = 0; i < aLen; ++i) { 0079 if (aBuf[i] & 0x80) { 0080 *hptr++ = aBuf[i]; 0081 keepNext = true; 0082 } else { 0083 // if previous is highbyte, keep this even it is a ASCII 0084 if (keepNext) { 0085 *hptr++ = aBuf[i]; 0086 keepNext = false; 0087 } 0088 } 0089 } 0090 0091 for (i = 0; i < JP_NUM_OF_PROBERS; ++i) { 0092 if (!mIsActive[i]) { 0093 continue; 0094 } 0095 st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf); 0096 if (st == eFoundIt) { 0097 mBestGuess = i; 0098 mState = eFoundIt; 0099 break; 0100 } else if (st == eNotMe) { 0101 mIsActive[i] = false; 0102 --mActiveNum; 0103 if (mActiveNum == 0) { 0104 mState = eNotMe; 0105 break; 0106 } 0107 } 0108 } 0109 0110 free(highbyteBuf); 0111 0112 return mState; 0113 } 0114 0115 float JapaneseGroupProber::GetConfidence(void) 0116 { 0117 unsigned int i; 0118 float bestConf = 0.0; 0119 float cf; 0120 0121 switch (mState) { 0122 case eFoundIt: 0123 return (float)0.99; 0124 case eNotMe: 0125 return (float)0.01; 0126 default: 0127 for (i = 0; i < JP_NUM_OF_PROBERS; ++i) { 0128 if (!mIsActive[i]) { 0129 continue; 0130 } 0131 cf = mProbers[i]->GetConfidence(); 0132 if (bestConf < cf) { 0133 bestConf = cf; 0134 mBestGuess = i; 0135 } 0136 } 0137 } 0138 return bestConf; 0139 } 0140 0141 #ifdef DEBUG_PROBE 0142 void JapaneseGroupProber::DumpStatus() 0143 { 0144 unsigned int i; 0145 float cf; 0146 0147 GetConfidence(); 0148 for (i = 0; i < JP_NUM_OF_PROBERS; i++) { 0149 if (!mIsActive[i]) { 0150 printf(" Chinese group inactive: [%s] (confidence is too low).\r\n", ProberName[i]); 0151 } else { 0152 cf = mProbers[i]->GetConfidence(); 0153 printf(" Chinese group %1.3f: [%s]\r\n", cf, ProberName[i]); 0154 } 0155 } 0156 } 0157 #endif 0158 }