File indexing completed on 2024-05-05 16:05:50
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> 0003 0004 SPDX-License-Identifier: MIT 0005 */ 0006 0007 // for japanese encoding, observe characteristic: 0008 // 1, kana character (or hankaku?) often have high frequency of appearance 0009 // 2, kana character often exist in group 0010 // 3, certain combination of kana is never used in japanese language 0011 0012 #include "nsEUCJPProber.h" 0013 0014 namespace kencodingprober 0015 { 0016 void nsEUCJPProber::Reset(void) 0017 { 0018 mCodingSM->Reset(); 0019 mState = eDetecting; 0020 mContextAnalyser.Reset(); 0021 mDistributionAnalyser.Reset(); 0022 } 0023 0024 nsProbingState nsEUCJPProber::HandleData(const char *aBuf, unsigned int aLen) 0025 { 0026 if (aLen == 0) { 0027 return mState; 0028 } 0029 0030 for (unsigned int i = 0; i < aLen; i++) { 0031 const nsSMState codingState = mCodingSM->NextState(aBuf[i]); 0032 if (codingState == eError) { 0033 mState = eNotMe; 0034 break; 0035 } 0036 if (codingState == eItsMe) { 0037 mState = eFoundIt; 0038 break; 0039 } 0040 if (codingState == eStart) { 0041 unsigned int charLen = mCodingSM->GetCurrentCharLen(); 0042 0043 if (i == 0) { 0044 mLastChar[1] = aBuf[0]; 0045 mContextAnalyser.HandleOneChar(mLastChar, charLen); 0046 mDistributionAnalyser.HandleOneChar(mLastChar, charLen); 0047 } else { 0048 mContextAnalyser.HandleOneChar(aBuf + i - 1, charLen); 0049 mDistributionAnalyser.HandleOneChar(aBuf + i - 1, charLen); 0050 } 0051 } 0052 } 0053 0054 mLastChar[0] = aBuf[aLen - 1]; 0055 0056 if (mState == eDetecting) { 0057 if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) { 0058 mState = eFoundIt; 0059 } 0060 } 0061 0062 return mState; 0063 } 0064 0065 float nsEUCJPProber::GetConfidence(void) 0066 { 0067 float contxtCf = mContextAnalyser.GetConfidence(); 0068 float distribCf = mDistributionAnalyser.GetConfidence(); 0069 0070 return (contxtCf > distribCf ? contxtCf : distribCf); 0071 } 0072 }