File indexing completed on 2024-05-19 07:41:57
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> 0003 0004 SPDX-License-Identifier: MIT 0005 */ 0006 0007 #include "nsSBCharSetProber.h" 0008 0009 #include <stdio.h> 0010 0011 namespace kencodingprober 0012 { 0013 nsProbingState nsSingleByteCharSetProber::HandleData(const char *aBuf, unsigned int aLen) 0014 { 0015 for (unsigned int i = 0; i < aLen; i++) { 0016 const unsigned char order = mModel->charToOrderMap[(unsigned char)aBuf[i]]; 0017 0018 if (order < SYMBOL_CAT_ORDER) { 0019 mTotalChar++; 0020 } 0021 if (order < SAMPLE_SIZE) { 0022 mFreqChar++; 0023 0024 if (mLastOrder < SAMPLE_SIZE) { 0025 mTotalSeqs++; 0026 if (!mReversed) { 0027 ++(mSeqCounters[(int)mModel->precedenceMatrix[mLastOrder * SAMPLE_SIZE + order]]); 0028 } else { // reverse the order of the letters in the lookup 0029 ++(mSeqCounters[(int)mModel->precedenceMatrix[order * SAMPLE_SIZE + mLastOrder]]); 0030 } 0031 } 0032 } 0033 mLastOrder = order; 0034 } 0035 0036 if (mState == eDetecting) { 0037 if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD) { 0038 float cf = GetConfidence(); 0039 if (cf > POSITIVE_SHORTCUT_THRESHOLD) { 0040 mState = eFoundIt; 0041 } else if (cf < NEGATIVE_SHORTCUT_THRESHOLD) { 0042 mState = eNotMe; 0043 } 0044 } 0045 } 0046 0047 return mState; 0048 } 0049 0050 void nsSingleByteCharSetProber::Reset(void) 0051 { 0052 mState = eDetecting; 0053 mLastOrder = 255; 0054 for (unsigned int i = 0; i < NUMBER_OF_SEQ_CAT; i++) { 0055 mSeqCounters[i] = 0; 0056 } 0057 mTotalSeqs = 0; 0058 mTotalChar = 0; 0059 mFreqChar = 0; 0060 } 0061 0062 //#define NEGATIVE_APPROACH 1 0063 0064 float nsSingleByteCharSetProber::GetConfidence(void) 0065 { 0066 #ifdef NEGATIVE_APPROACH 0067 if (mTotalSeqs > 0) 0068 if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT] * 10) { 0069 return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT] * 10)) / mTotalSeqs * mFreqChar / mTotalChar; 0070 } 0071 return (float)0.01; 0072 #else // POSITIVE_APPROACH 0073 float r; 0074 0075 if (mTotalSeqs > 0) { 0076 r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio; 0077 r = r * mFreqChar / mTotalChar; 0078 if (r >= (float)1.00) { 0079 r = (float)0.99; 0080 } 0081 return r; 0082 } 0083 return (float)0.01; 0084 #endif 0085 } 0086 0087 const char *nsSingleByteCharSetProber::GetCharSetName() 0088 { 0089 if (!mNameProber) { 0090 return mModel->charsetName; 0091 } 0092 return mNameProber->GetCharSetName(); 0093 } 0094 0095 #ifdef DEBUG_PROBE 0096 void nsSingleByteCharSetProber::DumpStatus() 0097 { 0098 printf(" SBCS: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); 0099 } 0100 #endif 0101 }