File indexing completed on 2024-04-28 03:53:04

0001 /*  -*- C++ -*-
0002     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
0003 
0004     SPDX-License-Identifier: MIT
0005 */
0006 
0007 #include "nsSBCharSetProber.h"
0008 
0009 #include <stdio.h>
0010 
0011 namespace kencodingprober
0012 {
0013 nsProbingState nsSingleByteCharSetProber::HandleData(const char *aBuf, unsigned int aLen)
0014 {
0015     for (unsigned int i = 0; i < aLen; i++) {
0016         const unsigned char order = mModel->charToOrderMap[(unsigned char)aBuf[i]];
0017 
0018         if (order < SYMBOL_CAT_ORDER) {
0019             mTotalChar++;
0020         }
0021         if (order < SAMPLE_SIZE) {
0022             mFreqChar++;
0023 
0024             if (mLastOrder < SAMPLE_SIZE) {
0025                 mTotalSeqs++;
0026                 if (!mReversed) {
0027                     ++(mSeqCounters[(int)mModel->precedenceMatrix[mLastOrder * SAMPLE_SIZE + order]]);
0028                 } else { // reverse the order of the letters in the lookup
0029                     ++(mSeqCounters[(int)mModel->precedenceMatrix[order * SAMPLE_SIZE + mLastOrder]]);
0030                 }
0031             }
0032         }
0033         mLastOrder = order;
0034     }
0035 
0036     if (mState == eDetecting) {
0037         if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD) {
0038             float cf = GetConfidence();
0039             if (cf > POSITIVE_SHORTCUT_THRESHOLD) {
0040                 mState = eFoundIt;
0041             } else if (cf < NEGATIVE_SHORTCUT_THRESHOLD) {
0042                 mState = eNotMe;
0043             }
0044         }
0045     }
0046 
0047     return mState;
0048 }
0049 
0050 void nsSingleByteCharSetProber::Reset(void)
0051 {
0052     mState = eDetecting;
0053     mLastOrder = 255;
0054     for (unsigned int i = 0; i < NUMBER_OF_SEQ_CAT; i++) {
0055         mSeqCounters[i] = 0;
0056     }
0057     mTotalSeqs = 0;
0058     mTotalChar = 0;
0059     mFreqChar = 0;
0060 }
0061 
0062 //#define NEGATIVE_APPROACH 1
0063 
0064 float nsSingleByteCharSetProber::GetConfidence(void)
0065 {
0066 #ifdef NEGATIVE_APPROACH
0067     if (mTotalSeqs > 0)
0068         if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT] * 10) {
0069             return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT] * 10)) / mTotalSeqs * mFreqChar / mTotalChar;
0070         }
0071     return (float)0.01;
0072 #else // POSITIVE_APPROACH
0073     float r;
0074 
0075     if (mTotalSeqs > 0) {
0076         r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio;
0077         r = r * mFreqChar / mTotalChar;
0078         if (r >= (float)1.00) {
0079             r = (float)0.99;
0080         }
0081         return r;
0082     }
0083     return (float)0.01;
0084 #endif
0085 }
0086 
0087 const char *nsSingleByteCharSetProber::GetCharSetName()
0088 {
0089     if (!mNameProber) {
0090         return mModel->charsetName;
0091     }
0092     return mNameProber->GetCharSetName();
0093 }
0094 
0095 #ifdef DEBUG_PROBE
0096 void nsSingleByteCharSetProber::DumpStatus()
0097 {
0098     printf("  SBCS: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName());
0099 }
0100 #endif
0101 }