File indexing completed on 2024-04-28 03:53:03

0001 /*  -*- C++ -*-
0002     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
0003 
0004     SPDX-License-Identifier: MIT
0005 */
0006 
0007 #include "nsLatin1Prober.h"
0008 #include <stdio.h>
0009 #include <stdlib.h>
0010 
// Character classes produced by Latin1_CharToClass. The numeric values are
// also used as row/column indices into Latin1ClassModel, so they must stay
// contiguous and start at 0.
#define UDF       0 // undefined
#define OTH       1 // other (anything that is not a letter class below)
#define ASC       2 // ASCII capital letter
#define ASS       3 // ASCII small letter
#define ACV       4 // accented capital vowel
#define ACO       5 // accented capital, other than a vowel
#define ASV       6 // accented small vowel
#define ASO       7 // accented small, other than a vowel
#define CLASS_NUM 8 // total number of classes above
0020 
0021 namespace kencodingprober
0022 {
0023 static const unsigned char Latin1_CharToClass[] = {
0024     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
0025     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
0026     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17
0027     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F
0028     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27
0029     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F
0030     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37
0031     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F
0032     OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47
0033     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F
0034     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57
0035     ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F
0036     OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67
0037     ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F
0038     ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77
0039     ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F
0040     OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87
0041     OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F
0042     UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97
0043     OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F
0044     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7
0045     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF
0046     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7
0047     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF
0048     ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7
0049     ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF
0050     ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7
0051     ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF
0052     ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7
0053     ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF
0054     ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7
0055     ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF
0056 };
0057 
0058 /* 0 : illegal
0059    1 : very unlikely
0060    2 : normal
0061    3 : very likely
0062 */
0063 static const unsigned char Latin1ClassModel[] = {
0064     /*      UDF OTH ASC ASS ACV ACO ASV ASO  */
0065     /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
0066     /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3,
0067     /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3,
0068     /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3,
0069     /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2,
0070     /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3,
0071     /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3,
0072     /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3,
0073 };
0074 
0075 void nsLatin1Prober::Reset(void)
0076 {
0077     mState = eDetecting;
0078     mLastCharClass = OTH;
0079     for (int i = 0; i < FREQ_CAT_NUM; i++) {
0080         mFreqCounter[i] = 0;
0081     }
0082 }
0083 
0084 nsProbingState nsLatin1Prober::HandleData(const char *aBuf, unsigned int aLen)
0085 {
0086     char *newBuf1 = nullptr;
0087     unsigned int newLen1 = 0;
0088 
0089     if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
0090         newBuf1 = (char *)aBuf;
0091         newLen1 = aLen;
0092     }
0093 
0094     for (unsigned int i = 0; i < newLen1; i++) {
0095         const unsigned char charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
0096         const unsigned char freq = Latin1ClassModel[mLastCharClass * CLASS_NUM + charClass];
0097         if (freq == 0) {
0098             mState = eNotMe;
0099             break;
0100         }
0101         mFreqCounter[freq]++;
0102         mLastCharClass = charClass;
0103     }
0104 
0105     if (newBuf1 != aBuf) {
0106         free(newBuf1);
0107     }
0108 
0109     return mState;
0110 }
0111 
0112 float nsLatin1Prober::GetConfidence(void)
0113 {
0114     if (mState == eNotMe) {
0115         return 0.01f;
0116     }
0117 
0118     float confidence;
0119     unsigned int total = 0;
0120     for (int i = 0; i < FREQ_CAT_NUM; i++) {
0121         total += mFreqCounter[i];
0122     }
0123 
0124     if (!total) {
0125         confidence = 0.0f;
0126     } else {
0127         confidence = mFreqCounter[3] * 1.0f / total;
0128         confidence -= mFreqCounter[1] * 20.0f / total;
0129     }
0130 
0131     if (confidence < 0.0f) {
0132         confidence = 0.0f;
0133     }
0134 
0135     // lower the confidence of latin1 so that other more accurate detector
0136     // can take priority.
0137     confidence *= 0.50f;
0138 
0139     return confidence;
0140 }
0141 
#ifdef DEBUG_PROBE
/**
 * Debug helper (only built when DEBUG_PROBE is defined): prints the current
 * confidence and the charset name to stdout.
 */
void nsLatin1Prober::DumpStatus()
{
    const float confidence = GetConfidence();
    const char *charsetName = GetCharSetName();
    printf(" Latin1Prober: %1.3f [%s]\r\n", confidence, charsetName);
}
#endif
0148 }