File indexing completed on 2024-04-28 03:53:03
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> 0003 0004 SPDX-License-Identifier: MIT 0005 */ 0006 0007 #include "nsLatin1Prober.h" 0008 #include <stdio.h> 0009 #include <stdlib.h> 0010 0011 #define UDF 0 // undefined 0012 #define OTH 1 // other 0013 #define ASC 2 // ascii capital letter 0014 #define ASS 3 // ascii small letter 0015 #define ACV 4 // accent capital vowel 0016 #define ACO 5 // accent capital other 0017 #define ASV 6 // accent small vowel 0018 #define ASO 7 // accent small other 0019 #define CLASS_NUM 8 // total classes 0020 0021 namespace kencodingprober 0022 { 0023 static const unsigned char Latin1_CharToClass[] = { 0024 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07 0025 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F 0026 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17 0027 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F 0028 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27 0029 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F 0030 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37 0031 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F 0032 OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47 0033 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F 0034 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57 0035 ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F 0036 OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67 0037 ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F 0038 ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77 0039 ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F 0040 OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87 0041 OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F 0042 UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97 0043 OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F 0044 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7 0045 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF 0046 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7 0047 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF 0048 ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7 0049 ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF 0050 ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7 0051 ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF 0052 ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7 0053 ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF 0054 ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7 0055 ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF 0056 }; 0057 0058 /* 0 : illegal 0059 1 : very unlikely 0060 2 : normal 0061 3 : very likely 0062 */ 0063 static const unsigned char Latin1ClassModel[] = { 0064 /* UDF OTH ASC ASS ACV ACO ASV ASO */ 0065 /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0, 0066 /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3, 0067 /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3, 0068 /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3, 0069 /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2, 0070 /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3, 0071 /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3, 0072 /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3, 0073 }; 0074 0075 void nsLatin1Prober::Reset(void) 0076 { 0077 mState = eDetecting; 0078 mLastCharClass = OTH; 0079 for (int i = 0; i < FREQ_CAT_NUM; i++) { 0080 mFreqCounter[i] = 0; 0081 } 0082 } 0083 0084 nsProbingState nsLatin1Prober::HandleData(const char *aBuf, unsigned int aLen) 0085 { 0086 char *newBuf1 = nullptr; 0087 unsigned int newLen1 = 0; 0088 0089 if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) { 0090 newBuf1 = (char *)aBuf; 0091 newLen1 = aLen; 0092 } 0093 0094 for (unsigned int i = 0; i < newLen1; i++) { 0095 const unsigned char charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]]; 0096 const unsigned char freq = Latin1ClassModel[mLastCharClass * CLASS_NUM + charClass]; 0097 if (freq == 0) { 0098 mState = eNotMe; 0099 break; 0100 } 0101 mFreqCounter[freq]++; 0102 mLastCharClass = charClass; 0103 } 0104 0105 if (newBuf1 != aBuf) { 0106 free(newBuf1); 0107 } 0108 0109 return mState; 0110 } 0111 0112 float nsLatin1Prober::GetConfidence(void) 0113 { 0114 if (mState == eNotMe) { 0115 return 0.01f; 0116 } 0117 0118 float confidence; 0119 unsigned int total = 0; 0120 for (int i = 0; i < FREQ_CAT_NUM; i++) { 0121 total += mFreqCounter[i]; 0122 } 0123 0124 if (!total) { 0125 confidence = 0.0f; 0126 } else { 0127 confidence = mFreqCounter[3] * 1.0f / total; 0128 confidence -= mFreqCounter[1] * 20.0f / total; 0129 } 0130 0131 if (confidence < 0.0f) { 0132 confidence = 0.0f; 0133 } 0134 0135 // lower the confidence of latin1 so that other more accurate detector 0136 // can take priority. 0137 confidence *= 0.50f; 0138 0139 return confidence; 0140 } 0141 0142 #ifdef DEBUG_PROBE 0143 void nsLatin1Prober::DumpStatus() 0144 { 0145 printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); 0146 } 0147 #endif 0148 }