File indexing completed on 2024-04-28 03:53:02
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> 0003 0004 SPDX-License-Identifier: MIT 0005 */ 0006 0007 #ifndef CharDistribution_h__ 0008 #define CharDistribution_h__ 0009 0010 #include "kcodecs_export.h" 0011 0012 #include <qcompilerdetection.h> 0013 0014 #define ENOUGH_DATA_THRESHOLD 256 0015 0016 namespace kencodingprober 0017 { 0018 class KCODECS_NO_EXPORT CharDistributionAnalysis 0019 { 0020 public: 0021 CharDistributionAnalysis() 0022 { 0023 Reset(); 0024 } 0025 virtual ~CharDistributionAnalysis() 0026 { 0027 } 0028 0029 // feed a block of data and do distribution analysis 0030 void HandleData(const char * /* aBuf */, unsigned int /* aLen */) 0031 { 0032 } 0033 0034 // Feed a character with known length 0035 void HandleOneChar(const char *aStr, unsigned int aCharLen) 0036 { 0037 int order; 0038 0039 // we only care about 2-bytes character in our distribution analysis 0040 order = (aCharLen == 2) ? GetOrder(aStr) : -1; 0041 0042 if (order >= 0) { 0043 mTotalChars++; 0044 // order is valid 0045 if ((unsigned int)order < mTableSize) { 0046 if (512 > mCharToFreqOrder[order]) { 0047 mFreqChars++; 0048 } 0049 } 0050 } 0051 } 0052 0053 // return confidence base on existing data 0054 float GetConfidence(); 0055 0056 // Reset analyser, clear any state 0057 void Reset(void) 0058 { 0059 mDone = false; 0060 mTotalChars = 0; 0061 mFreqChars = 0; 0062 } 0063 0064 // This function is for future extension. Caller can use this function to control 0065 // analyser's behavior 0066 void SetOpion() 0067 { 0068 } 0069 0070 // It is not necessary to receive all data to draw conclusion. For charset detection, 0071 // certain amount of data is enough 0072 bool GotEnoughData() 0073 { 0074 return mTotalChars > ENOUGH_DATA_THRESHOLD; 0075 } 0076 0077 protected: 0078 // we do not handle character base on its original encoding string, but 0079 // convert this encoding string to a number, here called order. 0080 // This allows multiple encodings of a language to share one frequency table 0081 virtual int GetOrder(const char * /* str */) 0082 { 0083 return -1; 0084 } 0085 0086 // If this flag is set to true, detection is done and conclusion has been made 0087 bool mDone; 0088 0089 // The number of characters whose frequency order is less than 512 0090 unsigned int mFreqChars; 0091 0092 // Total character encountered. 0093 unsigned int mTotalChars; 0094 0095 // Mapping table to get frequency order from char order (get from GetOrder()) 0096 const short *mCharToFreqOrder; 0097 0098 // Size of above table 0099 unsigned int mTableSize; 0100 0101 // This is a constant value varies from language to language, it is used in 0102 // calculating confidence. See my paper for further detail. 0103 float mTypicalDistributionRatio; 0104 }; 0105 0106 class KCODECS_NO_EXPORT EUCKRDistributionAnalysis : public CharDistributionAnalysis 0107 { 0108 public: 0109 EUCKRDistributionAnalysis(); 0110 0111 protected: 0112 // for euc-KR encoding, we are interested 0113 // first byte range: 0xb0 -- 0xfe 0114 // second byte range: 0xa1 -- 0xfe 0115 // no validation needed here. State machine has done that 0116 int GetOrder(const char *str) override 0117 { 0118 if ((unsigned char)*str >= (unsigned char)0xb0) { 0119 return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 0120 } else { 0121 return -1; 0122 } 0123 } 0124 }; 0125 0126 class KCODECS_NO_EXPORT GB2312DistributionAnalysis : public CharDistributionAnalysis 0127 { 0128 public: 0129 GB2312DistributionAnalysis(); 0130 0131 protected: 0132 // for GB2312 encoding, we are interested 0133 // first byte range: 0xb0 -- 0xfe 0134 // second byte range: 0xa1 -- 0xfe 0135 // no validation needed here. State machine has done that 0136 int GetOrder(const char *str) override 0137 { 0138 if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) { 0139 return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 0140 } else { 0141 return -1; 0142 } 0143 } 0144 }; 0145 0146 class KCODECS_NO_EXPORT Big5DistributionAnalysis : public CharDistributionAnalysis 0147 { 0148 public: 0149 Big5DistributionAnalysis(); 0150 0151 protected: 0152 // for big5 encoding, we are interested 0153 // first byte range: 0xa4 -- 0xfe 0154 // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe 0155 // no validation needed here. State machine has done that 0156 int GetOrder(const char *str) override 0157 { 0158 if ((unsigned char)*str >= (unsigned char)0xa4) 0159 if ((unsigned char)str[1] >= (unsigned char)0xa1) { 0160 return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 + 63; 0161 } else { 0162 return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40; 0163 } 0164 else { 0165 return -1; 0166 } 0167 } 0168 }; 0169 0170 class KCODECS_NO_EXPORT SJISDistributionAnalysis : public CharDistributionAnalysis 0171 { 0172 public: 0173 SJISDistributionAnalysis(); 0174 0175 protected: 0176 // for sjis encoding, we are interested 0177 // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe 0178 // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe 0179 // no validation needed here. State machine has done that 0180 int GetOrder(const char *str) override 0181 { 0182 int order; 0183 if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) { 0184 order = 188 * ((unsigned char)str[0] - (unsigned char)0x81); 0185 } else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) { 0186 order = 188 * ((unsigned char)str[0] - (unsigned char)0xe0 + 31); 0187 } else { 0188 return -1; 0189 } 0190 order += (unsigned char)*(str + 1) - 0x40; 0191 if ((unsigned char)str[1] > (unsigned char)0x7f) { 0192 order--; 0193 } 0194 return order; 0195 } 0196 }; 0197 0198 class KCODECS_NO_EXPORT EUCJPDistributionAnalysis : public CharDistributionAnalysis 0199 { 0200 public: 0201 EUCJPDistributionAnalysis(); 0202 0203 protected: 0204 // for euc-JP encoding, we are interested 0205 // first byte range: 0xa0 -- 0xfe 0206 // second byte range: 0xa1 -- 0xfe 0207 // no validation needed here. State machine has done that 0208 int GetOrder(const char *str) override 0209 { 0210 if ((unsigned char)*str >= (unsigned char)0xa0) { 0211 return 94 * ((unsigned char)str[0] - (unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1; 0212 } else { 0213 return -1; 0214 } 0215 } 0216 }; 0217 } 0218 #endif // CharDistribution_h__