// File indexing completed on 2024-12-22 04:33:40
0001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 0002 /* ***** BEGIN LICENSE BLOCK ***** 0003 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 0004 * 0005 * The contents of this file are subject to the Mozilla Public License Version 0006 * 1.1 (the "License"); you may not use this file except in compliance with 0007 * the License. You may obtain a copy of the License at 0008 * http://www.mozilla.org/MPL/ 0009 * 0010 * Software distributed under the License is distributed on an "AS IS" basis, 0011 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 0012 * for the specific language governing rights and limitations under the 0013 * License. 0014 * 0015 * The Original Code is Mozilla Communicator client code. 0016 * 0017 * The Initial Developer of the Original Code is 0018 * Netscape Communications Corporation. 0019 * Portions created by the Initial Developer are Copyright (C) 1998 0020 * the Initial Developer. All Rights Reserved. 0021 * 0022 * Contributor(s): 0023 * 0024 * Alternatively, the contents of this file may be used under the terms of 0025 * either the GNU General Public License Version 2 or later (the "GPL"), or 0026 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 0027 * in which case the provisions of the GPL or the LGPL are applicable instead 0028 * of those above. If you wish to allow use of your version of this file only 0029 * under the terms of either the GPL or the LGPL, and not to allow others to 0030 * use your version of this file under the terms of the MPL, indicate your 0031 * decision by deleting the provisions above and replace them with the notice 0032 * and other provisions required by the GPL or the LGPL. If you do not delete 0033 * the provisions above, a recipient may use your version of this file under 0034 * the terms of any one of the MPL, the GPL or the LGPL. 
0035 * 0036 * ***** END LICENSE BLOCK ***** */ 0037 0038 #ifndef CharDistribution_h__ 0039 #define CharDistribution_h__ 0040 0041 #include "nscore.h" 0042 0043 #define ENOUGH_DATA_THRESHOLD 1024 0044 0045 class CharDistributionAnalysis 0046 { 0047 public: 0048 CharDistributionAnalysis() : mCharToFreqOrder(0), mTableSize(0) 0049 , mTypicalDistributionRatio(0.0) {Reset();} 0050 virtual ~CharDistributionAnalysis() {} 0051 0052 //feed a block of data and do distribution analysis 0053 void HandleData(const char* aBuf, PRUint32 aLen) {(void)aBuf; (void)aLen;} 0054 0055 //Feed a character with known length 0056 void HandleOneChar(const char* aStr, PRUint32 aCharLen) 0057 { 0058 PRInt32 order; 0059 0060 //we only care about 2-bytes character in our distribution analysis 0061 order = (aCharLen == 2) ? GetOrder(aStr) : -1; 0062 0063 if (order >= 0) 0064 { 0065 mTotalChars++; 0066 //order is valid 0067 if ((PRUint32)order < mTableSize) 0068 { 0069 if (512 > mCharToFreqOrder[order]) 0070 mFreqChars++; 0071 } 0072 } 0073 }; 0074 0075 //return confidence base on existing data 0076 float GetConfidence(); 0077 0078 //Reset analyser, clear any state 0079 void Reset(void) 0080 { 0081 mDone = PR_FALSE; 0082 mTotalChars = 0; 0083 mFreqChars = 0; 0084 }; 0085 0086 //This function is for future extension. Caller can use this function to control 0087 //analyser's behavior 0088 void SetOpion(){} 0089 0090 //It is not necessary to receive all data to draw conclusion. For charset detection, 0091 // certain amount of data is enough 0092 PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}; 0093 0094 protected: 0095 //we do not handle character base on its original encoding string, but 0096 //convert this encoding string to a number, here called order. 
0097 //This allow multiple encoding of a language to share one frequency table 0098 virtual PRInt32 GetOrder(const char* str) {(void)str; return -1;}; 0099 0100 //If this flag is set to PR_TRUE, detection is done and conclusion has been made 0101 PRBool mDone; 0102 0103 //The number of characters whose frequency order is less than 512 0104 PRUint32 mFreqChars; 0105 0106 //Total character encounted. 0107 PRUint32 mTotalChars; 0108 0109 //Mapping table to get frequency order from char order (get from GetOrder()) 0110 const PRInt16 *mCharToFreqOrder; 0111 0112 //Size of above table 0113 PRUint32 mTableSize; 0114 0115 //This is a constant value varies from language to language, it is used in 0116 //calculating confidence. See my paper for further detail. 0117 float mTypicalDistributionRatio; 0118 }; 0119 0120 0121 class EUCTWDistributionAnalysis: public CharDistributionAnalysis 0122 { 0123 public: 0124 EUCTWDistributionAnalysis(); 0125 protected: 0126 0127 //for euc-TW encoding, we are interested 0128 // first byte range: 0xc4 -- 0xfe 0129 // second byte range: 0xa1 -- 0xfe 0130 //no validation needed here. State machine has done that 0131 PRInt32 GetOrder(const char* str) 0132 { if ((unsigned char)*str >= (unsigned char)0xc4) 0133 return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1; 0134 else 0135 return -1; 0136 }; 0137 }; 0138 0139 0140 class EUCKRDistributionAnalysis : public CharDistributionAnalysis 0141 { 0142 public: 0143 EUCKRDistributionAnalysis(); 0144 protected: 0145 //for euc-KR encoding, we are interested 0146 // first byte range: 0xb0 -- 0xfe 0147 // second byte range: 0xa1 -- 0xfe 0148 //no validation needed here. 
State machine has done that 0149 PRInt32 GetOrder(const char* str) 0150 { if ((unsigned char)*str >= (unsigned char)0xb0) 0151 return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 0152 else 0153 return -1; 0154 }; 0155 }; 0156 0157 class GB2312DistributionAnalysis : public CharDistributionAnalysis 0158 { 0159 public: 0160 GB2312DistributionAnalysis(); 0161 protected: 0162 //for GB2312 encoding, we are interested 0163 // first byte range: 0xb0 -- 0xfe 0164 // second byte range: 0xa1 -- 0xfe 0165 //no validation needed here. State machine has done that 0166 PRInt32 GetOrder(const char* str) 0167 { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) 0168 return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 0169 else 0170 return -1; 0171 }; 0172 }; 0173 0174 0175 class Big5DistributionAnalysis : public CharDistributionAnalysis 0176 { 0177 public: 0178 Big5DistributionAnalysis(); 0179 protected: 0180 //for big5 encoding, we are interested 0181 // first byte range: 0xa4 -- 0xfe 0182 // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe 0183 //no validation needed here. State machine has done that 0184 PRInt32 GetOrder(const char* str) 0185 { if ((unsigned char)*str >= (unsigned char)0xa4) 0186 if ((unsigned char)str[1] >= (unsigned char)0xa1) 0187 return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63; 0188 else 0189 return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40; 0190 else 0191 return -1; 0192 }; 0193 }; 0194 0195 class SJISDistributionAnalysis : public CharDistributionAnalysis 0196 { 0197 public: 0198 SJISDistributionAnalysis(); 0199 protected: 0200 //for sjis encoding, we are interested 0201 // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe 0202 // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe 0203 //no validation needed here. 
State machine has done that 0204 PRInt32 GetOrder(const char* str) 0205 { 0206 PRInt32 order; 0207 if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) 0208 order = 188 * ((unsigned char)str[0]-(unsigned char)0x81); 0209 else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) 0210 order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31); 0211 else 0212 return -1; 0213 order += (unsigned char)*(str+1) - 0x40; 0214 if ((unsigned char)str[1] > (unsigned char)0x7f) 0215 order--; 0216 return order; 0217 }; 0218 }; 0219 0220 class EUCJPDistributionAnalysis : public CharDistributionAnalysis 0221 { 0222 public: 0223 EUCJPDistributionAnalysis(); 0224 protected: 0225 //for euc-JP encoding, we are interested 0226 // first byte range: 0xa0 -- 0xfe 0227 // second byte range: 0xa1 -- 0xfe 0228 //no validation needed here. State machine has done that 0229 PRInt32 GetOrder(const char* str) 0230 { if ((unsigned char)*str >= (unsigned char)0xa0) 0231 return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1; 0232 else 0233 return -1; 0234 }; 0235 }; 0236 0237 #endif //CharDistribution_h__ 0238