File indexing completed on 2024-12-22 04:33:40

0001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
0002 /* ***** BEGIN LICENSE BLOCK *****
0003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
0004  *
0005  * The contents of this file are subject to the Mozilla Public License Version
0006  * 1.1 (the "License"); you may not use this file except in compliance with
0007  * the License. You may obtain a copy of the License at
0008  * http://www.mozilla.org/MPL/
0009  *
0010  * Software distributed under the License is distributed on an "AS IS" basis,
0011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
0012  * for the specific language governing rights and limitations under the
0013  * License.
0014  *
0015  * The Original Code is Mozilla Communicator client code.
0016  *
0017  * The Initial Developer of the Original Code is
0018  * Netscape Communications Corporation.
0019  * Portions created by the Initial Developer are Copyright (C) 1998
0020  * the Initial Developer. All Rights Reserved.
0021  *
0022  * Contributor(s):
0023  *
0024  * Alternatively, the contents of this file may be used under the terms of
0025  * either the GNU General Public License Version 2 or later (the "GPL"), or
0026  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
0027  * in which case the provisions of the GPL or the LGPL are applicable instead
0028  * of those above. If you wish to allow use of your version of this file only
0029  * under the terms of either the GPL or the LGPL, and not to allow others to
0030  * use your version of this file under the terms of the MPL, indicate your
0031  * decision by deleting the provisions above and replace them with the notice
0032  * and other provisions required by the GPL or the LGPL. If you do not delete
0033  * the provisions above, a recipient may use your version of this file under
0034  * the terms of any one of the MPL, the GPL or the LGPL.
0035  *
0036  * ***** END LICENSE BLOCK ***** */
0037 
0038 #ifndef CharDistribution_h__
0039 #define CharDistribution_h__
0040 
0041 #include "nscore.h"
0042 
0043 #define ENOUGH_DATA_THRESHOLD 1024
0044  
0045 class CharDistributionAnalysis
0046 {
0047 public:
0048   CharDistributionAnalysis() : mCharToFreqOrder(0), mTableSize(0)
0049     , mTypicalDistributionRatio(0.0) {Reset();}
0050   virtual ~CharDistributionAnalysis() {}
0051 
0052   //feed a block of data and do distribution analysis
0053   void HandleData(const char* aBuf, PRUint32 aLen) {(void)aBuf; (void)aLen;}
0054   
0055   //Feed a character with known length
0056   void HandleOneChar(const char* aStr, PRUint32 aCharLen)
0057   {
0058     PRInt32 order;
0059 
0060     //we only care about 2-bytes character in our distribution analysis
0061     order = (aCharLen == 2) ? GetOrder(aStr) : -1;
0062 
0063     if (order >= 0)
0064     {
0065       mTotalChars++;
0066       //order is valid
0067       if ((PRUint32)order < mTableSize)
0068       {
0069         if (512 > mCharToFreqOrder[order])
0070           mFreqChars++;
0071       }
0072     }
0073   };
0074 
0075   //return confidence base on existing data
0076   float GetConfidence();
0077 
0078   //Reset analyser, clear any state 
0079   void      Reset(void) 
0080   {
0081     mDone = PR_FALSE;
0082     mTotalChars = 0;
0083     mFreqChars = 0;
0084   };
0085 
0086   //This function is for future extension. Caller can use this function to control
0087   //analyser's behavior
0088   void      SetOpion(){}
0089 
0090   //It is not necessary to receive all data to draw conclusion. For charset detection,
0091   // certain amount of data is enough
0092   PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;};
0093 
0094 protected:
0095   //we do not handle character base on its original encoding string, but 
0096   //convert this encoding string to a number, here called order.
0097   //This allow multiple encoding of a language to share one frequency table 
0098   virtual PRInt32 GetOrder(const char* str) {(void)str; return -1;};
0099   
0100   //If this flag is set to PR_TRUE, detection is done and conclusion has been made
0101   PRBool   mDone;
0102 
0103   //The number of characters whose frequency order is less than 512
0104   PRUint32 mFreqChars;
0105 
0106   //Total character encounted.
0107   PRUint32 mTotalChars;
0108 
0109   //Mapping table to get frequency order from char order (get from GetOrder())
0110   const PRInt16  *mCharToFreqOrder;
0111 
0112   //Size of above table
0113   PRUint32 mTableSize;
0114 
0115   //This is a constant value varies from language to language, it is used in 
0116   //calculating confidence. See my paper for further detail.
0117   float    mTypicalDistributionRatio;
0118 };
0119 
0120 
0121 class EUCTWDistributionAnalysis: public CharDistributionAnalysis
0122 {
0123 public:
0124   EUCTWDistributionAnalysis();
0125 protected:
0126 
0127   //for euc-TW encoding, we are interested 
0128   //  first  byte range: 0xc4 -- 0xfe
0129   //  second byte range: 0xa1 -- 0xfe
0130   //no validation needed here. State machine has done that
0131   PRInt32 GetOrder(const char* str) 
0132   { if ((unsigned char)*str >= (unsigned char)0xc4)  
0133       return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
0134     else
0135       return -1;
0136   };
0137 };
0138 
0139 
0140 class EUCKRDistributionAnalysis : public CharDistributionAnalysis
0141 {
0142 public:
0143   EUCKRDistributionAnalysis();
0144 protected:
0145   //for euc-KR encoding, we are interested 
0146   //  first  byte range: 0xb0 -- 0xfe
0147   //  second byte range: 0xa1 -- 0xfe
0148   //no validation needed here. State machine has done that
0149   PRInt32 GetOrder(const char* str) 
0150   { if ((unsigned char)*str >= (unsigned char)0xb0)  
0151       return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
0152     else
0153       return -1;
0154   };
0155 };
0156 
0157 class GB2312DistributionAnalysis : public CharDistributionAnalysis
0158 {
0159 public:
0160   GB2312DistributionAnalysis();
0161 protected:
0162   //for GB2312 encoding, we are interested 
0163   //  first  byte range: 0xb0 -- 0xfe
0164   //  second byte range: 0xa1 -- 0xfe
0165   //no validation needed here. State machine has done that
0166   PRInt32 GetOrder(const char* str) 
0167   { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)  
0168       return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
0169     else
0170       return -1;
0171   };
0172 };
0173 
0174 
0175 class Big5DistributionAnalysis : public CharDistributionAnalysis
0176 {
0177 public:
0178   Big5DistributionAnalysis();
0179 protected:
0180   //for big5 encoding, we are interested 
0181   //  first  byte range: 0xa4 -- 0xfe
0182   //  second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
0183   //no validation needed here. State machine has done that
0184   PRInt32 GetOrder(const char* str) 
0185   { if ((unsigned char)*str >= (unsigned char)0xa4)  
0186       if ((unsigned char)str[1] >= (unsigned char)0xa1)
0187         return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;
0188       else
0189         return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
0190     else
0191       return -1;
0192   };
0193 };
0194 
0195 class SJISDistributionAnalysis : public CharDistributionAnalysis
0196 {
0197 public:
0198   SJISDistributionAnalysis();
0199 protected:
0200   //for sjis encoding, we are interested 
0201   //  first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
0202   //  second byte range: 0x40 -- 0x7e,  0x81 -- oxfe
0203   //no validation needed here. State machine has done that
0204   PRInt32 GetOrder(const char* str) 
0205   { 
0206     PRInt32 order;
0207     if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)  
0208       order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
0209     else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)  
0210       order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31);
0211     else
0212       return -1;
0213     order += (unsigned char)*(str+1) - 0x40;
0214     if ((unsigned char)str[1] > (unsigned char)0x7f)
0215       order--;
0216     return order;
0217   };
0218 };
0219 
0220 class EUCJPDistributionAnalysis : public CharDistributionAnalysis
0221 {
0222 public:
0223   EUCJPDistributionAnalysis();
0224 protected:
0225   //for euc-JP encoding, we are interested 
0226   //  first  byte range: 0xa0 -- 0xfe
0227   //  second byte range: 0xa1 -- 0xfe
0228   //no validation needed here. State machine has done that
0229   PRInt32 GetOrder(const char* str) 
0230   { if ((unsigned char)*str >= (unsigned char)0xa0)  
0231       return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
0232     else
0233       return -1;
0234   };
0235 };
0236 
0237 #endif //CharDistribution_h__
0238