File indexing completed on 2024-04-28 03:53:02

0001 /*  -*- C++ -*-
0002     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
0003 
0004     SPDX-License-Identifier: MIT
0005 */
0006 
0007 #ifndef CharDistribution_h__
0008 #define CharDistribution_h__
0009 
0010 #include "kcodecs_export.h"
0011 
0012 #include <qcompilerdetection.h>
0013 
0014 #define ENOUGH_DATA_THRESHOLD 256
0015 
0016 namespace kencodingprober
0017 {
0018 class KCODECS_NO_EXPORT CharDistributionAnalysis
0019 {
0020 public:
0021     CharDistributionAnalysis()
0022     {
0023         Reset();
0024     }
0025     virtual ~CharDistributionAnalysis()
0026     {
0027     }
0028 
0029     // feed a block of data and do distribution analysis
0030     void HandleData(const char * /* aBuf */, unsigned int /* aLen */)
0031     {
0032     }
0033 
0034     // Feed a character with known length
0035     void HandleOneChar(const char *aStr, unsigned int aCharLen)
0036     {
0037         int order;
0038 
0039         // we only care about 2-bytes character in our distribution analysis
0040         order = (aCharLen == 2) ? GetOrder(aStr) : -1;
0041 
0042         if (order >= 0) {
0043             mTotalChars++;
0044             // order is valid
0045             if ((unsigned int)order < mTableSize) {
0046                 if (512 > mCharToFreqOrder[order]) {
0047                     mFreqChars++;
0048                 }
0049             }
0050         }
0051     }
0052 
0053     // return confidence base on existing data
0054     float GetConfidence();
0055 
0056     // Reset analyser, clear any state
0057     void Reset(void)
0058     {
0059         mDone = false;
0060         mTotalChars = 0;
0061         mFreqChars = 0;
0062     }
0063 
0064     // This function is for future extension. Caller can use this function to control
0065     // analyser's behavior
0066     void SetOpion()
0067     {
0068     }
0069 
0070     // It is not necessary to receive all data to draw conclusion. For charset detection,
0071     // certain amount of data is enough
0072     bool GotEnoughData()
0073     {
0074         return mTotalChars > ENOUGH_DATA_THRESHOLD;
0075     }
0076 
0077 protected:
0078     // we do not handle character base on its original encoding string, but
0079     // convert this encoding string to a number, here called order.
0080     // This allows multiple encodings of a language to share one frequency table
0081     virtual int GetOrder(const char * /* str */)
0082     {
0083         return -1;
0084     }
0085 
0086     // If this flag is set to true, detection is done and conclusion has been made
0087     bool mDone;
0088 
0089     // The number of characters whose frequency order is less than 512
0090     unsigned int mFreqChars;
0091 
0092     // Total character encountered.
0093     unsigned int mTotalChars;
0094 
0095     // Mapping table to get frequency order from char order (get from GetOrder())
0096     const short *mCharToFreqOrder;
0097 
0098     // Size of above table
0099     unsigned int mTableSize;
0100 
0101     // This is a constant value varies from language to language, it is used in
0102     // calculating confidence. See my paper for further detail.
0103     float mTypicalDistributionRatio;
0104 };
0105 
0106 class KCODECS_NO_EXPORT EUCKRDistributionAnalysis : public CharDistributionAnalysis
0107 {
0108 public:
0109     EUCKRDistributionAnalysis();
0110 
0111 protected:
0112     // for euc-KR encoding, we are interested
0113     //  first  byte range: 0xb0 -- 0xfe
0114     //  second byte range: 0xa1 -- 0xfe
0115     // no validation needed here. State machine has done that
0116     int GetOrder(const char *str) override
0117     {
0118         if ((unsigned char)*str >= (unsigned char)0xb0) {
0119             return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
0120         } else {
0121             return -1;
0122         }
0123     }
0124 };
0125 
0126 class KCODECS_NO_EXPORT GB2312DistributionAnalysis : public CharDistributionAnalysis
0127 {
0128 public:
0129     GB2312DistributionAnalysis();
0130 
0131 protected:
0132     // for GB2312 encoding, we are interested
0133     //  first  byte range: 0xb0 -- 0xfe
0134     //  second byte range: 0xa1 -- 0xfe
0135     // no validation needed here. State machine has done that
0136     int GetOrder(const char *str) override
0137     {
0138         if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) {
0139             return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
0140         } else {
0141             return -1;
0142         }
0143     }
0144 };
0145 
0146 class KCODECS_NO_EXPORT Big5DistributionAnalysis : public CharDistributionAnalysis
0147 {
0148 public:
0149     Big5DistributionAnalysis();
0150 
0151 protected:
0152     // for big5 encoding, we are interested
0153     //  first  byte range: 0xa4 -- 0xfe
0154     //  second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
0155     // no validation needed here. State machine has done that
0156     int GetOrder(const char *str) override
0157     {
0158         if ((unsigned char)*str >= (unsigned char)0xa4)
0159             if ((unsigned char)str[1] >= (unsigned char)0xa1) {
0160                 return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 + 63;
0161             } else {
0162                 return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
0163             }
0164         else {
0165             return -1;
0166         }
0167     }
0168 };
0169 
0170 class KCODECS_NO_EXPORT SJISDistributionAnalysis : public CharDistributionAnalysis
0171 {
0172 public:
0173     SJISDistributionAnalysis();
0174 
0175 protected:
0176     // for sjis encoding, we are interested
0177     //  first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
0178     //  second byte range: 0x40 -- 0x7e,  0x81 -- oxfe
0179     // no validation needed here. State machine has done that
0180     int GetOrder(const char *str) override
0181     {
0182         int order;
0183         if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) {
0184             order = 188 * ((unsigned char)str[0] - (unsigned char)0x81);
0185         } else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) {
0186             order = 188 * ((unsigned char)str[0] - (unsigned char)0xe0 + 31);
0187         } else {
0188             return -1;
0189         }
0190         order += (unsigned char)*(str + 1) - 0x40;
0191         if ((unsigned char)str[1] > (unsigned char)0x7f) {
0192             order--;
0193         }
0194         return order;
0195     }
0196 };
0197 
0198 class KCODECS_NO_EXPORT EUCJPDistributionAnalysis : public CharDistributionAnalysis
0199 {
0200 public:
0201     EUCJPDistributionAnalysis();
0202 
0203 protected:
0204     // for euc-JP encoding, we are interested
0205     //  first  byte range: 0xa0 -- 0xfe
0206     //  second byte range: 0xa1 -- 0xfe
0207     // no validation needed here. State machine has done that
0208     int GetOrder(const char *str) override
0209     {
0210         if ((unsigned char)*str >= (unsigned char)0xa0) {
0211             return 94 * ((unsigned char)str[0] - (unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
0212         } else {
0213             return -1;
0214         }
0215     }
0216 };
0217 }
0218 #endif // CharDistribution_h__