// File indexing completed on 2024-04-28 03:53:04
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> 0003 0004 SPDX-License-Identifier: MIT 0005 */ 0006 0007 #ifndef NSSBCHARSETPROBER_H 0008 #define NSSBCHARSETPROBER_H 0009 0010 #include "nsCharSetProber.h" 0011 0012 #define SAMPLE_SIZE 64 0013 #define SB_ENOUGH_REL_THRESHOLD 1024 0014 #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 0015 #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 0016 #define SYMBOL_CAT_ORDER 250 0017 #define NUMBER_OF_SEQ_CAT 4 0018 #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT - 1) 0019 #define NEGATIVE_CAT 0 0020 0021 namespace kencodingprober 0022 { 0023 typedef struct { 0024 const unsigned char *charToOrderMap; // [256] table use to find a char's order 0025 const char *precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency 0026 float mTypicalPositiveRatio; // = freqSeqs / totalSeqs 0027 bool keepEnglishLetter; // says if this script contains English characters (not implemented) 0028 const char *charsetName; 0029 } SequenceModel; 0030 0031 class KCODECS_NO_EXPORT nsSingleByteCharSetProber : public nsCharSetProber 0032 { 0033 public: 0034 explicit nsSingleByteCharSetProber(const SequenceModel *model) 0035 : mModel(model) 0036 , mReversed(false) 0037 , mNameProber(nullptr) 0038 { 0039 Reset(); 0040 } 0041 nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber *nameProber) 0042 : mModel(model) 0043 , mReversed(reversed) 0044 , mNameProber(nameProber) 0045 { 0046 Reset(); 0047 } 0048 0049 const char *GetCharSetName() override; 0050 nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; 0051 nsProbingState GetState(void) override 0052 { 0053 return mState; 0054 } 0055 void Reset(void) override; 0056 float GetConfidence(void) override; 0057 void SetOpion() override 0058 { 0059 } 0060 0061 // This feature is not implemented yet. any current language model 0062 // contain this parameter as false. 
No one is looking at this 0063 // parameter or calling this method. 0064 // Moreover, the nsSBCSGroupProber which calls the HandleData of this 0065 // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid 0066 // of the English letters. 0067 bool KeepEnglishLetters() 0068 { 0069 return mModel->keepEnglishLetter; 0070 } // (not implemented) 0071 0072 #ifdef DEBUG_PROBE 0073 void DumpStatus() override; 0074 #endif 0075 0076 protected: 0077 nsProbingState mState; 0078 const SequenceModel *mModel; 0079 const bool mReversed; // true if we need to reverse every pair in the model lookup 0080 0081 // char order of last character 0082 unsigned char mLastOrder; 0083 0084 unsigned int mTotalSeqs; 0085 unsigned int mSeqCounters[NUMBER_OF_SEQ_CAT]; 0086 0087 unsigned int mTotalChar; 0088 // characters that fall in our sampling range 0089 unsigned int mFreqChar; 0090 0091 // Optional auxiliary prober for name decision. created and destroyed by the GroupProber 0092 nsCharSetProber *mNameProber; 0093 }; 0094 0095 extern const SequenceModel Koi8rModel; 0096 extern const SequenceModel Win1251Model; 0097 extern const SequenceModel Latin5Model; 0098 extern const SequenceModel MacCyrillicModel; 0099 extern const SequenceModel Ibm866Model; 0100 extern const SequenceModel Ibm855Model; 0101 extern const SequenceModel Latin7Model; 0102 extern const SequenceModel Win1253Model; 0103 extern const SequenceModel Latin5BulgarianModel; 0104 extern const SequenceModel Win1251BulgarianModel; 0105 extern const SequenceModel Latin2HungarianModel; 0106 extern const SequenceModel Win1250HungarianModel; 0107 extern const SequenceModel Win1255Model; 0108 } 0109 #endif /* NSSBCHARSETPROBER_H */