File indexing completed on 2024-04-28 03:53:04

0001 /*  -*- C++ -*-
0002     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
0003 
0004     SPDX-License-Identifier: MIT
0005 */
0006 
0007 #ifndef NSSBCHARSETPROBER_H
0008 #define NSSBCHARSETPROBER_H
0009 
0010 #include "nsCharSetProber.h"
0011 
// Tuning constants for the single-byte charset prober.

// Side length of a model's precedence matrix ([SAMPLE_SIZE][SAMPLE_SIZE]).
#define SAMPLE_SIZE 64
// NOTE(review): presumably the number of observed sequences after which the
// prober may take an early decision -- confirm against HandleData().
#define SB_ENOUGH_REL_THRESHOLD 1024
// NOTE(review): presumably confidence bounds used for that early decision
// (above => accept, below => reject) -- confirm against HandleData().
#define POSITIVE_SHORTCUT_THRESHOLD 0.95f
#define NEGATIVE_SHORTCUT_THRESHOLD 0.05f
// NOTE(review): presumably char orders at or above this value denote symbols
// rather than letters -- confirm against the charToOrderMap tables.
#define SYMBOL_CAT_ORDER 250
// Number of sequence categories; also the length of mSeqCounters.
#define NUMBER_OF_SEQ_CAT 4
// Highest category index.
#define POSITIVE_CAT (NUMBER_OF_SEQ_CAT - 1)
// Lowest category index.
#define NEGATIVE_CAT 0
0020 
0021 namespace kencodingprober
0022 {
/**
 * Description of one single-byte language/charset model.
 *
 * A plain aggregate: instances are expected to be brace-initialised in
 * member order.
 */
struct SequenceModel {
    /// [256] table used to find a char's order.
    const unsigned char *charToOrderMap;
    /// [SAMPLE_SIZE][SAMPLE_SIZE] table used to find a 2-char sequence's frequency.
    const char *precedenceMatrix;
    /// = freqSeqs / totalSeqs
    float mTypicalPositiveRatio;
    /// Says whether this script contains English characters (not implemented).
    bool keepEnglishLetter;
    /// Name of the charset this model describes.
    const char *charsetName;
};
0030 
0031 class KCODECS_NO_EXPORT nsSingleByteCharSetProber : public nsCharSetProber
0032 {
0033 public:
0034     explicit nsSingleByteCharSetProber(const SequenceModel *model)
0035         : mModel(model)
0036         , mReversed(false)
0037         , mNameProber(nullptr)
0038     {
0039         Reset();
0040     }
0041     nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber *nameProber)
0042         : mModel(model)
0043         , mReversed(reversed)
0044         , mNameProber(nameProber)
0045     {
0046         Reset();
0047     }
0048 
0049     const char *GetCharSetName() override;
0050     nsProbingState HandleData(const char *aBuf, unsigned int aLen) override;
0051     nsProbingState GetState(void) override
0052     {
0053         return mState;
0054     }
0055     void Reset(void) override;
0056     float GetConfidence(void) override;
0057     void SetOpion() override
0058     {
0059     }
0060 
0061     // This feature is not implemented yet. any current language model
0062     // contain this parameter as false. No one is looking at this
0063     // parameter or calling this method.
0064     // Moreover, the nsSBCSGroupProber which calls the HandleData of this
0065     // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid
0066     // of the English letters.
0067     bool KeepEnglishLetters()
0068     {
0069         return mModel->keepEnglishLetter;
0070     } // (not implemented)
0071 
0072 #ifdef DEBUG_PROBE
0073     void DumpStatus() override;
0074 #endif
0075 
0076 protected:
0077     nsProbingState mState;
0078     const SequenceModel *mModel;
0079     const bool mReversed; // true if we need to reverse every pair in the model lookup
0080 
0081     // char order of last character
0082     unsigned char mLastOrder;
0083 
0084     unsigned int mTotalSeqs;
0085     unsigned int mSeqCounters[NUMBER_OF_SEQ_CAT];
0086 
0087     unsigned int mTotalChar;
0088     // characters that fall in our sampling range
0089     unsigned int mFreqChar;
0090 
0091     // Optional auxiliary prober for name decision. created and destroyed by the GroupProber
0092     nsCharSetProber *mNameProber;
0093 };
0094 
0095 extern const SequenceModel Koi8rModel;
0096 extern const SequenceModel Win1251Model;
0097 extern const SequenceModel Latin5Model;
0098 extern const SequenceModel MacCyrillicModel;
0099 extern const SequenceModel Ibm866Model;
0100 extern const SequenceModel Ibm855Model;
0101 extern const SequenceModel Latin7Model;
0102 extern const SequenceModel Win1253Model;
0103 extern const SequenceModel Latin5BulgarianModel;
0104 extern const SequenceModel Win1251BulgarianModel;
0105 extern const SequenceModel Latin2HungarianModel;
0106 extern const SequenceModel Win1250HungarianModel;
0107 extern const SequenceModel Win1255Model;
0108 }
0109 #endif /* NSSBCHARSETPROBER_H */