File indexing completed on 2024-04-28 03:53:03

0001 /*  -*- C++ -*-
0002     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
0003 
0004     SPDX-License-Identifier: MIT
0005 */
0006 
0007 #ifndef __JPCNTX_H__
0008 #define __JPCNTX_H__
0009 
0010 #include "kcodecs_export.h"
0011 
0012 #include <qglobal.h>
0013 
0014 #define NUM_OF_CATEGORY 6
0015 
0016 #define ENOUGH_REL_THRESHOLD 100
0017 #define MAX_REL_THRESHOLD 1000
0018 namespace kencodingprober
0019 {
0020 // hiragana frequency category table
0021 extern const char jp2CharContext[83][83];
0022 
0023 class KCODECS_NO_EXPORT JapaneseContextAnalysis
0024 {
0025 public:
0026     JapaneseContextAnalysis()
0027     {
0028         Reset();
0029     }
0030     virtual ~JapaneseContextAnalysis()
0031     {
0032     }
0033 
0034     void HandleData(const char *aBuf, unsigned int aLen);
0035 
0036     void HandleOneChar(const char *aStr, unsigned int aCharLen)
0037     {
0038         int order;
0039 
0040         // if we received enough data, stop here
0041         if (mTotalRel > MAX_REL_THRESHOLD) {
0042             mDone = true;
0043         }
0044         if (mDone) {
0045             return;
0046         }
0047 
0048         // Only 2-bytes characters are of our interest
0049         order = (aCharLen == 2) ? GetOrder(aStr) : -1;
0050         if (order != -1 && mLastCharOrder != -1) {
0051             mTotalRel++;
0052             // count this sequence to its category counter
0053             mRelSample[(int)jp2CharContext[mLastCharOrder][order]]++;
0054         }
0055         mLastCharOrder = order;
0056     }
0057 
0058     float GetConfidence();
0059     void Reset(void);
0060     void SetOpion()
0061     {
0062     }
0063     bool GotEnoughData()
0064     {
0065         return mTotalRel > ENOUGH_REL_THRESHOLD;
0066     }
0067 
0068 protected:
0069     virtual int GetOrder(const char *str, unsigned int *charLen) = 0;
0070     virtual int GetOrder(const char *str) = 0;
0071 
0072     // category counters, each integer counts sequence in its category
0073     unsigned int mRelSample[NUM_OF_CATEGORY];
0074 
0075     // total sequence received
0076     unsigned int mTotalRel;
0077 
0078     // The order of previous char
0079     int mLastCharOrder;
0080 
0081     // if last byte in current buffer is not the last byte of a character, we
0082     // need to know how many byte to skip in next buffer.
0083     unsigned int mNeedToSkipCharNum;
0084 
0085     // If this flag is set to true, detection is done and conclusion has been made
0086     bool mDone;
0087 };
0088 
0089 class KCODECS_NO_EXPORT SJISContextAnalysis : public JapaneseContextAnalysis
0090 {
0091     // SJISContextAnalysis(){};
0092 protected:
0093     int GetOrder(const char *str, unsigned int *charLen) override;
0094 
0095     int GetOrder(const char *str) override
0096     {
0097         // We only interested in Hiragana, so first byte is '\202'
0098         if (*str == '\202' && (unsigned char)*(str + 1) >= (unsigned char)0x9f && (unsigned char)*(str + 1) <= (unsigned char)0xf1) {
0099             return (unsigned char)*(str + 1) - (unsigned char)0x9f;
0100         }
0101         return -1;
0102     }
0103 };
0104 
0105 class KCODECS_NO_EXPORT EUCJPContextAnalysis : public JapaneseContextAnalysis
0106 {
0107 protected:
0108     int GetOrder(const char *str, unsigned int *charLen) override;
0109     int GetOrder(const char *str) override
0110     // We only interested in Hiragana, so first byte is '\244'
0111     {
0112         if (*str == '\244' //
0113             && (unsigned char)*(str + 1) >= (unsigned char)0xa1 //
0114             && (unsigned char)*(str + 1) <= (unsigned char)0xf3) {
0115             return (unsigned char)*(str + 1) - (unsigned char)0xa1;
0116         }
0117         return -1;
0118     }
0119 };
0120 }
0121 #endif /* __JPCNTX_H__ */