File indexing completed on 2024-04-28 03:53:04
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com> 0003 0004 SPDX-License-Identifier: MIT 0005 */ 0006 0007 #include "UnicodeGroupProber.h" 0008 0009 #include <QChar> 0010 #include <math.h> 0011 0012 namespace kencodingprober 0013 { 0014 UnicodeGroupProber::UnicodeGroupProber(void) 0015 { 0016 mCodingSM[0] = new nsCodingStateMachine(&UTF8SMModel); 0017 mCodingSM[1] = new nsCodingStateMachine(&UCS2LESMModel); 0018 mCodingSM[2] = new nsCodingStateMachine(&UCS2BESMModel); 0019 mActiveSM = NUM_OF_UNICODE_CHARSETS; 0020 mState = eDetecting; 0021 mDetectedCharset = "UTF-8"; 0022 } 0023 0024 UnicodeGroupProber::~UnicodeGroupProber(void) 0025 { 0026 for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) { 0027 delete mCodingSM[i]; 0028 } 0029 } 0030 0031 void UnicodeGroupProber::Reset(void) 0032 { 0033 mState = eDetecting; 0034 for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) { 0035 mCodingSM[i]->Reset(); 0036 } 0037 mActiveSM = NUM_OF_UNICODE_CHARSETS; 0038 mDetectedCharset = "UTF-8"; 0039 } 0040 0041 nsProbingState UnicodeGroupProber::HandleData(const char *aBuf, unsigned int aLen) 0042 { 0043 nsSMState codingState; 0044 static bool disableUTF16LE = false; 0045 static bool disableUTF16BE = false; 0046 0047 if (mActiveSM == 0 || aLen < 2) { 0048 mState = eNotMe; 0049 return mState; 0050 } 0051 0052 if (!(disableUTF16LE || disableUTF16BE)) { 0053 if (aLen % 2 != 0) { 0054 disableUTF16LE = true; 0055 disableUTF16BE = true; 0056 } 0057 const uint weight_BOM = sqrt((double)aLen) + aLen / 10.0; 0058 uint counts[5] = {0, 0, 0, 0, 0}; 0059 for (uint i = 0; i < 5; i++) { 0060 counts[i] = std::count(aBuf, aBuf + aLen, char(i)); 0061 } 0062 const double weight_zero = (2.0 * (counts[0] + counts[1] + counts[2] + counts[3] + counts[4]) + weight_BOM) / aLen; 0063 if (weight_zero < log(1.4142)) { 0064 disableUTF16LE = true; 0065 disableUTF16BE = true; 0066 } 0067 if (4 >= aBuf[1] && aBuf[1] >= 0 && QChar::isPrint(static_cast<uint>(aBuf[0]))) { 0068 disableUTF16BE = true; 0069 } else { 0070 disableUTF16LE = true; 0071 } 0072 if (disableUTF16BE) { 0073 mActiveSM--; 0074 } 0075 if (disableUTF16LE) { 0076 nsCodingStateMachine *t; 0077 t = mCodingSM[1]; 0078 mCodingSM[1] = mCodingSM[2]; 0079 mCodingSM[2] = t; 0080 mActiveSM--; 0081 } 0082 } 0083 0084 for (uint i = 0; i < aLen; ++i) { 0085 for (int j = mActiveSM - 1; j >= 0; --j) { 0086 // byte is feed to all active state machine 0087 codingState = mCodingSM[j]->NextState(aBuf[i]); 0088 if (codingState == eError) { 0089 // got negative answer for this state machine, make it inactive 0090 mActiveSM--; 0091 if (mActiveSM == 0) { 0092 mState = eNotMe; 0093 return mState; 0094 } else if (j != (int)mActiveSM) { 0095 nsCodingStateMachine *t; 0096 t = mCodingSM[mActiveSM]; 0097 mCodingSM[mActiveSM] = mCodingSM[j]; 0098 mCodingSM[j] = t; 0099 } 0100 } else if (codingState == eItsMe) { 0101 mState = eFoundIt; 0102 mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); 0103 return mState; 0104 } else if (mState == eDetecting) { 0105 mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); 0106 }; 0107 } 0108 } 0109 return mState; 0110 } 0111 0112 float UnicodeGroupProber::GetConfidence() 0113 { 0114 if (mState == eFoundIt) { 0115 return 0.99f; 0116 } else { 0117 return 0.0f; 0118 } 0119 } 0120 0121 #ifdef DEBUG_PROBE 0122 void UnicodeGroupProber::DumpStatus() 0123 { 0124 GetConfidence(); 0125 for (uint i = 0; i < mActiveSM; i++) { 0126 qDebug() << "Unicode group" << mCodingSM[i]->DumpCurrentState() << mCodingSM[i]->GetCodingStateMachine(); 0127 } 0128 } 0129 #endif 0130 0131 }