File indexing completed on 2024-04-28 03:53:04

0001 /*  -*- C++ -*-
0002     SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com>
0003 
0004     SPDX-License-Identifier: MIT
0005 */
0006 
#include "UnicodeGroupProber.h"

#include <QChar>

#include <algorithm>
#include <math.h>
#include <utility>
0011 
0012 namespace kencodingprober
0013 {
0014 UnicodeGroupProber::UnicodeGroupProber(void)
0015 {
0016     mCodingSM[0] = new nsCodingStateMachine(&UTF8SMModel);
0017     mCodingSM[1] = new nsCodingStateMachine(&UCS2LESMModel);
0018     mCodingSM[2] = new nsCodingStateMachine(&UCS2BESMModel);
0019     mActiveSM = NUM_OF_UNICODE_CHARSETS;
0020     mState = eDetecting;
0021     mDetectedCharset = "UTF-8";
0022 }
0023 
0024 UnicodeGroupProber::~UnicodeGroupProber(void)
0025 {
0026     for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) {
0027         delete mCodingSM[i];
0028     }
0029 }
0030 
0031 void UnicodeGroupProber::Reset(void)
0032 {
0033     mState = eDetecting;
0034     for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) {
0035         mCodingSM[i]->Reset();
0036     }
0037     mActiveSM = NUM_OF_UNICODE_CHARSETS;
0038     mDetectedCharset = "UTF-8";
0039 }
0040 
0041 nsProbingState UnicodeGroupProber::HandleData(const char *aBuf, unsigned int aLen)
0042 {
0043     nsSMState codingState;
0044     static bool disableUTF16LE = false;
0045     static bool disableUTF16BE = false;
0046 
0047     if (mActiveSM == 0 || aLen < 2) {
0048         mState = eNotMe;
0049         return mState;
0050     }
0051 
0052     if (!(disableUTF16LE || disableUTF16BE)) {
0053         if (aLen % 2 != 0) {
0054             disableUTF16LE = true;
0055             disableUTF16BE = true;
0056         }
0057         const uint weight_BOM = sqrt((double)aLen) + aLen / 10.0;
0058         uint counts[5] = {0, 0, 0, 0, 0};
0059         for (uint i = 0; i < 5; i++) {
0060             counts[i] = std::count(aBuf, aBuf + aLen, char(i));
0061         }
0062         const double weight_zero = (2.0 * (counts[0] + counts[1] + counts[2] + counts[3] + counts[4]) + weight_BOM) / aLen;
0063         if (weight_zero < log(1.4142)) {
0064             disableUTF16LE = true;
0065             disableUTF16BE = true;
0066         }
0067         if (4 >= aBuf[1] && aBuf[1] >= 0 && QChar::isPrint(static_cast<uint>(aBuf[0]))) {
0068             disableUTF16BE = true;
0069         } else {
0070             disableUTF16LE = true;
0071         }
0072         if (disableUTF16BE) {
0073             mActiveSM--;
0074         }
0075         if (disableUTF16LE) {
0076             nsCodingStateMachine *t;
0077             t = mCodingSM[1];
0078             mCodingSM[1] = mCodingSM[2];
0079             mCodingSM[2] = t;
0080             mActiveSM--;
0081         }
0082     }
0083 
0084     for (uint i = 0; i < aLen; ++i) {
0085         for (int j = mActiveSM - 1; j >= 0; --j) {
0086             // byte is feed to all active state machine
0087             codingState = mCodingSM[j]->NextState(aBuf[i]);
0088             if (codingState == eError) {
0089                 // got negative answer for this state machine, make it inactive
0090                 mActiveSM--;
0091                 if (mActiveSM == 0) {
0092                     mState = eNotMe;
0093                     return mState;
0094                 } else if (j != (int)mActiveSM) {
0095                     nsCodingStateMachine *t;
0096                     t = mCodingSM[mActiveSM];
0097                     mCodingSM[mActiveSM] = mCodingSM[j];
0098                     mCodingSM[j] = t;
0099                 }
0100             } else if (codingState == eItsMe) {
0101                 mState = eFoundIt;
0102                 mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
0103                 return mState;
0104             } else if (mState == eDetecting) {
0105                 mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
0106             };
0107         }
0108     }
0109     return mState;
0110 }
0111 
0112 float UnicodeGroupProber::GetConfidence()
0113 {
0114     if (mState == eFoundIt) {
0115         return 0.99f;
0116     } else {
0117         return 0.0f;
0118     }
0119 }
0120 
0121 #ifdef DEBUG_PROBE
0122 void UnicodeGroupProber::DumpStatus()
0123 {
0124     GetConfidence();
0125     for (uint i = 0; i < mActiveSM; i++) {
0126         qDebug() << "Unicode group" << mCodingSM[i]->DumpCurrentState() << mCodingSM[i]->GetCodingStateMachine();
0127     }
0128 }
0129 #endif
0130 
0131 }