File indexing completed on 2024-12-22 04:33:42
0001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 0002 /* ***** BEGIN LICENSE BLOCK ***** 0003 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 0004 * 0005 * The contents of this file are subject to the Mozilla Public License Version 0006 * 1.1 (the "License"); you may not use this file except in compliance with 0007 * the License. You may obtain a copy of the License at 0008 * http://www.mozilla.org/MPL/ 0009 * 0010 * Software distributed under the License is distributed on an "AS IS" basis, 0011 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 0012 * for the specific language governing rights and limitations under the 0013 * License. 0014 * 0015 * The Original Code is Mozilla Universal charset detector code. 0016 * 0017 * The Initial Developer of the Original Code is 0018 * Netscape Communications Corporation. 0019 * Portions created by the Initial Developer are Copyright (C) 2001 0020 * the Initial Developer. All Rights Reserved. 0021 * 0022 * Contributor(s): 0023 * Shy Shalom <shooshX@gmail.com> 0024 * 0025 * Alternatively, the contents of this file may be used under the terms of 0026 * either the GNU General Public License Version 2 or later (the "GPL"), or 0027 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 0028 * in which case the provisions of the GPL or the LGPL are applicable instead 0029 * of those above. If you wish to allow use of your version of this file only 0030 * under the terms of either the GPL or the LGPL, and not to allow others to 0031 * use your version of this file under the terms of the MPL, indicate your 0032 * decision by deleting the provisions above and replace them with the notice 0033 * and other provisions required by the GPL or the LGPL. If you do not delete 0034 * the provisions above, a recipient may use your version of this file under 0035 * the terms of any one of the MPL, the GPL or the LGPL. 0036 * 0037 * ***** END LICENSE BLOCK ***** */ 0038 0039 #pragma GCC visibility push(hidden) 0040 0041 #include <stdio.h> 0042 #include "prmem.h" 0043 0044 #include "nsMBCSGroupProber.h" 0045 0046 #ifdef DEBUG_chardet 0047 char *ProberName[] = 0048 { 0049 "UTF8", 0050 "SJIS", 0051 "EUCJP", 0052 "GB18030", 0053 "EUCKR", 0054 "Big5", 0055 "EUCTW", 0056 }; 0057 0058 #endif 0059 0060 nsMBCSGroupProber::nsMBCSGroupProber() 0061 { 0062 mProbers[0] = new nsUTF8Prober(); 0063 mProbers[1] = new nsSJISProber(); 0064 mProbers[2] = new nsEUCJPProber(); 0065 mProbers[3] = new nsGB18030Prober(); 0066 mProbers[4] = new nsEUCKRProber(); 0067 mProbers[5] = new nsBig5Prober(); 0068 mProbers[6] = new nsEUCTWProber(); 0069 Reset(); 0070 } 0071 0072 nsMBCSGroupProber::~nsMBCSGroupProber() 0073 { 0074 for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) 0075 { 0076 delete mProbers[i]; 0077 } 0078 } 0079 0080 const char* nsMBCSGroupProber::GetCharSetName() 0081 { 0082 if (mBestGuess == -1) 0083 { 0084 GetConfidence(); 0085 if (mBestGuess == -1) 0086 mBestGuess = 0; 0087 } 0088 return mProbers[mBestGuess]->GetCharSetName(); 0089 } 0090 0091 void nsMBCSGroupProber::Reset(void) 0092 { 0093 mActiveNum = 0; 0094 for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) 0095 { 0096 if (mProbers[i]) 0097 { 0098 mProbers[i]->Reset(); 0099 mIsActive[i] = PR_TRUE; 0100 ++mActiveNum; 0101 } 0102 else 0103 mIsActive[i] = PR_FALSE; 0104 } 0105 mBestGuess = -1; 0106 mState = eDetecting; 0107 } 0108 0109 nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) 0110 { 0111 nsProbingState st; 0112 PRUint32 i; 0113 0114 //do filtering to reduce load to probers 0115 char *highbyteBuf; 0116 char *hptr; 0117 PRBool keepNext = PR_TRUE; //assume previous is not ascii, it will do no harm except add some noise 0118 hptr = highbyteBuf = (char*)PR_Malloc(aLen); 0119 if (!hptr) 0120 return mState; 0121 for (i = 0; i < aLen; i++) 0122 { 0123 if (aBuf[i] & 0x80) 0124 { 0125 *hptr++ = aBuf[i]; 0126 keepNext = PR_TRUE; 0127 } 0128 else 0129 { 0130 //if previous is highbyte, keep this even it is a ASCII 0131 if (keepNext) 0132 { 0133 *hptr++ = aBuf[i]; 0134 keepNext = PR_FALSE; 0135 } 0136 } 0137 } 0138 0139 for (i = 0; i < NUM_OF_PROBERS; i++) 0140 { 0141 if (!mIsActive[i]) 0142 continue; 0143 st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf); 0144 if (st == eFoundIt) 0145 { 0146 mBestGuess = i; 0147 mState = eFoundIt; 0148 break; 0149 } 0150 else if (st == eNotMe) 0151 { 0152 mIsActive[i] = PR_FALSE; 0153 mActiveNum--; 0154 if (mActiveNum <= 0) 0155 { 0156 mState = eNotMe; 0157 break; 0158 } 0159 } 0160 } 0161 0162 PR_FREEIF(highbyteBuf); 0163 0164 return mState; 0165 } 0166 0167 float nsMBCSGroupProber::GetConfidence(void) 0168 { 0169 PRUint32 i; 0170 float bestConf = 0.0, cf; 0171 0172 switch (mState) 0173 { 0174 case eFoundIt: 0175 return (float)0.99; 0176 case eNotMe: 0177 return (float)0.01; 0178 default: 0179 for (i = 0; i < NUM_OF_PROBERS; i++) 0180 { 0181 if (!mIsActive[i]) 0182 continue; 0183 cf = mProbers[i]->GetConfidence(); 0184 if (bestConf < cf) 0185 { 0186 bestConf = cf; 0187 mBestGuess = i; 0188 } 0189 } 0190 } 0191 return bestConf; 0192 } 0193 0194 #ifdef DEBUG_chardet 0195 void nsMBCSGroupProber::DumpStatus() 0196 { 0197 PRUint32 i; 0198 float cf; 0199 0200 GetConfidence(); 0201 for (i = 0; i < NUM_OF_PROBERS; i++) 0202 { 0203 if (!mIsActive[i]) 0204 printf(" MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]); 0205 else 0206 { 0207 cf = mProbers[i]->GetConfidence(); 0208 printf(" MBCS %1.3f: [%s]\r\n", cf, ProberName[i]); 0209 } 0210 } 0211 } 0212 #endif 0213 0214 #pragma GCC visibility pop 0215