File indexing completed on 2024-12-22 04:33:42

0001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
0002 /* ***** BEGIN LICENSE BLOCK *****
0003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
0004  *
0005  * The contents of this file are subject to the Mozilla Public License Version
0006  * 1.1 (the "License"); you may not use this file except in compliance with
0007  * the License. You may obtain a copy of the License at
0008  * http://www.mozilla.org/MPL/
0009  *
0010  * Software distributed under the License is distributed on an "AS IS" basis,
0011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
0012  * for the specific language governing rights and limitations under the
0013  * License.
0014  *
0015  * The Original Code is Mozilla Universal charset detector code.
0016  *
0017  * The Initial Developer of the Original Code is
0018  * Netscape Communications Corporation.
0019  * Portions created by the Initial Developer are Copyright (C) 2001
0020  * the Initial Developer. All Rights Reserved.
0021  *
0022  * Contributor(s):
0023  *          Shy Shalom <shooshX@gmail.com>
0024  *
0025  * Alternatively, the contents of this file may be used under the terms of
0026  * either the GNU General Public License Version 2 or later (the "GPL"), or
0027  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
0028  * in which case the provisions of the GPL or the LGPL are applicable instead
0029  * of those above. If you wish to allow use of your version of this file only
0030  * under the terms of either the GPL or the LGPL, and not to allow others to
0031  * use your version of this file under the terms of the MPL, indicate your
0032  * decision by deleting the provisions above and replace them with the notice
0033  * and other provisions required by the GPL or the LGPL. If you do not delete
0034  * the provisions above, a recipient may use your version of this file under
0035  * the terms of any one of the MPL, the GPL or the LGPL.
0036  *
0037  * ***** END LICENSE BLOCK ***** */
0038 
0039 #pragma GCC visibility push(hidden)
0040 
0041 #include <stdio.h>
0042 #include "prmem.h"
0043 
0044 #include "nsMBCSGroupProber.h"
0045 
0046 #ifdef DEBUG_chardet
// Display names of the sub-probers, used only by the debug dump.
// Entries are listed in the same order in which the constructor fills
// mProbers, because DumpStatus() prints ProberName[i] next to mProbers[i].
// Declared as pointers to const: the entries point at string literals,
// which must not be modified (and cannot bind to plain char* since C++11).
const char *ProberName[] =
{
  "UTF8",
  "SJIS",
  "EUCJP",
  "GB18030",
  "EUCKR",
  "Big5",
  "EUCTW",
};
0057 
0058 #endif
0059 
0060 nsMBCSGroupProber::nsMBCSGroupProber()
0061 {
0062   mProbers[0] = new nsUTF8Prober();
0063   mProbers[1] = new nsSJISProber();
0064   mProbers[2] = new nsEUCJPProber();
0065   mProbers[3] = new nsGB18030Prober();
0066   mProbers[4] = new nsEUCKRProber();
0067   mProbers[5] = new nsBig5Prober();
0068   mProbers[6] = new nsEUCTWProber();
0069   Reset();
0070 }
0071 
0072 nsMBCSGroupProber::~nsMBCSGroupProber()
0073 {
0074   for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
0075   {
0076     delete mProbers[i];
0077   }
0078 }
0079 
0080 const char* nsMBCSGroupProber::GetCharSetName()
0081 {
0082   if (mBestGuess == -1)
0083   {
0084     GetConfidence();
0085     if (mBestGuess == -1)
0086       mBestGuess = 0;
0087   }
0088   return mProbers[mBestGuess]->GetCharSetName();
0089 }
0090 
0091 void  nsMBCSGroupProber::Reset(void)
0092 {
0093   mActiveNum = 0;
0094   for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
0095   {
0096     if (mProbers[i])
0097     {
0098       mProbers[i]->Reset();
0099       mIsActive[i] = PR_TRUE;
0100       ++mActiveNum;
0101     }
0102     else
0103       mIsActive[i] = PR_FALSE;
0104   }
0105   mBestGuess = -1;
0106   mState = eDetecting;
0107 }
0108 
0109 nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
0110 {
0111   nsProbingState st;
0112   PRUint32 i;
0113 
0114   //do filtering to reduce load to probers
0115   char *highbyteBuf;
0116   char *hptr;
0117   PRBool keepNext = PR_TRUE;   //assume previous is not ascii, it will do no harm except add some noise
0118   hptr = highbyteBuf = (char*)PR_Malloc(aLen);
0119   if (!hptr)
0120       return mState;
0121   for (i = 0; i < aLen; i++)
0122   {
0123     if (aBuf[i] & 0x80)
0124     {
0125       *hptr++ = aBuf[i];
0126       keepNext = PR_TRUE;
0127     }
0128     else
0129     {
0130       //if previous is highbyte, keep this even it is a ASCII
0131       if (keepNext)
0132       {
0133           *hptr++ = aBuf[i];
0134           keepNext = PR_FALSE;
0135       }
0136     }
0137   }
0138 
0139   for (i = 0; i < NUM_OF_PROBERS; i++)
0140   {
0141      if (!mIsActive[i])
0142        continue;
0143      st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf);
0144      if (st == eFoundIt)
0145      {
0146        mBestGuess = i;
0147        mState = eFoundIt;
0148        break;
0149      }
0150      else if (st == eNotMe)
0151      {
0152        mIsActive[i] = PR_FALSE;
0153        mActiveNum--;
0154        if (mActiveNum <= 0)
0155        {
0156          mState = eNotMe;
0157          break;
0158        }
0159      }
0160   }
0161 
0162   PR_FREEIF(highbyteBuf);
0163 
0164   return mState;
0165 }
0166 
0167 float nsMBCSGroupProber::GetConfidence(void)
0168 {
0169   PRUint32 i;
0170   float bestConf = 0.0, cf;
0171 
0172   switch (mState)
0173   {
0174   case eFoundIt:
0175     return (float)0.99;
0176   case eNotMe:
0177     return (float)0.01;
0178   default:
0179     for (i = 0; i < NUM_OF_PROBERS; i++)
0180     {
0181       if (!mIsActive[i])
0182         continue;
0183       cf = mProbers[i]->GetConfidence();
0184       if (bestConf < cf)
0185       {
0186         bestConf = cf;
0187         mBestGuess = i;
0188       }
0189     }
0190   }
0191   return bestConf;
0192 }
0193 
0194 #ifdef DEBUG_chardet
0195 void nsMBCSGroupProber::DumpStatus()
0196 {
0197   PRUint32 i;
0198   float cf;
0199   
0200   GetConfidence();
0201   for (i = 0; i < NUM_OF_PROBERS; i++)
0202   {
0203     if (!mIsActive[i])
0204       printf("  MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]);
0205     else
0206     {
0207       cf = mProbers[i]->GetConfidence();
0208       printf("  MBCS %1.3f: [%s]\r\n", cf, ProberName[i]);
0209     }
0210   }
0211 }
0212 #endif
0213 
0214 #pragma GCC visibility pop
0215