File indexing completed on 2024-12-22 04:33:42

0001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
0002 /* ***** BEGIN LICENSE BLOCK *****
0003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
0004  *
0005  * The contents of this file are subject to the Mozilla Public License Version
0006  * 1.1 (the "License"); you may not use this file except in compliance with
0007  * the License. You may obtain a copy of the License at
0008  * http://www.mozilla.org/MPL/
0009  *
0010  * Software distributed under the License is distributed on an "AS IS" basis,
0011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
0012  * for the specific language governing rights and limitations under the
0013  * License.
0014  *
0015  * The Original Code is Mozilla Universal charset detector code.
0016  *
0017  * The Initial Developer of the Original Code is
0018  * Netscape Communications Corporation.
0019  * Portions created by the Initial Developer are Copyright (C) 2001
0020  * the Initial Developer. All Rights Reserved.
0021  *
0022  * Contributor(s):
0023  *          Shy Shalom <shooshX@gmail.com>
0024  *
0025  * Alternatively, the contents of this file may be used under the terms of
0026  * either the GNU General Public License Version 2 or later (the "GPL"), or
0027  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
0028  * in which case the provisions of the GPL or the LGPL are applicable instead
0029  * of those above. If you wish to allow use of your version of this file only
0030  * under the terms of either the GPL or the LGPL, and not to allow others to
0031  * use your version of this file under the terms of the MPL, indicate your
0032  * decision by deleting the provisions above and replace them with the notice
0033  * and other provisions required by the GPL or the LGPL. If you do not delete
0034  * the provisions above, a recipient may use your version of this file under
0035  * the terms of any one of the MPL, the GPL or the LGPL.
0036  *
0037  * ***** END LICENSE BLOCK ***** */
0038 
0039 #pragma GCC visibility push(hidden)
0040 
0041 #include <stdio.h>
0042 #include "prmem.h"
0043 
0044 #include "nsSBCharSetProber.h"
0045 #include "nsSBCSGroupProber.h"
0046 
0047 #include "nsHebrewProber.h"
0048 
0049 nsSBCSGroupProber::nsSBCSGroupProber()
0050 {
0051   mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model);
0052   mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel);
0053   mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model);
0054   mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
0055   mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model);
0056   mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model);
0057   mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model);
0058   mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
0059   mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
0060   mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
0061 
0062   nsHebrewProber *hebprober = new nsHebrewProber();
0063   // Notice: Any change in these indexes - 10,11,12 must be reflected
0064   // in the code below as well.
0065   mProbers[10] = hebprober;
0066   mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew
0067   mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew
0068   // Tell the Hebrew prober about the logical and visual probers
0069   if (mProbers[10] && mProbers[11] && mProbers[12]) // all are not null
0070   {
0071     hebprober->SetModelProbers(mProbers[11], mProbers[12]);
0072   }
0073   else // One or more is null. avoid any Hebrew probing, null them all
0074   {
0075     for (PRUint32 i = 10; i <= 12; ++i)
0076     { 
0077       delete mProbers[i]; 
0078       mProbers[i] = 0; 
0079     }
0080   }
0081 
0082   // disable latin2 before latin1 is available, otherwise all latin1 
0083   // will be detected as latin2 because of their similarity.
0084   //mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
0085   //mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
0086 
0087   Reset();
0088 }
0089 
0090 nsSBCSGroupProber::~nsSBCSGroupProber()
0091 {
0092   for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
0093   {
0094     delete mProbers[i];
0095   }
0096 }
0097 
0098 
0099 const char* nsSBCSGroupProber::GetCharSetName()
0100 {
0101   //if we have no answer yet
0102   if (mBestGuess == -1)
0103   {
0104     GetConfidence();
0105     //no charset seems positive
0106     if (mBestGuess == -1)
0107       //we will use default.
0108       mBestGuess = 0;
0109   }
0110   return mProbers[mBestGuess]->GetCharSetName();
0111 }
0112 
0113 void  nsSBCSGroupProber::Reset(void)
0114 {
0115   mActiveNum = 0;
0116   for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
0117   {
0118     if (mProbers[i]) // not null
0119     {
0120       mProbers[i]->Reset();
0121       mIsActive[i] = PR_TRUE;
0122       ++mActiveNum;
0123     }
0124     else
0125       mIsActive[i] = PR_FALSE;
0126   }
0127   mBestGuess = -1;
0128   mState = eDetecting;
0129 }
0130 
0131 
0132 nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
0133 {
0134   nsProbingState st;
0135   PRUint32 i;
0136   char *newBuf1 = 0;
0137   PRUint32 newLen1 = 0;
0138 
0139   //apply filter to original buffer, and we got new buffer back
0140   //depend on what script it is, we will feed them the new buffer 
0141   //we got after applying proper filter
0142   //this is done without any consideration to KeepEnglishLetters
0143   //of each prober since as of now, there are no probers here which
0144   //recognize languages with English characters.
0145   if (!FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1))
0146     goto done;
0147   
0148   if (newLen1 == 0)
0149     goto done; // Nothing to see here, move on.
0150 
0151   for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
0152   {
0153      if (!mIsActive[i])
0154        continue;
0155      st = mProbers[i]->HandleData(newBuf1, newLen1);
0156      if (st == eFoundIt)
0157      {
0158        mBestGuess = i;
0159        mState = eFoundIt;
0160        break;
0161      }
0162      else if (st == eNotMe)
0163      {
0164        mIsActive[i] = PR_FALSE;
0165        mActiveNum--;
0166        if (mActiveNum <= 0)
0167        {
0168          mState = eNotMe;
0169          break;
0170        }
0171      }
0172   }
0173 
0174 done:
0175   PR_FREEIF(newBuf1);
0176 
0177   return mState;
0178 }
0179 
0180 float nsSBCSGroupProber::GetConfidence(void)
0181 {
0182   PRUint32 i;
0183   float bestConf = 0.0, cf;
0184 
0185   switch (mState)
0186   {
0187   case eFoundIt:
0188     return (float)0.99; //sure yes
0189   case eNotMe:
0190     return (float)0.01;  //sure no
0191   default:
0192     for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
0193     {
0194       if (!mIsActive[i])
0195         continue;
0196       cf = mProbers[i]->GetConfidence();
0197       if (bestConf < cf)
0198       {
0199         bestConf = cf;
0200         mBestGuess = i;
0201       }
0202     }
0203   }
0204   return bestConf;
0205 }
0206 
0207 #ifdef DEBUG_chardet
0208 void nsSBCSGroupProber::DumpStatus()
0209 {
0210   PRUint32 i;
0211   float cf;
0212   
0213   cf = GetConfidence();
0214   printf(" SBCS Group Prober --------begin status \r\n");
0215   for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
0216   {
0217     if (!mIsActive[i])
0218       printf("  inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName());
0219     else
0220       mProbers[i]->DumpStatus();
0221   }
0222   printf(" SBCS Group found best match [%s] confidence %f.\r\n",  
0223          mProbers[mBestGuess]->GetCharSetName(), cf);
0224 }
0225 #endif
0226 
0227 #pragma GCC visibility pop
0228