charset-detector/src/nsUniversalDetector.cpp

0001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
0002 /* ***** BEGIN LICENSE BLOCK *****
0003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
0004  *
0005  * The contents of this file are subject to the Mozilla Public License Version
0006  * 1.1 (the "License"); you may not use this file except in compliance with
0007  * the License. You may obtain a copy of the License at
0008  * http://www.mozilla.org/MPL/
0009  *
0010  * Software distributed under the License is distributed on an "AS IS" basis,
0011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
0012  * for the specific language governing rights and limitations under the
0013  * License.
0014  *
0015  * The Original Code is Mozilla Universal charset detector code.
0016  *
0017  * The Initial Developer of the Original Code is
0018  * Netscape Communications Corporation.
0019  * Portions created by the Initial Developer are Copyright (C) 2001
0020  * the Initial Developer. All Rights Reserved.
0021  *
0022  * Contributor(s):
0023  *          Shy Shalom <shooshX@gmail.com>
0024  *
0025  * Alternatively, the contents of this file may be used under the terms of
0026  * either the GNU General Public License Version 2 or later (the "GPL"), or
0027  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
0028  * in which case the provisions of the GPL or the LGPL are applicable instead
0029  * of those above. If you wish to allow use of your version of this file only
0030  * under the terms of either the GPL or the LGPL, and not to allow others to
0031  * use your version of this file under the terms of the MPL, indicate your
0032  * decision by deleting the provisions above and replace them with the notice
0033  * and other provisions required by the GPL or the LGPL. If you do not delete
0034  * the provisions above, a recipient may use your version of this file under
0035  * the terms of any one of the MPL, the GPL or the LGPL.
0036  *
0037  * ***** END LICENSE BLOCK ***** */
0038
0039 #pragma GCC visibility push(hidden)
0040
0041 #include "nscore.h"
0042
0043 #include "nsUniversalDetector.h"
0044
0045 #include "nsMBCSGroupProber.h"
0046 #include "nsSBCSGroupProber.h"
0047 #include "nsEscCharsetProber.h"
0048 #include "nsLatin1Prober.h"
0049
0050 nsUniversalDetector::nsUniversalDetector()
0051 {
0052   mDone = PR_FALSE;
0053   mBestGuess = -1;   //illegal value as signal
0054   mInTag = PR_FALSE;
0055   mEscCharSetProber = nsnull;
0056
0057   mStart = PR_TRUE;
0058   mDetectedCharset = nsnull;
0059   mGotData = PR_FALSE;
0060   mInputState = ePureAscii;
0061   mLastChar = '\0';
0062
0063   PRUint32 i;
0064   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
0065     mCharSetProbers[i] = nsnull;
0066 }
0067
0068 nsUniversalDetector::~nsUniversalDetector()
0069 {
0070   for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
0071     if (mCharSetProbers[i])
0072       delete mCharSetProbers[i];
0073   if (mEscCharSetProber)
0074     delete mEscCharSetProber;
0075 }
0076
0077 void
0078 nsUniversalDetector::Reset()
0079 {
0080   mDone = PR_FALSE;
0081   mBestGuess = -1;   //illegal value as signal
0082   mInTag = PR_FALSE;
0083
0084   mStart = PR_TRUE;
0085   mDetectedCharset = nsnull;
0086   mGotData = PR_FALSE;
0087   mInputState = ePureAscii;
0088   mLastChar = '\0';
0089
0090   if (mEscCharSetProber)
0091     mEscCharSetProber->Reset();
0092
0093   PRUint32 i;
0094   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
0095     if (mCharSetProbers[i])
0096       mCharSetProbers[i]->Reset();
0097 }
0098
0099 //---------------------------------------------------------------------
// Confidence thresholds for prober results.
// NOTE(review): SHORTCUT_THRESHOLD is not referenced anywhere in the code
// visible in this file; presumably a high-confidence early-exit level -
// confirm where (or whether) it is still used.
#define SHORTCUT_THRESHOLD      (float)0.95
// DataEnd() reports the best prober's charset only when that prober's
// confidence is strictly greater than this value.
#define MINIMUM_THRESHOLD      (float)0.20
0102
0103 nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
0104 {
0105   if(mDone)
0106     return NS_OK;
0107
0108   if (aLen > 0)
0109     mGotData = PR_TRUE;
0110
0111   //If the data starts with BOM, we know it is UTF
0112   if (mStart)
0113   {
0114     mStart = PR_FALSE;
0115     if (aLen > 3)
0116       switch (aBuf[0])
0117         {
0118         case '\xEF':
0119           if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
0120             // EF BB BF  UTF-8 encoded BOM
0121             mDetectedCharset = "UTF-8";
0122         break;
0123         case '\xFE':
0124           if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
0125             // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
0126             mDetectedCharset = "X-ISO-10646-UCS-4-3412";
0127           else if ('\xFF' == aBuf[1])
0128             // FE FF  UTF-16, big endian BOM
0129             mDetectedCharset = "UTF-16BE";
0130         break;
0131         case '\x00':
0132           if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
0133             // 00 00 FE FF  UTF-32, big-endian BOM
0134             mDetectedCharset = "UTF-32BE";
0135           else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
0136             // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
0137             mDetectedCharset = "X-ISO-10646-UCS-4-2143";
0138         break;
0139         case '\xFF':
0140           if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
0141             // FF FE 00 00  UTF-32, little-endian BOM
0142             mDetectedCharset = "UTF-32LE";
0143           else if ('\xFE' == aBuf[1])
0144             // FF FE  UTF-16, little endian BOM
0145             mDetectedCharset = "UTF-16LE";
0146         break;
0147       }  // switch
0148
0149       if (mDetectedCharset)
0150       {
0151         mDone = PR_TRUE;
0152         return NS_OK;
0153       }
0154   }
0155
0156   PRUint32 i;
0157   for (i = 0; i < aLen; i++)
0158   {
0159     //other than 0xa0, if every other character is ascii, the page is ascii
0160     if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')  //Since many Ascii only page contains NBSP
0161     {
0162       //we got a non-ascii byte (high-byte)
0163       if (mInputState != eHighbyte)
0164       {
0165         //adjust state
0166         mInputState = eHighbyte;
0167
0168         //kill mEscCharSetProber if it is active
0169         if (mEscCharSetProber) {
0170           delete mEscCharSetProber;
0171           mEscCharSetProber = nsnull;
0172         }
0173
0174         //start multibyte and singlebyte charset prober
0175         if (nsnull == mCharSetProbers[0])
0176           mCharSetProbers[0] = new nsMBCSGroupProber;
0177         if (nsnull == mCharSetProbers[1])
0178           mCharSetProbers[1] = new nsSBCSGroupProber;
0179         if (nsnull == mCharSetProbers[2])
0180           mCharSetProbers[2] = new nsLatin1Prober;
0181
0182         if ((nsnull == mCharSetProbers[0]) ||
0183             (nsnull == mCharSetProbers[1]) ||
0184             (nsnull == mCharSetProbers[2]))
0185             return NS_ERROR_OUT_OF_MEMORY;
0186       }
0187     }
0188     else
0189     {
0190       //ok, just pure ascii so far
0191       if ( ePureAscii == mInputState &&
0192         (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
0193       {
0194         //found escape character or HZ "~{"
0195         mInputState = eEscAscii;
0196       }
0197       mLastChar = aBuf[i];
0198     }
0199   }
0200
0201   nsProbingState st;
0202   switch (mInputState)
0203   {
0204   case eEscAscii:
0205     if (nsnull == mEscCharSetProber) {
0206       mEscCharSetProber = new nsEscCharSetProber;
0207       if (nsnull == mEscCharSetProber)
0208         return NS_ERROR_OUT_OF_MEMORY;
0209     }
0210     st = mEscCharSetProber->HandleData(aBuf, aLen);
0211     if (st == eFoundIt)
0212     {
0213       mDone = PR_TRUE;
0214       mDetectedCharset = mEscCharSetProber->GetCharSetName();
0215     }
0216     break;
0217   case eHighbyte:
0218     for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
0219     {
0220       st = mCharSetProbers[i]->HandleData(aBuf, aLen);
0221       if (st == eFoundIt)
0222       {
0223         mDone = PR_TRUE;
0224         mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
0225         return NS_OK;
0226       }
0227     }
0228     break;
0229
0230   default:  //pure ascii
0231     ;//do nothing here
0232   }
0233   return NS_OK;
0234 }
0235
0236
0237 //---------------------------------------------------------------------
0238 void nsUniversalDetector::DataEnd()
0239 {
0240   if (!mGotData)
0241   {
0242     // we haven't got any data yet, return immediately
0243     // caller program sometimes call DataEnd before anything has been sent to detector
0244     return;
0245   }
0246
0247   if (mDetectedCharset)
0248   {
0249     mDone = PR_TRUE;
0250     Report(mDetectedCharset);
0251     return;
0252   }
0253
0254   switch (mInputState)
0255   {
0256   case eHighbyte:
0257     {
0258       float proberConfidence;
0259       float maxProberConfidence = (float)0.0;
0260       PRInt32 maxProber = 0;
0261
0262       for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
0263       {
0264         proberConfidence = mCharSetProbers[i]->GetConfidence();
0265         if (proberConfidence > maxProberConfidence)
0266         {
0267           maxProberConfidence = proberConfidence;
0268           maxProber = i;
0269         }
0270       }
0271       //do not report anything because we are not confident of it, that's in fact a negative answer
0272       if (maxProberConfidence > MINIMUM_THRESHOLD)
0273         Report(mCharSetProbers[maxProber]->GetCharSetName());
0274     }
0275     break;
0276   case eEscAscii:
0277     break;
0278   default:
0279     ;
0280   }
0281   return;
0282 }
0283
0284 #pragma GCC visibility pop
0285