File indexing completed on 2025-01-05 04:27:36
0001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 0002 /* ***** BEGIN LICENSE BLOCK ***** 0003 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 0004 * 0005 * The contents of this file are subject to the Mozilla Public License Version 0006 * 1.1 (the "License"); you may not use this file except in compliance with 0007 * the License. You may obtain a copy of the License at 0008 * http://www.mozilla.org/MPL/ 0009 * 0010 * Software distributed under the License is distributed on an "AS IS" basis, 0011 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 0012 * for the specific language governing rights and limitations under the 0013 * License. 0014 * 0015 * The Original Code is Mozilla Universal charset detector code. 0016 * 0017 * The Initial Developer of the Original Code is 0018 * Netscape Communications Corporation. 0019 * Portions created by the Initial Developer are Copyright (C) 2001 0020 * the Initial Developer. All Rights Reserved. 0021 * 0022 * Contributor(s): 0023 * Shy Shalom <shooshX@gmail.com> 0024 * 0025 * Alternatively, the contents of this file may be used under the terms of 0026 * either the GNU General Public License Version 2 or later (the "GPL"), or 0027 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 0028 * in which case the provisions of the GPL or the LGPL are applicable instead 0029 * of those above. If you wish to allow use of your version of this file only 0030 * under the terms of either the GPL or the LGPL, and not to allow others to 0031 * use your version of this file under the terms of the MPL, indicate your 0032 * decision by deleting the provisions above and replace them with the notice 0033 * and other provisions required by the GPL or the LGPL. If you do not delete 0034 * the provisions above, a recipient may use your version of this file under 0035 * the terms of any one of the MPL, the GPL or the LGPL. 0036 * 0037 * ***** END LICENSE BLOCK ***** */ 0038 0039 #pragma GCC visibility push(hidden) 0040 0041 #include "nscore.h" 0042 0043 #include "nsUniversalDetector.h" 0044 0045 #include "nsMBCSGroupProber.h" 0046 #include "nsSBCSGroupProber.h" 0047 #include "nsEscCharsetProber.h" 0048 #include "nsLatin1Prober.h" 0049 0050 nsUniversalDetector::nsUniversalDetector() 0051 { 0052 mDone = PR_FALSE; 0053 mBestGuess = -1; //illegal value as signal 0054 mInTag = PR_FALSE; 0055 mEscCharSetProber = nsnull; 0056 0057 mStart = PR_TRUE; 0058 mDetectedCharset = nsnull; 0059 mGotData = PR_FALSE; 0060 mInputState = ePureAscii; 0061 mLastChar = '\0'; 0062 0063 PRUint32 i; 0064 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 0065 mCharSetProbers[i] = nsnull; 0066 } 0067 0068 nsUniversalDetector::~nsUniversalDetector() 0069 { 0070 for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 0071 if (mCharSetProbers[i]) 0072 delete mCharSetProbers[i]; 0073 if (mEscCharSetProber) 0074 delete mEscCharSetProber; 0075 } 0076 0077 void 0078 nsUniversalDetector::Reset() 0079 { 0080 mDone = PR_FALSE; 0081 mBestGuess = -1; //illegal value as signal 0082 mInTag = PR_FALSE; 0083 0084 mStart = PR_TRUE; 0085 mDetectedCharset = nsnull; 0086 mGotData = PR_FALSE; 0087 mInputState = ePureAscii; 0088 mLastChar = '\0'; 0089 0090 if (mEscCharSetProber) 0091 mEscCharSetProber->Reset(); 0092 0093 PRUint32 i; 0094 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 0095 if (mCharSetProbers[i]) 0096 mCharSetProbers[i]->Reset(); 0097 } 0098 0099 //--------------------------------------------------------------------- 0100 #define SHORTCUT_THRESHOLD (float)0.95 0101 #define MINIMUM_THRESHOLD (float)0.20 0102 0103 nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) 0104 { 0105 if(mDone) 0106 return NS_OK; 0107 0108 if (aLen > 0) 0109 mGotData = PR_TRUE; 0110 0111 //If the data starts with BOM, we know it is UTF 0112 if (mStart) 0113 { 0114 mStart = PR_FALSE; 0115 if (aLen > 3) 0116 switch (aBuf[0]) 0117 { 0118 case '\xEF': 0119 if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) 0120 // EF BB BF UTF-8 encoded BOM 0121 mDetectedCharset = "UTF-8"; 0122 break; 0123 case '\xFE': 0124 if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 0125 // FE FF 00 00 UCS-4, unusual octet order BOM (3412) 0126 mDetectedCharset = "X-ISO-10646-UCS-4-3412"; 0127 else if ('\xFF' == aBuf[1]) 0128 // FE FF UTF-16, big endian BOM 0129 mDetectedCharset = "UTF-16BE"; 0130 break; 0131 case '\x00': 0132 if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3])) 0133 // 00 00 FE FF UTF-32, big-endian BOM 0134 mDetectedCharset = "UTF-32BE"; 0135 else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3])) 0136 // 00 00 FF FE UCS-4, unusual octet order BOM (2143) 0137 mDetectedCharset = "X-ISO-10646-UCS-4-2143"; 0138 break; 0139 case '\xFF': 0140 if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 0141 // FF FE 00 00 UTF-32, little-endian BOM 0142 mDetectedCharset = "UTF-32LE"; 0143 else if ('\xFE' == aBuf[1]) 0144 // FF FE UTF-16, little endian BOM 0145 mDetectedCharset = "UTF-16LE"; 0146 break; 0147 } // switch 0148 0149 if (mDetectedCharset) 0150 { 0151 mDone = PR_TRUE; 0152 return NS_OK; 0153 } 0154 } 0155 0156 PRUint32 i; 0157 for (i = 0; i < aLen; i++) 0158 { 0159 //other than 0xa0, if every other character is ascii, the page is ascii 0160 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP 0161 { 0162 //we got a non-ascii byte (high-byte) 0163 if (mInputState != eHighbyte) 0164 { 0165 //adjust state 0166 mInputState = eHighbyte; 0167 0168 //kill mEscCharSetProber if it is active 0169 if (mEscCharSetProber) { 0170 delete mEscCharSetProber; 0171 mEscCharSetProber = nsnull; 0172 } 0173 0174 //start multibyte and singlebyte charset prober 0175 if (nsnull == mCharSetProbers[0]) 0176 mCharSetProbers[0] = new nsMBCSGroupProber; 0177 if (nsnull == mCharSetProbers[1]) 0178 mCharSetProbers[1] = new nsSBCSGroupProber; 0179 if (nsnull == mCharSetProbers[2]) 0180 mCharSetProbers[2] = new nsLatin1Prober; 0181 0182 if ((nsnull == mCharSetProbers[0]) || 0183 (nsnull == mCharSetProbers[1]) || 0184 (nsnull == mCharSetProbers[2])) 0185 return NS_ERROR_OUT_OF_MEMORY; 0186 } 0187 } 0188 else 0189 { 0190 //ok, just pure ascii so far 0191 if ( ePureAscii == mInputState && 0192 (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) 0193 { 0194 //found escape character or HZ "~{" 0195 mInputState = eEscAscii; 0196 } 0197 mLastChar = aBuf[i]; 0198 } 0199 } 0200 0201 nsProbingState st; 0202 switch (mInputState) 0203 { 0204 case eEscAscii: 0205 if (nsnull == mEscCharSetProber) { 0206 mEscCharSetProber = new nsEscCharSetProber; 0207 if (nsnull == mEscCharSetProber) 0208 return NS_ERROR_OUT_OF_MEMORY; 0209 } 0210 st = mEscCharSetProber->HandleData(aBuf, aLen); 0211 if (st == eFoundIt) 0212 { 0213 mDone = PR_TRUE; 0214 mDetectedCharset = mEscCharSetProber->GetCharSetName(); 0215 } 0216 break; 0217 case eHighbyte: 0218 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 0219 { 0220 st = mCharSetProbers[i]->HandleData(aBuf, aLen); 0221 if (st == eFoundIt) 0222 { 0223 mDone = PR_TRUE; 0224 mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); 0225 return NS_OK; 0226 } 0227 } 0228 break; 0229 0230 default: //pure ascii 0231 ;//do nothing here 0232 } 0233 return NS_OK; 0234 } 0235 0236 0237 //--------------------------------------------------------------------- 0238 void nsUniversalDetector::DataEnd() 0239 { 0240 if (!mGotData) 0241 { 0242 // we haven't got any data yet, return immediately 0243 // caller program sometimes call DataEnd before anything has been sent to detector 0244 return; 0245 } 0246 0247 if (mDetectedCharset) 0248 { 0249 mDone = PR_TRUE; 0250 Report(mDetectedCharset); 0251 return; 0252 } 0253 0254 switch (mInputState) 0255 { 0256 case eHighbyte: 0257 { 0258 float proberConfidence; 0259 float maxProberConfidence = (float)0.0; 0260 PRInt32 maxProber = 0; 0261 0262 for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 0263 { 0264 proberConfidence = mCharSetProbers[i]->GetConfidence(); 0265 if (proberConfidence > maxProberConfidence) 0266 { 0267 maxProberConfidence = proberConfidence; 0268 maxProber = i; 0269 } 0270 } 0271 //do not report anything because we are not confident of it, that's in fact a negative answer 0272 if (maxProberConfidence > MINIMUM_THRESHOLD) 0273 Report(mCharSetProbers[maxProber]->GetCharSetName()); 0274 } 0275 break; 0276 case eEscAscii: 0277 break; 0278 default: 0279 ; 0280 } 0281 return; 0282 } 0283 0284 #pragma GCC visibility pop 0285