File indexing completed on 2025-01-05 04:27:35
0001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 0002 /* ***** BEGIN LICENSE BLOCK ***** 0003 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 0004 * 0005 * The contents of this file are subject to the Mozilla Public License Version 0006 * 1.1 (the "License"); you may not use this file except in compliance with 0007 * the License. You may obtain a copy of the License at 0008 * http://www.mozilla.org/MPL/ 0009 * 0010 * Software distributed under the License is distributed on an "AS IS" basis, 0011 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 0012 * for the specific language governing rights and limitations under the 0013 * License. 0014 * 0015 * The Original Code is Mozilla Universal charset detector code. 0016 * 0017 * The Initial Developer of the Original Code is 0018 * Netscape Communications Corporation. 0019 * Portions created by the Initial Developer are Copyright (C) 2001 0020 * the Initial Developer. All Rights Reserved. 0021 * 0022 * Contributor(s): 0023 * Shy Shalom <shooshX@gmail.com> 0024 * 0025 * Alternatively, the contents of this file may be used under the terms of 0026 * either the GNU General Public License Version 2 or later (the "GPL"), or 0027 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 0028 * in which case the provisions of the GPL or the LGPL are applicable instead 0029 * of those above. If you wish to allow use of your version of this file only 0030 * under the terms of either the GPL or the LGPL, and not to allow others to 0031 * use your version of this file under the terms of the MPL, indicate your 0032 * decision by deleting the provisions above and replace them with the notice 0033 * and other provisions required by the GPL or the LGPL. If you do not delete 0034 * the provisions above, a recipient may use your version of this file under 0035 * the terms of any one of the MPL, the GPL or the LGPL. 0036 * 0037 * ***** END LICENSE BLOCK ***** */ 0038 0039 #pragma GCC visibility push(hidden) 0040 0041 #include "nsLatin1Prober.h" 0042 #include "prmem.h" 0043 #include <stdio.h> 0044 0045 #define UDF 0 // undefined 0046 #define OTH 1 //other 0047 #define ASC 2 // ascii capital letter 0048 #define ASS 3 // ascii small letter 0049 #define ACV 4 // accent capital vowel 0050 #define ACO 5 // accent capital other 0051 #define ASV 6 // accent small vowel 0052 #define ASO 7 // accent small other 0053 #define CLASS_NUM 8 // total classes 0054 0055 static unsigned char Latin1_CharToClass[] = 0056 { 0057 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07 0058 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F 0059 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17 0060 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F 0061 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27 0062 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F 0063 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37 0064 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F 0065 OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47 0066 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F 0067 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57 0068 ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F 0069 OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67 0070 ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F 0071 ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77 0072 ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F 0073 OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87 0074 OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F 0075 UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97 0076 OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F 0077 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7 0078 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF 0079 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7 0080 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF 0081 ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7 0082 ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF 0083 ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7 0084 ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF 0085 ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7 0086 ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF 0087 ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7 0088 ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF 0089 }; 0090 0091 0092 /* 0 : illegal 0093 1 : very unlikely 0094 2 : normal 0095 3 : very likely 0096 */ 0097 static unsigned char Latin1ClassModel[] = 0098 { 0099 /* UDF OTH ASC ASS ACV ACO ASV ASO */ 0100 /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0, 0101 /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3, 0102 /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3, 0103 /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3, 0104 /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2, 0105 /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3, 0106 /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3, 0107 /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3, 0108 }; 0109 0110 void nsLatin1Prober::Reset(void) 0111 { 0112 mState = eDetecting; 0113 mLastCharClass = OTH; 0114 for (int i = 0; i < FREQ_CAT_NUM; i++) 0115 mFreqCounter[i] = 0; 0116 } 0117 0118 0119 nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen) 0120 { 0121 char *newBuf1 = 0; 0122 PRUint32 newLen1 = 0; 0123 0124 if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) { 0125 newBuf1 = (char*)aBuf; 0126 newLen1 = aLen; 0127 } 0128 0129 unsigned char charClass; 0130 unsigned char freq; 0131 for (PRUint32 i = 0; i < newLen1; i++) 0132 { 0133 charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]]; 0134 freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass]; 0135 if (freq == 0) { 0136 mState = eNotMe; 0137 break; 0138 } 0139 mFreqCounter[freq]++; 0140 mLastCharClass = charClass; 0141 } 0142 0143 if (newBuf1 != aBuf) 0144 PR_FREEIF(newBuf1); 0145 0146 return mState; 0147 } 0148 0149 float nsLatin1Prober::GetConfidence(void) 0150 { 0151 if (mState == eNotMe) 0152 return 0.01f; 0153 0154 float confidence; 0155 PRUint32 total = 0; 0156 for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++) 0157 total += mFreqCounter[i]; 0158 0159 if(!total) 0160 confidence = 0.0f; 0161 else 0162 { 0163 confidence = mFreqCounter[3]*1.0f / total; 0164 confidence -= mFreqCounter[1]*20.0f/total; 0165 } 0166 0167 if (confidence < 0.0f) 0168 confidence = 0.0f; 0169 0170 // lower the confidence of latin1 so that other more accurate detector 0171 // can take priority. 0172 confidence *= 0.50f; 0173 0174 return confidence; 0175 } 0176 0177 #ifdef DEBUG_chardet 0178 void nsLatin1Prober::DumpStatus() 0179 { 0180 printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); 0181 } 0182 #endif 0183 0184 #pragma GCC visibility pop 0185