File indexing completed on 2025-01-05 04:27:35

0001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
0002 /* ***** BEGIN LICENSE BLOCK *****
0003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
0004  *
0005  * The contents of this file are subject to the Mozilla Public License Version
0006  * 1.1 (the "License"); you may not use this file except in compliance with
0007  * the License. You may obtain a copy of the License at
0008  * http://www.mozilla.org/MPL/
0009  *
0010  * Software distributed under the License is distributed on an "AS IS" basis,
0011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
0012  * for the specific language governing rights and limitations under the
0013  * License.
0014  *
0015  * The Original Code is Mozilla Universal charset detector code.
0016  *
0017  * The Initial Developer of the Original Code is
0018  * Netscape Communications Corporation.
0019  * Portions created by the Initial Developer are Copyright (C) 2001
0020  * the Initial Developer. All Rights Reserved.
0021  *
0022  * Contributor(s):
0023  *          Shy Shalom <shooshX@gmail.com>
0024  *
0025  * Alternatively, the contents of this file may be used under the terms of
0026  * either the GNU General Public License Version 2 or later (the "GPL"), or
0027  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
0028  * in which case the provisions of the GPL or the LGPL are applicable instead
0029  * of those above. If you wish to allow use of your version of this file only
0030  * under the terms of either the GPL or the LGPL, and not to allow others to
0031  * use your version of this file under the terms of the MPL, indicate your
0032  * decision by deleting the provisions above and replace them with the notice
0033  * and other provisions required by the GPL or the LGPL. If you do not delete
0034  * the provisions above, a recipient may use your version of this file under
0035  * the terms of any one of the MPL, the GPL or the LGPL.
0036  *
0037  * ***** END LICENSE BLOCK ***** */
0038 
0039 #pragma GCC visibility push(hidden)
0040 
0041 #include "nsLatin1Prober.h"
0042 #include "prmem.h"
0043 #include <stdio.h>
0044 
// Character classes assigned to each byte value by Latin1_CharToClass and
// used as row/column indices into Latin1ClassModel.
//
// These are enumerators rather than preprocessor macros so that the very
// short names (OTH, ASC, ASS, ...) obey normal C++ scoping instead of being
// textually substituted in everything that follows in the translation unit.
// The numeric values are unchanged.
enum {
  UDF       = 0,  // undefined
  OTH       = 1,  // other
  ASC       = 2,  // ascii capital letter
  ASS       = 3,  // ascii small letter
  ACV       = 4,  // accent capital vowel
  ACO       = 5,  // accent capital other
  ASV       = 6,  // accent small vowel
  ASO       = 7,  // accent small other
  CLASS_NUM = 8   // total classes
};
0054 
0055 static unsigned char Latin1_CharToClass[] = 
0056 {
0057   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 00 - 07
0058   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 08 - 0F
0059   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 10 - 17
0060   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 18 - 1F
0061   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 20 - 27
0062   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 28 - 2F
0063   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 30 - 37
0064   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 38 - 3F
0065   OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 40 - 47
0066   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 48 - 4F
0067   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 50 - 57
0068   ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   // 58 - 5F
0069   OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 60 - 67
0070   ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 68 - 6F
0071   ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 70 - 77
0072   ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   // 78 - 7F
0073   OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   // 80 - 87
0074   OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   // 88 - 8F
0075   UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 90 - 97
0076   OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   // 98 - 9F
0077   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A0 - A7
0078   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A8 - AF
0079   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B0 - B7
0080   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B8 - BF
0081   ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   // C0 - C7
0082   ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   // C8 - CF
0083   ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   // D0 - D7
0084   ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   // D8 - DF
0085   ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   // E0 - E7
0086   ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   // E8 - EF
0087   ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   // F0 - F7
0088   ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   // F8 - FF
0089 };
0090 
0091 
/* Likelihood of one character class being followed by another.

   The table is a flattened CLASS_NUM x CLASS_NUM matrix: the row is the
   class of the previous character, the column is the class of the current
   character, i.e. the entry for a pair is at [previous*CLASS_NUM + current].

   Cell values:
   0 : illegal 
   1 : very unlikely 
   2 : normal 
   3 : very likely

   The table is read-only lookup data, so it is declared const.
*/
static const unsigned char Latin1ClassModel[] = 
{
/*      UDF OTH ASC ASS ACV ACO ASV ASO  */
/*UDF*/  0,  0,  0,  0,  0,  0,  0,  0,
/*OTH*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASC*/  0,  3,  3,  3,  3,  3,  3,  3, 
/*ASS*/  0,  3,  3,  3,  1,  1,  3,  3,
/*ACV*/  0,  3,  3,  3,  1,  2,  1,  2,
/*ACO*/  0,  3,  3,  3,  3,  3,  3,  3, 
/*ASV*/  0,  3,  1,  3,  1,  1,  1,  3, 
/*ASO*/  0,  3,  1,  3,  1,  1,  3,  3,
};
0109 
0110 void  nsLatin1Prober::Reset(void)
0111 {
0112   mState = eDetecting;
0113   mLastCharClass = OTH;
0114   for (int i = 0; i < FREQ_CAT_NUM; i++)
0115     mFreqCounter[i] = 0;
0116 }
0117 
0118 
0119 nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
0120 {
0121   char *newBuf1 = 0;
0122   PRUint32 newLen1 = 0;
0123 
0124   if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
0125     newBuf1 = (char*)aBuf;
0126     newLen1 = aLen;
0127   }
0128   
0129   unsigned char charClass;
0130   unsigned char freq;
0131   for (PRUint32 i = 0; i < newLen1; i++)
0132   {
0133     charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
0134     freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass];
0135     if (freq == 0) {
0136       mState = eNotMe;
0137       break;
0138     }
0139     mFreqCounter[freq]++;
0140     mLastCharClass = charClass;
0141   }
0142 
0143   if (newBuf1 != aBuf)
0144     PR_FREEIF(newBuf1);
0145 
0146   return mState;
0147 }
0148 
0149 float nsLatin1Prober::GetConfidence(void)
0150 {
0151   if (mState == eNotMe)
0152     return 0.01f;
0153   
0154   float confidence;
0155   PRUint32 total = 0;
0156   for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++)
0157     total += mFreqCounter[i];
0158 
0159   if(!total)
0160     confidence = 0.0f;
0161   else
0162   {
0163     confidence = mFreqCounter[3]*1.0f / total;
0164     confidence -= mFreqCounter[1]*20.0f/total;
0165   }
0166 
0167   if (confidence < 0.0f)
0168     confidence = 0.0f;
0169   
0170   // lower the confidence of latin1 so that other more accurate detector 
0171   // can take priority.
0172   confidence *= 0.50f;
0173 
0174   return confidence;
0175 }
0176 
0177 #ifdef DEBUG_chardet
0178 void  nsLatin1Prober::DumpStatus()
0179 {
0180   printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName());
0181 }
0182 #endif
0183 
0184 #pragma GCC visibility pop
0185