File indexing completed on 2024-12-22 04:33:41
0001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 0002 /* ***** BEGIN LICENSE BLOCK ***** 0003 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 0004 * 0005 * The contents of this file are subject to the Mozilla Public License Version 0006 * 1.1 (the "License"); you may not use this file except in compliance with 0007 * the License. You may obtain a copy of the License at 0008 * http://www.mozilla.org/MPL/ 0009 * 0010 * Software distributed under the License is distributed on an "AS IS" basis, 0011 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 0012 * for the specific language governing rights and limitations under the 0013 * License. 0014 * 0015 * The Original Code is Mozilla Universal charset detector code. 0016 * 0017 * The Initial Developer of the Original Code is 0018 * Shy Shalom <shooshX@gmail.com> 0019 * Portions created by the Initial Developer are Copyright (C) 2005 0020 * the Initial Developer. All Rights Reserved. 0021 * 0022 * Contributor(s): 0023 * 0024 * Alternatively, the contents of this file may be used under the terms of 0025 * either the GNU General Public License Version 2 or later (the "GPL"), or 0026 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 0027 * in which case the provisions of the GPL or the LGPL are applicable instead 0028 * of those above. If you wish to allow use of your version of this file only 0029 * under the terms of either the GPL or the LGPL, and not to allow others to 0030 * use your version of this file under the terms of the MPL, indicate your 0031 * decision by deleting the provisions above and replace them with the notice 0032 * and other provisions required by the GPL or the LGPL. If you do not delete 0033 * the provisions above, a recipient may use your version of this file under 0034 * the terms of any one of the MPL, the GPL or the LGPL. 0035 * 0036 * ***** END LICENSE BLOCK ***** */ 0037 0038 #pragma GCC visibility push(hidden) 0039 0040 #include "nsHebrewProber.h" 0041 #include <stdio.h> 0042 0043 // windows-1255 / ISO-8859-8 code points of interest 0044 #define FINAL_KAF ('\xea') 0045 #define NORMAL_KAF ('\xeb') 0046 #define FINAL_MEM ('\xed') 0047 #define NORMAL_MEM ('\xee') 0048 #define FINAL_NUN ('\xef') 0049 #define NORMAL_NUN ('\xf0') 0050 #define FINAL_PE ('\xf3') 0051 #define NORMAL_PE ('\xf4') 0052 #define FINAL_TSADI ('\xf5') 0053 #define NORMAL_TSADI ('\xf6') 0054 0055 // Minimum Visual vs Logical final letter score difference. 0056 // If the difference is below this, don't rely solely on the final letter score distance. 0057 #define MIN_FINAL_CHAR_DISTANCE (5) 0058 0059 // Minimum Visual vs Logical model score difference. 0060 // If the difference is below this, don't rely at all on the model score distance. 0061 #define MIN_MODEL_DISTANCE (0.01) 0062 0063 #define VISUAL_HEBREW_NAME ("ISO-8859-8") 0064 #define LOGICAL_HEBREW_NAME ("windows-1255") 0065 0066 PRBool nsHebrewProber::isFinal(char c) 0067 { 0068 return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); 0069 } 0070 0071 PRBool nsHebrewProber::isNonFinal(char c) 0072 { 0073 return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); 0074 // The normal Tsadi is not a good Non-Final letter due to words like 0075 // 'lechotet' (to chat) containing an apostrophe after the tsadi. This 0076 // apostrophe is converted to a space in FilterWithoutEnglishLetters causing 0077 // the Non-Final tsadi to appear at an end of a word even though this is not 0078 // the case in the original text. 0079 // The letters Pe and Kaf rarely display a related behavior of not being a 0080 // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for 0081 // example legally end with a Non-Final Pe or Kaf. However, the benefit of 0082 // these letters as Non-Final letters outweighs the damage since these words 0083 // are quite rare. 0084 } 0085 0086 /** HandleData 0087 * Final letter analysis for logical-visual decision. 0088 * Look for evidence that the received buffer is either logical Hebrew or 0089 * visual Hebrew. 0090 * The following cases are checked: 0091 * 1) A word longer than 1 letter, ending with a final letter. This is an 0092 * indication that the text is laid out "naturally" since the final letter 0093 * really appears at the end. +1 for logical score. 0094 * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal 0095 * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with 0096 * the Non-Final form of that letter. Exceptions to this rule are mentioned 0097 * above in isNonFinal(). This is an indication that the text is laid out 0098 * backwards. +1 for visual score 0099 * 3) A word longer than 1 letter, starting with a final letter. Final letters 0100 * should not appear at the beginning of a word. This is an indication that 0101 * the text is laid out backwards. +1 for visual score. 0102 * 0103 * The visual score and logical score are accumulated throughout the text and 0104 * are finally checked against each other in GetCharSetName(). 0105 * No checking for final letters in the middle of words is done since that case 0106 * is not an indication for either Logical or Visual text. 0107 * 0108 * The input buffer should not contain any white spaces that are not (' ') 0109 * or any low-ascii punctuation marks. 0110 */ 0111 nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen) 0112 { 0113 // Both model probers say it's not them. No reason to continue. 0114 if (GetState() == eNotMe) 0115 return eNotMe; 0116 0117 const char *curPtr, *endPtr = aBuf+aLen; 0118 char cur; 0119 0120 for (curPtr = (char*)aBuf; curPtr < endPtr; ++curPtr) 0121 { 0122 cur = *curPtr; 0123 if (cur == ' ') // We stand on a space - a word just ended 0124 { 0125 if (mBeforePrev != ' ') // *(curPtr-2) was not a space so prev is not a 1 letter word 0126 { 0127 if (isFinal(mPrev)) // case (1) [-2:not space][-1:final letter][cur:space] 0128 ++mFinalCharLogicalScore; 0129 else if (isNonFinal(mPrev)) // case (2) [-2:not space][-1:Non-Final letter][cur:space] 0130 ++mFinalCharVisualScore; 0131 } 0132 } 0133 else // Not standing on a space 0134 { 0135 if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) // case (3) [-2:space][-1:final letter][cur:not space] 0136 ++mFinalCharVisualScore; 0137 } 0138 mBeforePrev = mPrev; 0139 mPrev = cur; 0140 } 0141 0142 // Forever detecting, till the end or until both model probers return eNotMe (handled above). 0143 return eDetecting; 0144 } 0145 0146 // Make the decision: is it Logical or Visual? 0147 const char* nsHebrewProber::GetCharSetName() 0148 { 0149 // If the final letter score distance is dominant enough, rely on it. 0150 PRInt32 finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; 0151 if (finalsub >= MIN_FINAL_CHAR_DISTANCE) 0152 return LOGICAL_HEBREW_NAME; 0153 if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) 0154 return VISUAL_HEBREW_NAME; 0155 0156 // It's not dominant enough, try to rely on the model scores instead. 0157 float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); 0158 if (modelsub > MIN_MODEL_DISTANCE) 0159 return LOGICAL_HEBREW_NAME; 0160 if (modelsub < -(MIN_MODEL_DISTANCE)) 0161 return VISUAL_HEBREW_NAME; 0162 0163 // Still no good, back to final letter distance, maybe it'll save the day. 0164 if (finalsub < 0) 0165 return VISUAL_HEBREW_NAME; 0166 0167 // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. 0168 return LOGICAL_HEBREW_NAME; 0169 } 0170 0171 0172 void nsHebrewProber::Reset(void) 0173 { 0174 mFinalCharLogicalScore = 0; 0175 mFinalCharVisualScore = 0; 0176 0177 // mPrev and mBeforePrev are initialized to space in order to simulate a word 0178 // delimiter at the beginning of the data 0179 mPrev = ' '; 0180 mBeforePrev = ' '; 0181 } 0182 0183 nsProbingState nsHebrewProber::GetState(void) 0184 { 0185 // Remain active as long as any of the model probers are active. 0186 if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) 0187 return eNotMe; 0188 return eDetecting; 0189 } 0190 0191 #ifdef DEBUG_chardet 0192 void nsHebrewProber::DumpStatus() 0193 { 0194 printf(" HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore); 0195 } 0196 #endif 0197 0198 #pragma GCC visibility pop 0199