File indexing completed on 2024-04-28 03:53:03
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> 0003 0004 SPDX-License-Identifier: MIT 0005 */ 0006 0007 #include "nsHebrewProber.h" 0008 #include <stdio.h> 0009 0010 // windows-1255 / ISO-8859-8 code points of interest 0011 #define FINAL_KAF ('\xea') 0012 #define NORMAL_KAF ('\xeb') 0013 #define FINAL_MEM ('\xed') 0014 #define NORMAL_MEM ('\xee') 0015 #define FINAL_NUN ('\xef') 0016 #define NORMAL_NUN ('\xf0') 0017 #define FINAL_PE ('\xf3') 0018 #define NORMAL_PE ('\xf4') 0019 #define FINAL_TSADI ('\xf5') 0020 #define NORMAL_TSADI ('\xf6') 0021 0022 // Minimum Visual vs Logical final letter score difference. 0023 // If the difference is below this, don't rely solely on the final letter score distance. 0024 #define MIN_FINAL_CHAR_DISTANCE (5) 0025 0026 // Minimum Visual vs Logical model score difference. 0027 // If the difference is below this, don't rely at all on the model score distance. 0028 #define MIN_MODEL_DISTANCE (0.01) 0029 0030 #define VISUAL_HEBREW_NAME ("ISO-8859-8") 0031 #define LOGICAL_HEBREW_NAME ("windows-1255") 0032 0033 namespace kencodingprober 0034 { 0035 bool nsHebrewProber::isFinal(char c) 0036 { 0037 return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); 0038 } 0039 0040 bool nsHebrewProber::isNonFinal(char c) 0041 { 0042 return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); 0043 // The normal Tsadi is not a good Non-Final letter due to words like 0044 // 'lechotet' (to chat) containing an apostrophe after the tsadi. This 0045 // apostrophe is converted to a space in FilterWithoutEnglishLetters causing 0046 // the Non-Final tsadi to appear at an end of a word even though this is not 0047 // the case in the original text. 0048 // The letters Pe and Kaf rarely display a related behavior of not being a 0049 // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for 0050 // example legally end with a Non-Final Pe or Kaf. However, the benefit of 0051 // these letters as Non-Final letters outweighs the damage since these words 0052 // are quite rare. 0053 } 0054 0055 /** HandleData 0056 * Final letter analysis for logical-visual decision. 0057 * Look for evidence that the received buffer is either logical Hebrew or 0058 * visual Hebrew. 0059 * The following cases are checked: 0060 * 1) A word longer than 1 letter, ending with a final letter. This is an 0061 * indication that the text is laid out "naturally" since the final letter 0062 * really appears at the end. +1 for logical score. 0063 * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal 0064 * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with 0065 * the Non-Final form of that letter. Exceptions to this rule are mentioned 0066 * above in isNonFinal(). This is an indication that the text is laid out 0067 * backwards. +1 for visual score 0068 * 3) A word longer than 1 letter, starting with a final letter. Final letters 0069 * should not appear at the beginning of a word. This is an indication that 0070 * the text is laid out backwards. +1 for visual score. 0071 * 0072 * The visual score and logical score are accumulated throughout the text and 0073 * are finally checked against each other in GetCharSetName(). 0074 * No checking for final letters in the middle of words is done since that case 0075 * is not an indication for either Logical or Visual text. 0076 * 0077 * The input buffer should not contain any white spaces that are not (' ') 0078 * or any low-ascii punctuation marks. 0079 */ 0080 nsProbingState nsHebrewProber::HandleData(const char *aBuf, unsigned int aLen) 0081 { 0082 // Both model probers say it's not them. No reason to continue. 0083 if (GetState() == eNotMe) { 0084 return eNotMe; 0085 } 0086 0087 const char *curPtr; 0088 const char *endPtr = aBuf + aLen; 0089 0090 for (curPtr = (char *)aBuf; curPtr < endPtr; ++curPtr) { 0091 char cur = *curPtr; 0092 if (cur == ' ') { // We stand on a space - a word just ended 0093 if (mBeforePrev != ' ') { // *(curPtr-2) was not a space so prev is not a 1 letter word 0094 if (isFinal(mPrev)) { // case (1) [-2:not space][-1:final letter][cur:space] 0095 ++mFinalCharLogicalScore; 0096 } else if (isNonFinal(mPrev)) { // case (2) [-2:not space][-1:Non-Final letter][cur:space] 0097 ++mFinalCharVisualScore; 0098 } 0099 } 0100 } else { // Not standing on a space 0101 if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) { // case (3) [-2:space][-1:final letter][cur:not space] 0102 ++mFinalCharVisualScore; 0103 } 0104 } 0105 mBeforePrev = mPrev; 0106 mPrev = cur; 0107 } 0108 0109 // Forever detecting, till the end or until both model probers return eNotMe (handled above). 0110 return eDetecting; 0111 } 0112 0113 // Make the decision: is it Logical or Visual? 0114 const char *nsHebrewProber::GetCharSetName() 0115 { 0116 // If the final letter score distance is dominant enough, rely on it. 0117 int finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; 0118 if (finalsub >= MIN_FINAL_CHAR_DISTANCE) { 0119 return LOGICAL_HEBREW_NAME; 0120 } 0121 if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) { 0122 return VISUAL_HEBREW_NAME; 0123 } 0124 0125 // It's not dominant enough, try to rely on the model scores instead. 0126 float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); 0127 if (modelsub > MIN_MODEL_DISTANCE) { 0128 return LOGICAL_HEBREW_NAME; 0129 } 0130 if (modelsub < -(MIN_MODEL_DISTANCE)) { 0131 return VISUAL_HEBREW_NAME; 0132 } 0133 0134 // Still no good, back to final letter distance, maybe it'll save the day. 0135 if (finalsub < 0) { 0136 return VISUAL_HEBREW_NAME; 0137 } 0138 0139 // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. 0140 return LOGICAL_HEBREW_NAME; 0141 } 0142 0143 void nsHebrewProber::Reset(void) 0144 { 0145 mFinalCharLogicalScore = 0; 0146 mFinalCharVisualScore = 0; 0147 0148 // mPrev and mBeforePrev are initialized to space in order to simulate a word 0149 // delimiter at the beginning of the data 0150 mPrev = ' '; 0151 mBeforePrev = ' '; 0152 } 0153 0154 nsProbingState nsHebrewProber::GetState(void) 0155 { 0156 // Remain active as long as any of the model probers are active. 0157 if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) { 0158 return eNotMe; 0159 } 0160 return eDetecting; 0161 } 0162 0163 #ifdef DEBUG_PROBE 0164 void nsHebrewProber::DumpStatus() 0165 { 0166 printf(" HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore); 0167 } 0168 #endif 0169 }