File indexing completed on 2024-05-05 16:05:51
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> 0003 0004 SPDX-License-Identifier: MIT 0005 */ 0006 0007 #include "nsHebrewProber.h" 0008 #include <stdio.h> 0009 0010 // windows-1255 / ISO-8859-8 code points of interest 0011 #define FINAL_KAF ('\xea') 0012 #define NORMAL_KAF ('\xeb') 0013 #define FINAL_MEM ('\xed') 0014 #define NORMAL_MEM ('\xee') 0015 #define FINAL_NUN ('\xef') 0016 #define NORMAL_NUN ('\xf0') 0017 #define FINAL_PE ('\xf3') 0018 #define NORMAL_PE ('\xf4') 0019 #define FINAL_TSADI ('\xf5') 0020 #define NORMAL_TSADI ('\xf6') 0021 0022 // Minimum Visual vs Logical final letter score difference. 0023 // If the difference is below this, don't rely solely on the final letter score distance. 0024 #define MIN_FINAL_CHAR_DISTANCE (5) 0025 0026 // Minimum Visual vs Logical model score difference. 0027 // If the difference is below this, don't rely at all on the model score distance. 0028 #define MIN_MODEL_DISTANCE (0.01) 0029 0030 #define VISUAL_HEBREW_NAME ("ISO-8859-8") 0031 #define LOGICAL_HEBREW_NAME ("windows-1255") 0032 0033 namespace kencodingprober 0034 { 0035 bool nsHebrewProber::isFinal(char c) 0036 { 0037 return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); 0038 } 0039 0040 bool nsHebrewProber::isNonFinal(char c) 0041 { 0042 return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); 0043 // The normal Tsadi is not a good Non-Final letter due to words like 0044 // 'lechotet' (to chat) containing an apostrophe after the tsadi. This 0045 // apostrophe is converted to a space in FilterWithoutEnglishLetters causing 0046 // the Non-Final tsadi to appear at an end of a word even though this is not 0047 // the case in the original text. 0048 // The letters Pe and Kaf rarely display a related behavior of not being a 0049 // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for 0050 // example legally end with a Non-Final Pe or Kaf. However, the benefit of 0051 // these letters as Non-Final letters outweighs the damage since these words 0052 // are quite rare. 0053 } 0054 0055 /** HandleData 0056 * Final letter analysis for logical-visual decision. 0057 * Look for evidence that the received buffer is either logical Hebrew or 0058 * visual Hebrew. 0059 * The following cases are checked: 0060 * 1) A word longer than 1 letter, ending with a final letter. This is an 0061 * indication that the text is laid out "naturally" since the final letter 0062 * really appears at the end. +1 for logical score. 0063 * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal 0064 * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with 0065 * the Non-Final form of that letter. Exceptions to this rule are mentioned 0066 * above in isNonFinal(). This is an indication that the text is laid out 0067 * backwards. +1 for visual score 0068 * 3) A word longer than 1 letter, starting with a final letter. Final letters 0069 * should not appear at the beginning of a word. This is an indication that 0070 * the text is laid out backwards. +1 for visual score. 0071 * 0072 * The visual score and logical score are accumulated throughout the text and 0073 * are finally checked against each other in GetCharSetName(). 0074 * No checking for final letters in the middle of words is done since that case 0075 * is not an indication for either Logical or Visual text. 0076 * 0077 * The input buffer should not contain any white spaces that are not (' ') 0078 * or any low-ascii punctuation marks. 0079 */ 0080 nsProbingState nsHebrewProber::HandleData(const char *aBuf, unsigned int aLen) 0081 { 0082 // Both model probers say it's not them. No reason to continue. 0083 if (GetState() == eNotMe) { 0084 return eNotMe; 0085 } 0086 0087 const char *curPtr; 0088 const char *endPtr = aBuf + aLen; 0089 char cur; 0090 0091 for (curPtr = (char *)aBuf; curPtr < endPtr; ++curPtr) { 0092 cur = *curPtr; 0093 if (cur == ' ') { // We stand on a space - a word just ended 0094 if (mBeforePrev != ' ') { // *(curPtr-2) was not a space so prev is not a 1 letter word 0095 if (isFinal(mPrev)) { // case (1) [-2:not space][-1:final letter][cur:space] 0096 ++mFinalCharLogicalScore; 0097 } else if (isNonFinal(mPrev)) { // case (2) [-2:not space][-1:Non-Final letter][cur:space] 0098 ++mFinalCharVisualScore; 0099 } 0100 } 0101 } else { // Not standing on a space 0102 if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) { // case (3) [-2:space][-1:final letter][cur:not space] 0103 ++mFinalCharVisualScore; 0104 } 0105 } 0106 mBeforePrev = mPrev; 0107 mPrev = cur; 0108 } 0109 0110 // Forever detecting, till the end or until both model probers return eNotMe (handled above). 0111 return eDetecting; 0112 } 0113 0114 // Make the decision: is it Logical or Visual? 0115 const char *nsHebrewProber::GetCharSetName() 0116 { 0117 // If the final letter score distance is dominant enough, rely on it. 0118 int finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; 0119 if (finalsub >= MIN_FINAL_CHAR_DISTANCE) { 0120 return LOGICAL_HEBREW_NAME; 0121 } 0122 if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) { 0123 return VISUAL_HEBREW_NAME; 0124 } 0125 0126 // It's not dominant enough, try to rely on the model scores instead. 0127 float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); 0128 if (modelsub > MIN_MODEL_DISTANCE) { 0129 return LOGICAL_HEBREW_NAME; 0130 } 0131 if (modelsub < -(MIN_MODEL_DISTANCE)) { 0132 return VISUAL_HEBREW_NAME; 0133 } 0134 0135 // Still no good, back to final letter distance, maybe it'll save the day. 0136 if (finalsub < 0) { 0137 return VISUAL_HEBREW_NAME; 0138 } 0139 0140 // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. 0141 return LOGICAL_HEBREW_NAME; 0142 } 0143 0144 void nsHebrewProber::Reset(void) 0145 { 0146 mFinalCharLogicalScore = 0; 0147 mFinalCharVisualScore = 0; 0148 0149 // mPrev and mBeforePrev are initialized to space in order to simulate a word 0150 // delimiter at the beginning of the data 0151 mPrev = ' '; 0152 mBeforePrev = ' '; 0153 } 0154 0155 nsProbingState nsHebrewProber::GetState(void) 0156 { 0157 // Remain active as long as any of the model probers are active. 0158 if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) { 0159 return eNotMe; 0160 } 0161 return eDetecting; 0162 } 0163 0164 #ifdef DEBUG_PROBE 0165 void nsHebrewProber::DumpStatus() 0166 { 0167 printf(" HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore); 0168 } 0169 #endif 0170 }