src/probers/nsHebrewProber.cpp

0001 /*  -*- C++ -*-
0002     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
0003
0004     SPDX-License-Identifier: MIT
0005 */
0006
0007 #include "nsHebrewProber.h"
0008 #include <stdio.h>
0009
// File-local constants. They are typed constexpr values rather than
// preprocessor macros so that they obey scoping rules and carry a real type;
// the names are unchanged, so every use site below reads the same.
namespace
{
// windows-1255 / ISO-8859-8 code points of interest: the five Hebrew letters
// that have both a final form and a normal (non-final) form.
constexpr char FINAL_KAF = '\xea';
constexpr char NORMAL_KAF = '\xeb';
constexpr char FINAL_MEM = '\xed';
constexpr char NORMAL_MEM = '\xee';
constexpr char FINAL_NUN = '\xef';
constexpr char NORMAL_NUN = '\xf0';
constexpr char FINAL_PE = '\xf3';
constexpr char NORMAL_PE = '\xf4';
constexpr char FINAL_TSADI = '\xf5';
constexpr char NORMAL_TSADI = '\xf6';

// Minimum Visual vs Logical final letter score difference.
// If the difference is below this, don't rely solely on the final letter score distance.
constexpr int MIN_FINAL_CHAR_DISTANCE = 5;

// Minimum Visual vs Logical model score difference.
// If the difference is below this, don't rely at all on the model score distance.
// Kept as a double so comparisons against it are evaluated exactly as they
// were with the former 0.01 literal.
constexpr double MIN_MODEL_DISTANCE = 0.01;

// Charset names reported for the two possible layouts of Hebrew text.
constexpr const char *VISUAL_HEBREW_NAME = "ISO-8859-8";
constexpr const char *LOGICAL_HEBREW_NAME = "windows-1255";
} // namespace
0032
0033 namespace kencodingprober
0034 {
0035 bool nsHebrewProber::isFinal(char c)
0036 {
0037     return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI));
0038 }
0039
0040 bool nsHebrewProber::isNonFinal(char c)
0041 {
0042     return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE));
0043     // The normal Tsadi is not a good Non-Final letter due to words like
0044     // 'lechotet' (to chat) containing an apostrophe after the tsadi. This
0045     // apostrophe is converted to a space in FilterWithoutEnglishLetters causing
0046     // the Non-Final tsadi to appear at an end of a word even though this is not
0047     // the case in the original text.
0048     // The letters Pe and Kaf rarely display a related behavior of not being a
0049     // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
0050     // example legally end with a Non-Final Pe or Kaf. However, the benefit of
0051     // these letters as Non-Final letters outweighs the damage since these words
0052     // are quite rare.
0053 }
0054
0055 /** HandleData
0056  * Final letter analysis for logical-visual decision.
0057  * Look for evidence that the received buffer is either logical Hebrew or
0058  * visual Hebrew.
0059  * The following cases are checked:
0060  * 1) A word longer than 1 letter, ending with a final letter. This is an
0061  *    indication that the text is laid out "naturally" since the final letter
0062  *    really appears at the end. +1 for logical score.
0063  * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
0064  *    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
0065  *    the Non-Final form of that letter. Exceptions to this rule are mentioned
0066  *    above in isNonFinal(). This is an indication that the text is laid out
0067  *    backwards. +1 for visual score
0068  * 3) A word longer than 1 letter, starting with a final letter. Final letters
0069  *    should not appear at the beginning of a word. This is an indication that
0070  *    the text is laid out backwards. +1 for visual score.
0071  *
0072  * The visual score and logical score are accumulated throughout the text and
0073  * are finally checked against each other in GetCharSetName().
0074  * No checking for final letters in the middle of words is done since that case
0075  * is not an indication for either Logical or Visual text.
0076  *
0077  * The input buffer should not contain any white spaces that are not (' ')
0078  * or any low-ascii punctuation marks.
0079  */
0080 nsProbingState nsHebrewProber::HandleData(const char *aBuf, unsigned int aLen)
0081 {
0082     // Both model probers say it's not them. No reason to continue.
0083     if (GetState() == eNotMe) {
0084         return eNotMe;
0085     }
0086
0087     const char *curPtr;
0088     const char *endPtr = aBuf + aLen;
0089
0090     for (curPtr = (char *)aBuf; curPtr < endPtr; ++curPtr) {
0091         char cur = *curPtr;
0092         if (cur == ' ') { // We stand on a space - a word just ended
0093             if (mBeforePrev != ' ') { // *(curPtr-2) was not a space so prev is not a 1 letter word
0094                 if (isFinal(mPrev)) { // case (1) [-2:not space][-1:final letter][cur:space]
0095                     ++mFinalCharLogicalScore;
0096                 } else if (isNonFinal(mPrev)) { // case (2) [-2:not space][-1:Non-Final letter][cur:space]
0097                     ++mFinalCharVisualScore;
0098                 }
0099             }
0100         } else { // Not standing on a space
0101             if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) { // case (3) [-2:space][-1:final letter][cur:not space]
0102                 ++mFinalCharVisualScore;
0103             }
0104         }
0105         mBeforePrev = mPrev;
0106         mPrev = cur;
0107     }
0108
0109     // Forever detecting, till the end or until both model probers return eNotMe (handled above).
0110     return eDetecting;
0111 }
0112
0113 // Make the decision: is it Logical or Visual?
0114 const char *nsHebrewProber::GetCharSetName()
0115 {
0116     // If the final letter score distance is dominant enough, rely on it.
0117     int finalsub = mFinalCharLogicalScore - mFinalCharVisualScore;
0118     if (finalsub >= MIN_FINAL_CHAR_DISTANCE) {
0119         return LOGICAL_HEBREW_NAME;
0120     }
0121     if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) {
0122         return VISUAL_HEBREW_NAME;
0123     }
0124
0125     // It's not dominant enough, try to rely on the model scores instead.
0126     float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence();
0127     if (modelsub > MIN_MODEL_DISTANCE) {
0128         return LOGICAL_HEBREW_NAME;
0129     }
0130     if (modelsub < -(MIN_MODEL_DISTANCE)) {
0131         return VISUAL_HEBREW_NAME;
0132     }
0133
0134     // Still no good, back to final letter distance, maybe it'll save the day.
0135     if (finalsub < 0) {
0136         return VISUAL_HEBREW_NAME;
0137     }
0138
0139     // (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
0140     return LOGICAL_HEBREW_NAME;
0141 }
0142
0143 void nsHebrewProber::Reset(void)
0144 {
0145     mFinalCharLogicalScore = 0;
0146     mFinalCharVisualScore = 0;
0147
0148     // mPrev and mBeforePrev are initialized to space in order to simulate a word
0149     // delimiter at the beginning of the data
0150     mPrev = ' ';
0151     mBeforePrev = ' ';
0152 }
0153
0154 nsProbingState nsHebrewProber::GetState(void)
0155 {
0156     // Remain active as long as any of the model probers are active.
0157     if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) {
0158         return eNotMe;
0159     }
0160     return eDetecting;
0161 }
0162
#ifdef DEBUG_PROBE
/**
 * Debug helper, compiled only when DEBUG_PROBE is defined: prints the
 * accumulated logical and visual final letter scores to standard output.
 */
void nsHebrewProber::DumpStatus()
{
    fprintf(stdout,
            "  HEB: %d - %d [Logical-Visual score]\r\n",
            mFinalCharLogicalScore,
            mFinalCharVisualScore);
}
#endif
0169 }