src/probers/nsHebrewProber.cpp

0001 /*  -*- C++ -*-
0002     SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
0003
0004     SPDX-License-Identifier: MIT
0005 */
0006
0007 #include "nsHebrewProber.h"
0008 #include <stdio.h>
0009
// File-local constants. They are typed constants rather than preprocessor
// macros so that they obey scoping rules and carry a real type; names, values
// and types are the same as the macros they replace.
namespace
{
// windows-1255 / ISO-8859-8 code points of interest: the five Hebrew letters
// that have a dedicated word-final form, each paired with its regular form.
constexpr char FINAL_KAF = '\xea';
constexpr char NORMAL_KAF = '\xeb';
constexpr char FINAL_MEM = '\xed';
constexpr char NORMAL_MEM = '\xee';
constexpr char FINAL_NUN = '\xef';
constexpr char NORMAL_NUN = '\xf0';
constexpr char FINAL_PE = '\xf3';
constexpr char NORMAL_PE = '\xf4';
constexpr char FINAL_TSADI = '\xf5';
// Deliberately not tested by isNonFinal() (the rationale is documented there);
// kept so the table of letter pairs stays complete.
[[maybe_unused]] constexpr char NORMAL_TSADI = '\xf6';

// Minimum Visual vs Logical final letter score difference.
// If the difference is below this, don't rely solely on the final letter score distance.
constexpr int MIN_FINAL_CHAR_DISTANCE = 5;

// Minimum Visual vs Logical model score difference.
// If the difference is below this, don't rely at all on the model score distance.
constexpr double MIN_MODEL_DISTANCE = 0.01;

// Charset names reported for the two possible verdicts.
constexpr const char *VISUAL_HEBREW_NAME = "ISO-8859-8";
constexpr const char *LOGICAL_HEBREW_NAME = "windows-1255";
} // namespace
0032
0033 namespace kencodingprober
0034 {
0035 bool nsHebrewProber::isFinal(char c)
0036 {
0037     return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI));
0038 }
0039
0040 bool nsHebrewProber::isNonFinal(char c)
0041 {
0042     return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE));
0043     // The normal Tsadi is not a good Non-Final letter due to words like
0044     // 'lechotet' (to chat) containing an apostrophe after the tsadi. This
0045     // apostrophe is converted to a space in FilterWithoutEnglishLetters causing
0046     // the Non-Final tsadi to appear at an end of a word even though this is not
0047     // the case in the original text.
0048     // The letters Pe and Kaf rarely display a related behavior of not being a
0049     // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
0050     // example legally end with a Non-Final Pe or Kaf. However, the benefit of
0051     // these letters as Non-Final letters outweighs the damage since these words
0052     // are quite rare.
0053 }
0054
0055 /** HandleData
0056  * Final letter analysis for logical-visual decision.
0057  * Look for evidence that the received buffer is either logical Hebrew or
0058  * visual Hebrew.
0059  * The following cases are checked:
0060  * 1) A word longer than 1 letter, ending with a final letter. This is an
0061  *    indication that the text is laid out "naturally" since the final letter
0062  *    really appears at the end. +1 for logical score.
0063  * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
0064  *    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
0065  *    the Non-Final form of that letter. Exceptions to this rule are mentioned
0066  *    above in isNonFinal(). This is an indication that the text is laid out
0067  *    backwards. +1 for visual score
0068  * 3) A word longer than 1 letter, starting with a final letter. Final letters
0069  *    should not appear at the beginning of a word. This is an indication that
0070  *    the text is laid out backwards. +1 for visual score.
0071  *
0072  * The visual score and logical score are accumulated throughout the text and
0073  * are finally checked against each other in GetCharSetName().
0074  * No checking for final letters in the middle of words is done since that case
0075  * is not an indication for either Logical or Visual text.
0076  *
0077  * The input buffer should not contain any white spaces that are not (' ')
0078  * or any low-ascii punctuation marks.
0079  */
0080 nsProbingState nsHebrewProber::HandleData(const char *aBuf, unsigned int aLen)
0081 {
0082     // Both model probers say it's not them. No reason to continue.
0083     if (GetState() == eNotMe) {
0084         return eNotMe;
0085     }
0086
0087     const char *curPtr;
0088     const char *endPtr = aBuf + aLen;
0089     char cur;
0090
0091     for (curPtr = (char *)aBuf; curPtr < endPtr; ++curPtr) {
0092         cur = *curPtr;
0093         if (cur == ' ') { // We stand on a space - a word just ended
0094             if (mBeforePrev != ' ') { // *(curPtr-2) was not a space so prev is not a 1 letter word
0095                 if (isFinal(mPrev)) { // case (1) [-2:not space][-1:final letter][cur:space]
0096                     ++mFinalCharLogicalScore;
0097                 } else if (isNonFinal(mPrev)) { // case (2) [-2:not space][-1:Non-Final letter][cur:space]
0098                     ++mFinalCharVisualScore;
0099                 }
0100             }
0101         } else { // Not standing on a space
0102             if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) { // case (3) [-2:space][-1:final letter][cur:not space]
0103                 ++mFinalCharVisualScore;
0104             }
0105         }
0106         mBeforePrev = mPrev;
0107         mPrev = cur;
0108     }
0109
0110     // Forever detecting, till the end or until both model probers return eNotMe (handled above).
0111     return eDetecting;
0112 }
0113
0114 // Make the decision: is it Logical or Visual?
0115 const char *nsHebrewProber::GetCharSetName()
0116 {
0117     // If the final letter score distance is dominant enough, rely on it.
0118     int finalsub = mFinalCharLogicalScore - mFinalCharVisualScore;
0119     if (finalsub >= MIN_FINAL_CHAR_DISTANCE) {
0120         return LOGICAL_HEBREW_NAME;
0121     }
0122     if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) {
0123         return VISUAL_HEBREW_NAME;
0124     }
0125
0126     // It's not dominant enough, try to rely on the model scores instead.
0127     float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence();
0128     if (modelsub > MIN_MODEL_DISTANCE) {
0129         return LOGICAL_HEBREW_NAME;
0130     }
0131     if (modelsub < -(MIN_MODEL_DISTANCE)) {
0132         return VISUAL_HEBREW_NAME;
0133     }
0134
0135     // Still no good, back to final letter distance, maybe it'll save the day.
0136     if (finalsub < 0) {
0137         return VISUAL_HEBREW_NAME;
0138     }
0139
0140     // (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
0141     return LOGICAL_HEBREW_NAME;
0142 }
0143
0144 void nsHebrewProber::Reset(void)
0145 {
0146     mFinalCharLogicalScore = 0;
0147     mFinalCharVisualScore = 0;
0148
0149     // mPrev and mBeforePrev are initialized to space in order to simulate a word
0150     // delimiter at the beginning of the data
0151     mPrev = ' ';
0152     mBeforePrev = ' ';
0153 }
0154
0155 nsProbingState nsHebrewProber::GetState(void)
0156 {
0157     // Remain active as long as any of the model probers are active.
0158     if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) {
0159         return eNotMe;
0160     }
0161     return eDetecting;
0162 }
0163
#ifdef DEBUG_PROBE
// Debug helper (only built when DEBUG_PROBE is defined): prints the logical
// and the visual final letter scores, in that order, to standard output.
void nsHebrewProber::DumpStatus()
{
    const auto logicalVotes = mFinalCharLogicalScore;
    const auto visualVotes = mFinalCharVisualScore;
    printf("  HEB: %d - %d [Logical-Visual score]\r\n", logicalVotes, visualVotes);
}
#endif
0170 }