File indexing completed on 2024-04-28 03:53:03
0001 /* -*- C++ -*- 0002 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> 0003 0004 SPDX-License-Identifier: MIT 0005 */ 0006 0007 #include "nsCharSetProber.h" 0008 0009 #include <stdlib.h> 0010 0011 namespace kencodingprober 0012 { 0013 // This filter applies to all scripts which do not use English characters 0014 bool nsCharSetProber::FilterWithoutEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen) 0015 { 0016 char *newptr; 0017 char *prevPtr; 0018 char *curPtr; 0019 0020 bool meetMSB = false; 0021 newptr = *newBuf = (char *)malloc(aLen); 0022 if (!newptr) { 0023 return false; 0024 } 0025 0026 for (curPtr = prevPtr = (char *)aBuf; curPtr < aBuf + aLen; ++curPtr) { 0027 if (*curPtr & 0x80) { 0028 meetMSB = true; 0029 } else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') { 0030 // current char is a symbol, most likely a punctuation. we treat it as segment delimiter 0031 if (meetMSB && curPtr > prevPtr) 0032 // this segment contains more than single symbol, and it has upper ASCII, we need to keep it 0033 { 0034 while (prevPtr < curPtr) { 0035 *newptr++ = *prevPtr++; 0036 } 0037 prevPtr++; 0038 *newptr++ = ' '; 0039 meetMSB = false; 0040 } else { // ignore current segment. (either because it is just a symbol or just an English word) 0041 prevPtr = curPtr + 1; 0042 } 0043 } 0044 } 0045 if (meetMSB && curPtr > prevPtr) { 0046 while (prevPtr < curPtr) { 0047 *newptr++ = *prevPtr++; 0048 } 0049 } 0050 0051 newLen = newptr - *newBuf; 0052 0053 return true; 0054 } 0055 0056 // This filter applies to all scripts which contain both English characters and upper ASCII characters. 0057 bool nsCharSetProber::FilterWithEnglishLetters(const char *aBuf, unsigned int aLen, char **newBuf, unsigned int &newLen) 0058 { 0059 // do filtering to reduce load to probers 0060 char *newptr; 0061 char *prevPtr; 0062 char *curPtr; 0063 bool isInTag = false; 0064 0065 newptr = *newBuf = (char *)malloc(aLen); 0066 if (!newptr) { 0067 return false; 0068 } 0069 0070 for (curPtr = prevPtr = (char *)aBuf; curPtr < aBuf + aLen; ++curPtr) { 0071 if (*curPtr == '>') { 0072 isInTag = false; 0073 } else if (*curPtr == '<') { 0074 isInTag = true; 0075 } 0076 0077 if (!(*curPtr & 0x80) // 0078 && (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z')) { 0079 if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol 0080 // and it is not inside a tag, keep it. 0081 { 0082 while (prevPtr < curPtr) { 0083 *newptr++ = *prevPtr++; 0084 } 0085 prevPtr++; 0086 *newptr++ = ' '; 0087 } else { 0088 prevPtr = curPtr + 1; 0089 } 0090 } 0091 } 0092 0093 // If the current segment contains more than just a symbol 0094 // and it is not inside a tag then keep it. 0095 if (!isInTag) { 0096 while (prevPtr < curPtr) { 0097 *newptr++ = *prevPtr++; 0098 } 0099 } 0100 0101 newLen = newptr - *newBuf; 0102 0103 return true; 0104 } 0105 }