File indexing completed on 2024-06-02 04:53:37
0001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 0002 /* ***** BEGIN LICENSE BLOCK ***** 0003 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 0004 * 0005 * The contents of this file are subject to the Mozilla Public License Version 0006 * 1.1 (the "License"); you may not use this file except in compliance with 0007 * the License. You may obtain a copy of the License at 0008 * http://www.mozilla.org/MPL/ 0009 * 0010 * Software distributed under the License is distributed on an "AS IS" basis, 0011 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 0012 * for the specific language governing rights and limitations under the 0013 * License. 0014 * 0015 * The Original Code is Mozilla Universal charset detector code. 0016 * 0017 * The Initial Developer of the Original Code is 0018 * Netscape Communications Corporation. 0019 * Portions created by the Initial Developer are Copyright (C) 2001 0020 * the Initial Developer. All Rights Reserved. 0021 * 0022 * Contributor(s): 0023 * Shy Shalom <shooshX@gmail.com> 0024 * 0025 * Alternatively, the contents of this file may be used under the terms of 0026 * either the GNU General Public License Version 2 or later (the "GPL"), or 0027 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 0028 * in which case the provisions of the GPL or the LGPL are applicable instead 0029 * of those above. If you wish to allow use of your version of this file only 0030 * under the terms of either the GPL or the LGPL, and not to allow others to 0031 * use your version of this file under the terms of the MPL, indicate your 0032 * decision by deleting the provisions above and replace them with the notice 0033 * and other provisions required by the GPL or the LGPL. If you do not delete 0034 * the provisions above, a recipient may use your version of this file under 0035 * the terms of any one of the MPL, the GPL or the LGPL. 0036 * 0037 * ***** END LICENSE BLOCK ***** */ 0038 0039 #pragma GCC visibility push(hidden) 0040 0041 #include "nsCharSetProber.h" 0042 #include "prmem.h" 0043 0044 //This filter applies to all scripts which do not use English characters 0045 PRBool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen) 0046 { 0047 char *newptr; 0048 char *prevPtr, *curPtr; 0049 0050 PRBool meetMSB = PR_FALSE; 0051 newptr = *newBuf = (char*)PR_Malloc(aLen); 0052 if (!newptr) 0053 return PR_FALSE; 0054 0055 for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) 0056 { 0057 if (*curPtr & 0x80) 0058 { 0059 meetMSB = PR_TRUE; 0060 } 0061 else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') 0062 { 0063 //current char is a symbol, most likely a punctuation. we treat it as segment delimiter 0064 if (meetMSB && curPtr > prevPtr) 0065 //this segment contains more than single symbol, and it has upper ASCII, we need to keep it 0066 { 0067 while (prevPtr < curPtr) *newptr++ = *prevPtr++; 0068 prevPtr++; 0069 *newptr++ = ' '; 0070 meetMSB = PR_FALSE; 0071 } 0072 else //ignore current segment. (either because it is just a symbol or just an English word) 0073 prevPtr = curPtr+1; 0074 } 0075 } 0076 if (meetMSB && curPtr > prevPtr) 0077 while (prevPtr < curPtr) *newptr++ = *prevPtr++; 0078 0079 newLen = newptr - *newBuf; 0080 0081 return PR_TRUE; 0082 } 0083 0084 //This filter applies to all scripts which contain both English characters and upper ASCII characters. 0085 PRBool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen) 0086 { 0087 //do filtering to reduce load to probers 0088 char *newptr; 0089 char *prevPtr, *curPtr; 0090 PRBool isInTag = PR_FALSE; 0091 0092 newptr = *newBuf = (char*)PR_Malloc(aLen); 0093 if (!newptr) 0094 return PR_FALSE; 0095 0096 for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) 0097 { 0098 if (*curPtr == '>') 0099 isInTag = PR_FALSE; 0100 else if (*curPtr == '<') 0101 isInTag = PR_TRUE; 0102 0103 if (!(*curPtr & 0x80) && 0104 (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') ) 0105 { 0106 if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol 0107 // and it is not inside a tag, keep it. 0108 { 0109 while (prevPtr < curPtr) *newptr++ = *prevPtr++; 0110 prevPtr++; 0111 *newptr++ = ' '; 0112 } 0113 else 0114 prevPtr = curPtr+1; 0115 } 0116 } 0117 0118 // If the current segment contains more than just a symbol 0119 // and it is not inside a tag then keep it. 0120 if (!isInTag) 0121 while (prevPtr < curPtr) 0122 *newptr++ = *prevPtr++; 0123 0124 newLen = newptr - *newBuf; 0125 0126 return PR_TRUE; 0127 } 0128 0129 #pragma GCC visibility pop 0130