File indexing completed on 2023-10-03 06:53:13
0001 /* 0002 X J D X G E N 0003 Index (.xjdx) generator program fron XJDIC 0004 0005 v2.3 - indexes JIS X 0212 (3-byte EUC) kanji 0006 SPDX-FileCopyrightText: 1998 Jim Breen <jwb@csse.monash.edu.au> 0007 SPDX-License-Identifier: GPL-1.0-or-later 0008 */ 0009 0010 /* Changed: ignore all rc stuff. use args 1 and 2 for input/output file. 0011 -- jason */ 0012 0013 /* Heavily commented, removed the unused header file, split off the 0014 readDictionary function, removed unused functions and variables... cleaned 0015 up the code in general. Preparing for integration to the rest of the program 0016 0017 Note that this indexer has been hacked off of Jim Breen's xjdic program, 0018 and a lot of the things which have been removed were relevant to that 0019 program, but not to this one. 0020 --Joe 0021 */ 0022 0023 #include <config-kiten.h> 0024 #include <sys/stat.h> 0025 #include <stdio.h> 0026 #include <stdlib.h> 0027 #include <ctype.h> 0028 #include <string.h> 0029 0030 #ifdef HAVE_STDINT_H 0031 #include <stdint.h> 0032 #endif 0033 #ifdef HAVE_INTTYPES_H 0034 #include <inttypes.h> 0035 #endif 0036 0037 #define TRUE 1 0038 #define FALSE 0 0039 #define SPTAG '@' 0040 #define TOKENLIM 40 0041 #define INDEX_VERSION 14; /*The last time the index structure changed was Version1.4*/ 0042 0043 unsigned char *db; 0044 uint32_t *jindex; 0045 uint32_t indlen; 0046 0047 /*====== prototypes=================================================*/ 0048 void jqsort(int32_t i, int32_t j); 0049 int Kstrcmp(uint32_t lhs, uint32_t rhs); 0050 int alphaoreuc(unsigned char x); 0051 unsigned char* readDictionary(const char* dictName,uint32_t *filesize); 0052 uint32_t buildIndex(unsigned char* dict, uint32_t dictLength); 0053 0054 /*====function to Load Dictionary and load/create index table=======*/ 0055 int main(int argc, char **argv) 0056 { 0057 const char *Dname; 0058 const char *JDXname; 0059 FILE *fp; 0060 uint32_t diclen; 0061 uint32_t indptr; 0062 0063 printf("\nNOTE: running this program by itself is never necessary. Kiten will run it automatically.\n"); 0064 printf("\nXJDXGEN V2.3 Index Table Generator for XJDIC. \n Copyright J.W. Breen, 1998\n"); 0065 0066 if (argc < 3) 0067 { 0068 printf("\nUSAGE: kitengen input output.xjdx\n"); 0069 exit(2); 0070 } 0071 0072 Dname = argv[1]; /*Name of the dictionary being scanned */ 0073 JDXname = argv[2]; /*Name of the output file */ 0074 printf("Commandline request to use files %s and %s \n", Dname, JDXname); 0075 printf("\nWARNING!! This program may take a long time to run .....\n"); 0076 0077 db = readDictionary(Dname,&diclen); /*Reads the dict, but leaves a space at the beginning*/ 0078 diclen++; /*add one to the number of bytes considered in the file */ 0079 db[diclen] = 10; /*set the first and final entry in the database to 10 */ 0080 db[0] = 10; 0081 printf("Dictionary size: %d bytes.\n",diclen); 0082 0083 0084 indlen = (diclen * 3)/4; /*Make a wild guess at the index file length */ 0085 jindex = (uint32_t *)malloc(indlen); /* and allocate it */ 0086 if(jindex == NULL) 0087 { 0088 fprintf(stderr,"malloc() for index table failed.\n"); 0089 exit(1); 0090 } 0091 0092 printf("Parsing.... \n"); 0093 /*this is the dictionary parser. It places an entry in jindex for every 0094 kana/kanji string and every alphabetic string it finds which is >=3 0095 characters */ 0096 indptr = buildIndex(db,diclen); 0097 0098 printf("Index entries: %d \nSorting (this is slow)......\n",indptr); 0099 jqsort((int32_t)1,indptr); 0100 0101 printf("Sorted\nWriting index file ....\n"); 0102 fp = fopen(JDXname,"wb"); 0103 if (fp==NULL ) 0104 { 0105 printf("\nCannot open %s output file\n",JDXname); 0106 exit(1); 0107 } 0108 jindex[0] = diclen+INDEX_VERSION; /* prepend the index file size + version # */ 0109 fwrite(jindex,sizeof(int32_t),indptr+1,fp); 0110 fclose(fp); 0111 0112 return 0; 0113 } 0114 0115 /*=========function to parse the dict file and fill the jindex global with the index====*/ 0116 /*=========returns the size of the index file ====*/ 0117 /* 0118 A bit of explanation on what this thing generates is probably in order. 0119 Essentially, it fills jindex with a large number of numbers... each number 0120 being an offset to a byte location inside of the dictionary file. Starting 0121 at position index 1 (second pos) 0122 In other words... feeding this thing the dict file 0123 "Llama X1\nJT Fred Flintstone X" 0124 would generate: {<unmodified>,0,6,12,17}. 0125 "X" is skipped because it is only 1 byte long. 0126 "JT" is skipped because it is only two bytes long, the J is regular ascii 0127 (<127), and the T is not a digit. If any of those were different, (it 0128 was longer than 2 bytes, was an euc (kana or kanji) character, or T was 0129 a digit) it would be included in the index. 0130 */ 0131 0132 /*First... an ugly #define to make our code a bit more readable*/ 0133 #define INDEX_OVERFLOW_CHECK(x) {if(x > indlen/sizeof(int32_t)) { \ 0134 printf("Index table overflow. Dictionary too large?\n"); exit(1); } } 0135 0136 uint32_t buildIndex(unsigned char *dict, uint32_t dictLength) { 0137 int nowReadingWord = FALSE; /*Boolean to track if we're mid-word in the dict */ 0138 uint32_t currentDictCharacter; /*Current character index in the dict */ 0139 unsigned char c; /*the current reading character*/ 0140 unsigned char currstr[TOKENLIM]; /* String that we're currently getting */ 0141 int currstrIndex = 0; 0142 uint32_t indptr = 1; /* next 'slot' in the index to fill */ 0143 int saving = FALSE; /*is what we are doing right now slated for salvation?*/ 0144 0145 for (currentDictCharacter =0; currentDictCharacter < dictLength; 0146 currentDictCharacter++) 0147 { 0148 c = dict[currentDictCharacter]; /* Fetch the next character */ 0149 0150 if(!nowReadingWord) /*if we are NOT in the middle of reading a word */ 0151 { 0152 if (alphaoreuc(c) || c == SPTAG) /* if character or priority entry */ 0153 { 0154 nowReadingWord = TRUE; /* Mark that we're mid word */ 0155 jindex[indptr] = currentDictCharacter; 0156 /* copy the location of this character to our index structure */ 0157 currstrIndex = 1; 0158 /*mark the next position in the string to copy a char into */ 0159 currstr[0] = c; 0160 /*set the current string to be equal to this character so far */ 0161 currstr[1] = '\0'; 0162 saving = TRUE; 0163 } 0164 } else { /*If we're in the middle of parsing a word atm */ 0165 0166 /*if it's alphanumeric or - or . copy it and increment where the 0167 next one goes */ 0168 if ((alphaoreuc(c))||(c == '-')||(c == '.')||((c >= '0') && (c<='9'))) 0169 { 0170 currstr[currstrIndex] = c; 0171 if(currstrIndex < TOKENLIM-1) 0172 currstrIndex++; 0173 } 0174 else /* We were reading a word... and we just encountered the 0175 end of the word */ 0176 { 0177 currstr[currstrIndex] = '\0'; /*null terminate the string */ 0178 nowReadingWord = FALSE; 0179 0180 /*Don't save single or dual character items where the 0181 first item is ascii */ 0182 if ((strlen((const char*)currstr) <= 2) && (currstr[0] < 127)) 0183 saving = FALSE; 0184 /*EXCEPT: Save anything that's two character where the second 0185 is a number 0186 Note that this might catch single 2-byte kanji as well... 0187 but it might not*/ 0188 if ((strlen((const char*)currstr) == 2) && (currstr[1] <= '9')) 0189 saving = TRUE; 0190 0191 /* This is a latin-character string, either longer than 2 bytes 0192 or having an ascii digit for a second byte */ 0193 if (saving && (currstr[0] < 127)) 0194 { 0195 indptr++; 0196 INDEX_OVERFLOW_CHECK(indptr); 0197 0198 /* If this is non-Japanese, and has a 'SPTAGn' tag, generate 0199 two indices */ 0200 if ( currstr[0] == SPTAG) 0201 { 0202 /*make a separate entry pointing to 0203 the non-SPTAG'd entry (the next byte)*/ 0204 jindex[indptr] = jindex[indptr-1]+1; 0205 /*overwrite the SPTAG marker*/ 0206 strcpy((char*)currstr,(char*)(currstr+1)); 0207 indptr++; 0208 INDEX_OVERFLOW_CHECK(indptr); 0209 } 0210 } 0211 0212 /*For strings that start with non latin characters*/ 0213 if (saving && (currstr[0] > 127)) 0214 { 0215 uint32_t i; 0216 uint32_t possav = jindex[indptr]; /*Save the current marker*/ 0217 indptr++; 0218 INDEX_OVERFLOW_CHECK(indptr); 0219 0220 /* generate index for *every* kanji in key */ 0221 i = 2; 0222 /*if this is a three byte kanji, ignore the 0x8f marker */ 0223 if (currstr[0] == 0x8f) 0224 i++; 0225 /*step through... two by two*/ 0226 for ( ; i < strlen((const char*)currstr); i+=2) 0227 { 0228 if((currstr[i] >= 0xb0) || (currstr[i] == 0x8f)) 0229 { 0230 /*Add in a specific reference to the kanji*/ 0231 jindex[indptr] = possav+i; 0232 indptr++; 0233 INDEX_OVERFLOW_CHECK(indptr); 0234 } 0235 /*again the check if it's a three byte kanji*/ 0236 if(currstr[i] == 0x8f) 0237 i++; 0238 } 0239 } 0240 } 0241 } 0242 } 0243 indptr--; /*correct for the overshoot */ 0244 return indptr; 0245 } 0246 0247 /*===function to read the dictionary files into array, returning filesize===*/ 0248 /*Note: We leave a blank byte in the first byte of the returned dictionary, and 0249 allocate an extra 99 bytes at the end */ 0250 unsigned char* 0251 readDictionary(const char* dictName,uint32_t *filesize) { 0252 FILE *fp; 0253 struct stat buf; 0254 unsigned char *memDictionary; 0255 int nodread; 0256 0257 if(stat(dictName, &buf) != 0) /* if the dict file doesn't exist */ 0258 { 0259 perror(NULL); 0260 printf("Cannot stat: %s \n",dictName); 0261 exit(1); 0262 } 0263 0264 *filesize = buf.st_size; /*file size in bytes*/ 0265 0266 puts ("\nLoading Dictionary file. Please wait.....\n"); 0267 fp=fopen(dictName,"rb"); 0268 if (fp==NULL ) 0269 { 0270 printf("\nCannot open dictionary file\n"); 0271 exit(1); 0272 } 0273 /*Allocate the database index 100 bytes larger than the dict filesize*/ 0274 memDictionary=(unsigned char*)malloc((*filesize+100)*sizeof(unsigned char)); 0275 if(memDictionary == NULL) 0276 { 0277 fprintf(stderr,"malloc() for dictionary failed.\n"); 0278 fclose(fp); 0279 exit(1); 0280 } 0281 0282 nodread = (*filesize)/1024; /*number of kilobytes in the file */ 0283 /*reads 1024 x nodread bytes from fp, storing in memDictionary at offset 1*/ 0284 fread((unsigned char *)memDictionary+1, 1024, nodread, fp); 0285 nodread = (*filesize) % 1024; /* "leftover" bytes after the previous read */ 0286 /*reads the remaining bytes from fp... for what filesystem is this split-read needed?*/ 0287 fread((unsigned char *)(memDictionary+((*filesize)/1024)*1024)+1, nodread,1, fp); 0288 fclose(fp); 0289 0290 return memDictionary; 0291 } 0292 0293 /*======function to sort jindex table====================*/ 0294 /*see the index generator for information about what jindex contains 0295 This simply sorts that output according to the data in the dictionary*/ 0296 void jqsort(int32_t lhs, int32_t rhs) 0297 { 0298 int32_t i,last,midp; 0299 uint32_t temp; 0300 0301 if (lhs >= rhs) return; 0302 0303 midp = (lhs+rhs)/2; /* calculate the midpoint */ 0304 0305 /*Swap (midp,lhs) */ 0306 temp = jindex[lhs]; 0307 jindex[lhs] = jindex[midp]; 0308 jindex[midp] = temp; 0309 0310 last = lhs; 0311 for (i = lhs+1;i <= rhs; i++) 0312 { 0313 if (Kstrcmp(jindex[i],jindex[lhs]) < 0) 0314 { 0315 /* Swap(++last,i);*/ 0316 last++; 0317 temp = jindex[i]; 0318 jindex[i] = jindex[last]; 0319 jindex[last] = temp; 0320 } 0321 } 0322 0323 /* Swap (lhs,last);*/ 0324 temp = jindex[lhs]; 0325 jindex[lhs] = jindex[last]; 0326 jindex[last] = temp; 0327 0328 jqsort(lhs,last-1); 0329 jqsort(last+1,rhs); 0330 } 0331 0332 /*=====string comparison used by jqsort==========================*/ 0333 int Kstrcmp(uint32_t lhs, uint32_t rhs) 0334 { 0335 int i,c1 = 0, c2 = 0; 0336 /* effectively does a strnicmp on two "strings" within the dictionary, 0337 except it will make katakana and hirgana match (EUC A4 & A5) */ 0338 0339 for (i = 0; i<20 ; i++) /*Compare up to 20 chars*/ 0340 { 0341 c1 = db[lhs+i]; 0342 c2 = db[rhs+i]; 0343 0344 if ((i % 2) == 0) /*If we're reading the first byte*/ 0345 { 0346 if (c1 == 0xA5) /*Change hiragana to katakana for */ 0347 c1 = 0xA4; /*The purposes of this comparison */ 0348 if (c2 == 0xA5) 0349 c2 = 0xA4; 0350 } 0351 0352 /*If this is ascii, remove the difference between capitals and small*/ 0353 if ((c1 >= 'A') && (c1 <= 'Z')) c1 |= 0x20; 0354 if ((c2 >= 'A') && (c2 <= 'Z')) c2 |= 0x20; 0355 0356 if (c1 != c2 ) break; 0357 } 0358 return(c1-c2); 0359 } 0360 0361 /*=======function to test a character for alpha or kana/kanji====*/ 0362 int alphaoreuc(unsigned char x) 0363 { 0364 int c; 0365 0366 c = x & 0xff; 0367 if(((c >= 65) && (c <= 90)) || ((c >= 97) && (c <= 122))) 0368 /*ASCII alphabet*/ 0369 { 0370 return (TRUE); 0371 } 0372 if ((c >= '0') && (c <= '9')) 0373 /*digits*/ 0374 { 0375 return(TRUE); 0376 } 0377 if ((c & 0x80) > 0) 0378 /*EUC kanji/kana*/ 0379 { 0380 return(TRUE); 0381 } 0382 return (FALSE); 0383 } 0384