File indexing completed on 2023-10-03 06:53:13

0001 /*
0002     X J D X G E N
0003     Index (.xjdx) generator program from XJDIC
0004 
0005     v2.3 - indexes JIS X 0212 (3-byte EUC) kanji
0006     SPDX-FileCopyrightText: 1998 Jim Breen <jwb@csse.monash.edu.au>
0007     SPDX-License-Identifier: GPL-1.0-or-later
0008 */
0009 
0010 /* Changed: ignore all rc stuff. use args 1 and 2 for input/output file.
0011   -- jason */
0012 
0013 /* Heavily commented, removed the unused header file, split off the
0014     readDictionary function, removed unused functions and variables... cleaned
0015     up the code in general. Preparing for integration to the rest of the program
0016 
0017     Note that this indexer has been hacked off of Jim Breen's xjdic program,
0018     and a lot of the things which have been removed were relevant to that
0019     program, but not to this one.
0020     --Joe
0021     */
0022 
0023 #include <config-kiten.h>
0024 #include <sys/stat.h>
0025 #include <stdio.h>
0026 #include <stdlib.h>
0027 #include <ctype.h>
0028 #include <string.h>
0029 
0030 #ifdef HAVE_STDINT_H
0031 #include <stdint.h>
0032 #endif
0033 #ifdef HAVE_INTTYPES_H
0034 #include <inttypes.h>
0035 #endif
0036 
/*====== constants ==================================================*/
#define TRUE 1
#define FALSE 0
#define SPTAG '@'        /* marker byte that may prefix a (priority) entry */
#define TOKENLIM 40      /* size of the scratch buffer holding one token */
/* The last time the index structure changed was Version 1.4.
   NOTE: no trailing semicolon here, so the macro is usable inside any
   expression and not only as the last token of a statement. */
#define INDEX_VERSION 14

/*====== globals ====================================================*/
unsigned char *db;   /* in-memory dictionary; data starts at offset 1 */
uint32_t *jindex;    /* index table: byte offsets into db; slot 0 is the header */
uint32_t indlen;     /* size of the jindex allocation, in BYTES (not entries) */

/*====== prototypes=================================================*/
void jqsort(int32_t i, int32_t j);
int Kstrcmp(uint32_t lhs, uint32_t rhs);
int alphaoreuc(unsigned char x);
unsigned char* readDictionary(const char* dictName,uint32_t *filesize);
uint32_t buildIndex(unsigned char* dict, uint32_t dictLength);
0053 
0054 /*====function to Load Dictionary and load/create index table=======*/
0055 int main(int argc, char **argv)
0056 {
0057     const char *Dname;
0058     const char *JDXname;
0059     FILE *fp;
0060     uint32_t diclen;
0061     uint32_t indptr;
0062 
0063     printf("\nNOTE: running this program by itself is never necessary. Kiten will run it automatically.\n");
0064     printf("\nXJDXGEN V2.3 Index Table Generator for XJDIC. \n      Copyright J.W. Breen, 1998\n");
0065 
0066     if (argc < 3)
0067     {
0068         printf("\nUSAGE: kitengen input output.xjdx\n");
0069         exit(2);
0070     }
0071 
0072     Dname = argv[1]; /*Name of the dictionary being scanned */
0073     JDXname = argv[2]; /*Name of the output file */
0074     printf("Commandline request to use files %s and %s \n", Dname, JDXname);
0075     printf("\nWARNING!!  This program may take a long time to run .....\n");
0076 
0077     db = readDictionary(Dname,&diclen); /*Reads the dict, but leaves a space at the beginning*/
0078     diclen++; /*add one to the number of bytes considered in the file */
0079     db[diclen] = 10;  /*set the first and final entry in the database to 10 */
0080     db[0] = 10;
0081     printf("Dictionary size: %d bytes.\n",diclen);
0082 
0083 
0084     indlen = (diclen * 3)/4; /*Make a wild guess at the index file length */
0085     jindex = (uint32_t *)malloc(indlen); /* and allocate it */
0086     if(jindex == NULL)
0087     {
0088         fprintf(stderr,"malloc() for index table failed.\n");
0089         exit(1);
0090     }
0091 
0092     printf("Parsing.... \n");
0093   /*this is the dictionary parser. It places an entry in jindex for every
0094    kana/kanji string and every alphabetic string it finds which is >=3
0095    characters */
0096     indptr = buildIndex(db,diclen);
0097 
0098     printf("Index entries: %d  \nSorting (this is slow)......\n",indptr);
0099     jqsort((int32_t)1,indptr);
0100 
0101     printf("Sorted\nWriting index file ....\n");
0102     fp = fopen(JDXname,"wb");
0103     if (fp==NULL )
0104     {
0105         printf("\nCannot open %s output file\n",JDXname);
0106         exit(1);
0107     }
0108     jindex[0] = diclen+INDEX_VERSION; /* prepend the index file size + version # */
0109     fwrite(jindex,sizeof(int32_t),indptr+1,fp);
0110     fclose(fp);
0111 
0112     return 0;
0113 }
0114 
0115 /*=========function to parse the dict file and fill the jindex global with the index====*/
0116 /*=========returns the size of the index file                                       ====*/
0117 /*
0118     A bit of explanation on what this thing generates is probably in order.
0119     Essentially, it fills jindex with a large number of numbers... each number
0120     being an offset to a byte location inside of the dictionary file. Starting
0121     at position index 1 (second pos)
0122     In other words... feeding this thing the dict file
0123     "Llama X1\nJT Fred Flintstone X"
0124     would generate: {<unmodified>,0,6,12,17}.
0125     "X" is skipped because it is only 1 byte long.
0126     "JT" is skipped because it is only two bytes long, the J is regular ascii
0127         (<127), and the T is not a digit. If any of those were different, (it
0128         was longer than 2 bytes, was an euc (kana or kanji) character, or T was
0129         a digit) it would be included in the index.
0130 */
0131 
0132 /*First... an ugly #define to make our code a bit more readable*/
0133 #define INDEX_OVERFLOW_CHECK(x) {if(x > indlen/sizeof(int32_t)) { \
0134     printf("Index table overflow. Dictionary too large?\n"); exit(1); } }
0135 
0136 uint32_t buildIndex(unsigned char *dict, uint32_t dictLength) {
0137     int nowReadingWord = FALSE; /*Boolean to track if we're mid-word in the dict */
0138     uint32_t currentDictCharacter;   /*Current character index in the dict */
0139     unsigned char c;                /*the current reading character*/
0140     unsigned char currstr[TOKENLIM]; /* String that we're currently getting */
0141     int currstrIndex = 0;
0142     uint32_t indptr = 1;        /* next 'slot' in the index to fill */
0143     int saving = FALSE; /*is what we are doing right now slated for salvation?*/
0144 
0145     for (currentDictCharacter =0; currentDictCharacter < dictLength;
0146                             currentDictCharacter++)
0147     {
0148         c = dict[currentDictCharacter]; /* Fetch the next character */
0149 
0150         if(!nowReadingWord) /*if we are NOT in the middle of reading a word */
0151         {
0152             if (alphaoreuc(c) || c == SPTAG) /* if character or priority entry */
0153             {
0154                 nowReadingWord = TRUE;  /* Mark that we're mid word */
0155                 jindex[indptr] = currentDictCharacter;
0156                     /* copy the location of this character to our index structure */
0157                 currstrIndex = 1;
0158                     /*mark the next position in the string to copy a char into */
0159                 currstr[0] = c;
0160                     /*set the current string to be equal to this character so far */
0161                 currstr[1] = '\0';
0162                 saving = TRUE;
0163             }
0164         } else {        /*If we're in the middle of parsing a word atm */
0165 
0166             /*if it's alphanumeric or - or . copy it and increment where the
0167               next one goes */
0168             if ((alphaoreuc(c))||(c == '-')||(c == '.')||((c >= '0') && (c<='9')))
0169             {
0170                 currstr[currstrIndex] = c;
0171                 if(currstrIndex < TOKENLIM-1)
0172                     currstrIndex++;
0173             }
0174             else /* We were reading a word... and we just encountered the
0175                       end of the word */
0176             {
0177                 currstr[currstrIndex] = '\0'; /*null terminate the string */
0178                 nowReadingWord = FALSE;
0179 
0180                 /*Don't save single or dual character items where the
0181                   first item is ascii */
0182                 if ((strlen((const char*)currstr) <= 2) && (currstr[0] < 127))
0183                     saving = FALSE;
0184                 /*EXCEPT: Save anything that's two character where the second
0185                   is a number
0186                     Note that this might catch single 2-byte kanji as well...
0187                     but it might not*/
0188                 if ((strlen((const char*)currstr) == 2) && (currstr[1] <= '9'))
0189                     saving = TRUE;
0190 
0191                 /* This is a latin-character string, either longer than 2 bytes
0192                     or having an ascii digit for a second byte */
0193                 if (saving && (currstr[0] < 127))
0194                 {
0195                     indptr++;
0196                     INDEX_OVERFLOW_CHECK(indptr);
0197 
0198                     /* If this is non-Japanese, and has a 'SPTAGn' tag, generate
0199                         two indices */
0200                     if ( currstr[0] == SPTAG)
0201                     {
0202                         /*make a separate entry pointing to
0203                             the non-SPTAG'd entry (the next byte)*/
0204                         jindex[indptr] = jindex[indptr-1]+1;
0205                             /*overwrite the SPTAG marker*/
0206                         strcpy((char*)currstr,(char*)(currstr+1));
0207                         indptr++;
0208                         INDEX_OVERFLOW_CHECK(indptr);
0209                     }
0210                 }
0211 
0212                 /*For strings that start with non latin characters*/
0213                 if (saving && (currstr[0] > 127))
0214                 {
0215                     uint32_t i;
0216                     uint32_t possav = jindex[indptr]; /*Save the current marker*/
0217                     indptr++;
0218                     INDEX_OVERFLOW_CHECK(indptr);
0219 
0220                     /* generate index for *every* kanji in key */
0221                     i = 2;
0222                     /*if this is a three byte kanji, ignore the 0x8f marker */
0223                     if (currstr[0] == 0x8f)
0224                         i++;
0225                     /*step through... two by two*/
0226                     for ( ;  i < strlen((const char*)currstr);  i+=2)
0227                     {
0228                         if((currstr[i] >= 0xb0) || (currstr[i] == 0x8f))
0229                         {
0230                             /*Add in a specific reference to the kanji*/
0231                             jindex[indptr] = possav+i;
0232                             indptr++;
0233                             INDEX_OVERFLOW_CHECK(indptr);
0234                         }
0235                         /*again the check if it's a three byte kanji*/
0236                         if(currstr[i] == 0x8f)
0237                             i++;
0238                     }
0239                 }
0240             }
0241         }
0242     }
0243     indptr--; /*correct for the overshoot */
0244     return indptr;
0245 }
0246 
/*===function to read the dictionary files into array, returning filesize===*/
/*
    Reads the whole file dictName into a freshly malloc()ed buffer.

    dictName - path of the dictionary file
    filesize - out: size of the file in bytes

    Returns the buffer. Layout: byte 0 is a spare leading byte, the file
    contents occupy bytes 1..*filesize, and 99 further bytes follow. The spare
    byte and the trailing bytes are zeroed. The caller owns the buffer.

    Never returns NULL: on any failure (stat/open/allocation/read error, or a
    file too large for a 32-bit size) a message is printed and the program
    exits with status 1.
*/
unsigned char*
readDictionary(const char* dictName,uint32_t *filesize) {
  /* bytes allocated beyond the file size: 1 leading + 99 trailing */
  enum { DICT_PADDING = 100 };

  FILE *fp;
  struct stat buf;
  unsigned char *memDictionary;
  size_t length;

  if(stat(dictName, &buf) != 0) /* if the dict file doesn't exist */
  {
     perror(NULL);
     printf("Cannot stat: %s \n",dictName);
     exit(1);
  }

  /* The size is handed back as uint32_t and padded below; refuse files for
     which either step would silently wrap. */
  if (buf.st_size < 0 ||
      (unsigned long long)buf.st_size > (unsigned long long)UINT32_MAX - DICT_PADDING)
  {
     fprintf(stderr,"Dictionary file %s is too large to index.\n",dictName);
     exit(1);
  }
  *filesize = (uint32_t)buf.st_size; /*file size in bytes*/
  length = (size_t)buf.st_size;

  puts ("\nLoading Dictionary file.  Please wait.....\n");
  fp=fopen(dictName,"rb");
  if (fp==NULL )
  {
    printf("\nCannot open dictionary file\n");
    exit(1);
  }

  /*Allocate the buffer 100 bytes larger than the dict filesize*/
  memDictionary = malloc(length + DICT_PADDING);
  if(memDictionary == NULL)
  {
      fprintf(stderr,"malloc() for dictionary failed.\n");
      fclose(fp);
      exit(1);
  }

  memDictionary[0] = 0; /* spare leading byte */

  /* One read of the whole file, stored at offset 1 */
  if (fread(memDictionary + 1, 1, length, fp) != length)
  {
      fprintf(stderr,"Short read on dictionary file %s.\n",dictName);
      fclose(fp);
      free(memDictionary);
      exit(1);
  }
  fclose(fp);

  /* Zero the trailing padding so code that looks a few bytes past the end of
     the data sees deterministic values. */
  memset(memDictionary + 1 + length, 0, DICT_PADDING - 1);

  return memDictionary;
}
0292 
0293 /*======function to sort jindex table====================*/
0294 /*see the index generator for information about what jindex contains
0295   This simply sorts that output according to the data in the dictionary*/
0296 void jqsort(int32_t lhs, int32_t rhs)
0297 {
0298     int32_t i,last,midp;
0299     uint32_t temp;
0300 
0301     if (lhs >= rhs) return;
0302 
0303     midp = (lhs+rhs)/2; /* calculate the midpoint */
0304 
0305     /*Swap (midp,lhs) */
0306     temp = jindex[lhs];
0307     jindex[lhs] = jindex[midp];
0308     jindex[midp] = temp;
0309 
0310     last = lhs;
0311     for (i = lhs+1;i <= rhs; i++)
0312         {
0313             if (Kstrcmp(jindex[i],jindex[lhs]) < 0)
0314             {
0315                 /* Swap(++last,i);*/
0316                 last++;
0317                 temp = jindex[i];
0318                 jindex[i] = jindex[last];
0319                 jindex[last] = temp;
0320             }
0321         }
0322 
0323 /*  Swap (lhs,last);*/
0324     temp = jindex[lhs];
0325     jindex[lhs] = jindex[last];
0326     jindex[last] = temp;
0327 
0328     jqsort(lhs,last-1);
0329     jqsort(last+1,rhs);
0330 }
0331 
0332 /*=====string comparison used by jqsort==========================*/
0333 int Kstrcmp(uint32_t lhs, uint32_t rhs)
0334 {
0335     int i,c1 = 0, c2 = 0;
0336 /* effectively does a strnicmp on two "strings" within the dictionary,
0337    except it will make katakana and hirgana match (EUC A4 & A5) */
0338 
0339     for (i = 0; i<20 ; i++) /*Compare up to 20 chars*/
0340     {
0341         c1 = db[lhs+i];
0342         c2 = db[rhs+i];
0343 
0344         if ((i % 2) == 0) /*If we're reading the first byte*/
0345         {
0346             if (c1 == 0xA5) /*Change hiragana to katakana for */
0347                 c1 = 0xA4;   /*The purposes of this comparison */
0348             if (c2 == 0xA5)
0349                 c2 = 0xA4;
0350         }
0351 
0352         /*If this is ascii, remove the difference between capitals and small*/
0353         if ((c1 >= 'A') && (c1 <= 'Z')) c1 |= 0x20;
0354         if ((c2 >= 'A') && (c2 <= 'Z')) c2 |= 0x20;
0355 
0356         if (c1 != c2 ) break;
0357     }
0358     return(c1-c2);
0359 }
0360 
/*=======function to test a character for alpha, digit or kana/kanji====*/
/*
    Returns 1 (TRUE) when x is an ASCII letter, an ASCII digit, or any byte
    with the high bit set (an EUC kana/kanji byte); returns 0 (FALSE) for
    everything else.
*/
int alphaoreuc(unsigned char x)
{
    const int isUpper = (x >= 'A') && (x <= 'Z');
    const int isLower = (x >= 'a') && (x <= 'z');
    const int isDigit = (x >= '0') && (x <= '9');
    const int isEuc = (x & 0x80) != 0; /* high bit set: EUC kanji/kana */

    return isUpper || isLower || isDigit || isEuc;
}
0384