/* File indexing completed on 2025-01-26 04:44:00 */
0001 /* ------------------------------------------------------------------------ 0002 @NAME : string_util.c 0003 @DESCRIPTION: Various string-processing utility functions: 0004 bt_purify_string() 0005 bt_change_case() 0006 0007 and their helpers: 0008 foreign_letter() 0009 purify_special_char() 0010 @GLOBALS : 0011 @CALLS : 0012 @CALLERS : 0013 @CREATED : 1997/10/19, Greg Ward 0014 @MODIFIED : 1997/11/25, GPW: renamed to from purify.c to string_util.c 0015 added bt_change_case() and friends 0016 @VERSION : $Id: string_util.c,v 1.10 1999/10/28 22:50:28 greg Rel $ 0017 -------------------------------------------------------------------------- */ 0018 0019 #include <stdlib.h> 0020 #include <ctype.h> 0021 #include <string.h> 0022 #include <assert.h> 0023 #include "error.h" 0024 #include "btparse.h" 0025 #include "bt_debug.h" 0026 0027 0028 /* 0029 * These definitions should be fixed to be consistent with HTML 0030 * entities, just for fun. And perhaps I should add entries for 0031 * accented letters (at least those supported by TeX and HTML). 
0032 */ 0033 typedef enum 0034 { 0035 L_OTHER, /* not a "foreign" letter */ 0036 L_OSLASH_L, /* Eastern European {\o} */ 0037 L_OSLASH_U, 0038 L_LSLASH_L, /* {\l} */ 0039 L_LSLASH_U, 0040 L_OELIG_L, /* Latin {\oe} ligature */ 0041 L_OELIG_U, 0042 L_AELIG_L, /* {\ae} ligature */ 0043 L_AELIG_U, 0044 L_SSHARP_L, /* German "sharp s" {\ss} */ 0045 L_SSHARP_U, 0046 L_ACIRCLE_L, /* Nordic {\aa} */ 0047 L_ACIRCLE_U, 0048 L_INODOT_L, /* undotted i: {\i} */ 0049 L_JNODOT_L /* {\j} */ 0050 } bt_letter; 0051 0052 0053 static const char * uc_version[] = 0054 { 0055 NULL, /* L_OTHER */ 0056 "\\O", /* L_OSLASH_L */ 0057 "\\O", /* L_OSLASH_U */ 0058 "\\L", /* L_LSLASH_L */ 0059 "\\L", /* L_LSLASH_U */ 0060 "\\OE", /* L_OELIG_L */ 0061 "\\OE", /* L_OELIG_U */ 0062 "\\AE", /* L_AELIG_L */ 0063 "\\AE", /* L_AELIG_U */ 0064 "SS", /* L_SSHARP_L -- for LaTeX 2.09 */ 0065 "\\SS", /* L_SSHARP_U */ 0066 "\\AA", /* L_ACIRCLE_L */ 0067 "\\AA", /* L_ACIRCLE_U */ 0068 "I", /* L_INODOT_L */ 0069 "J" /* L_JNODOT_L */ 0070 }; 0071 0072 static const char * lc_version[] = 0073 { 0074 NULL, /* L_OTHER */ 0075 "\\o", /* L_OSLASH_L */ 0076 "\\o", /* L_OSLASH_U */ 0077 "\\l", /* L_LSLASH_L */ 0078 "\\l", /* L_LSLASH_U */ 0079 "\\oe", /* L_OELIG_L */ 0080 "\\oe", /* L_OELIG_U */ 0081 "\\ae", /* L_AELIG_L */ 0082 "\\ae", /* L_AELIG_U */ 0083 "\\ss", /* L_SSHARP_L */ 0084 "\\ss", /* L_SSHARP_U */ 0085 "\\aa", /* L_ACIRCLE_L */ 0086 "\\aa", /* L_ACIRCLE_U */ 0087 "\\i", /* L_INODOT_L */ 0088 "\\j" /* L_JNODOT_L */ 0089 }; 0090 0091 0092 0093 /* ------------------------------------------------------------------------ 0094 @NAME : foreign_letter() 0095 @INPUT : str 0096 start 0097 stop 0098 @OUTPUT : letter 0099 @RETURNS : TRUE if the string delimited by start and stop is a foreign 0100 letter control sequence 0101 @DESCRIPTION: Determines if a character sequence is one of (La)TeX's 0102 "foreign letter" control sequences (l, o, ae, oe, aa, ss, plus 0103 uppercase versions). 
If `letter' is non-NULL, returns which 0104 letter was found in it (as a bt_letter value). 0105 @CALLS : 0106 @CALLERS : purify_special_char() 0107 @CREATED : 1997/10/19, GPW 0108 @MODIFIED : 0109 -------------------------------------------------------------------------- */ 0110 static boolean 0111 foreign_letter (char *str, int start, int stop, bt_letter * letter) 0112 { 0113 char c1, c2; 0114 bt_letter dummy; 0115 0116 0117 /* 0118 * This is written for speed, not flexibility -- adding new foreign 0119 * letters would be trying and vexatious. 0120 * 0121 * N.B. my gold standard list of foreign letters is Kopka and Daly's 0122 * *A Guide to LaTeX 2e*, section 2.5.6. 0123 */ 0124 0125 if (letter == NULL) /* so we can assign to *letter */ 0126 letter = &dummy; /* without compunctions */ 0127 *letter = L_OTHER; /* assume not a "foreign" letter */ 0128 0129 c1 = str[start+0]; /* only two characters that we're */ 0130 c2 = str[start+1]; /* interested in */ 0131 0132 switch (stop - start) 0133 { 0134 case 1: /* one-character control sequences */ 0135 switch (c1) /* (\o and \l) */ 0136 { 0137 case 'o': 0138 *letter = L_OSLASH_L; return TRUE; 0139 case 'O': 0140 *letter = L_OSLASH_U; return TRUE; 0141 case 'l': 0142 *letter = L_LSLASH_L; return TRUE; 0143 case 'L': 0144 *letter = L_LSLASH_L; return TRUE; 0145 case 'i': 0146 *letter = L_INODOT_L; return TRUE; 0147 case 'j': 0148 *letter = L_JNODOT_L; return TRUE; 0149 default: 0150 return FALSE; 0151 } 0152 break; 0153 case 2: /* two character control sequences */ 0154 switch (c1) /* (\oe, \ae, \aa, and \ss) */ 0155 { 0156 case 'o': 0157 if (c2 == 'e') { *letter = L_OELIG_L; return TRUE; } 0158 case 'O': 0159 if (c2 == 'E') { *letter = L_OELIG_U; return TRUE; } 0160 0161 /* BibTeX 0.99 does not handle \aa and \AA -- but I do!*/ 0162 case 'a': 0163 if (c2 == 'e') 0164 { *letter = L_AELIG_L; return TRUE; } 0165 else if (c2 == 'a') 0166 { *letter = L_ACIRCLE_L; return TRUE; } 0167 else 0168 return FALSE; 0169 case 'A': 0170 
if (c2 == 'E') 0171 { *letter = L_AELIG_U; return TRUE; } 0172 else if (c2 == 'A') 0173 { *letter = L_ACIRCLE_U; return TRUE; } 0174 else 0175 return FALSE; 0176 0177 /* uppercase sharp-s -- new with LaTeX 2e (so far all I do 0178 * is recognize it as a "foreign" letter) 0179 */ 0180 case 's': 0181 if (c2 == 's') 0182 { *letter = L_SSHARP_L; return TRUE; } 0183 else 0184 return FALSE; 0185 case 'S': 0186 if (c2 == 'S') 0187 { *letter = L_SSHARP_U; return TRUE; } 0188 else 0189 return FALSE; 0190 } 0191 break; 0192 default: 0193 return FALSE; 0194 } /* switch on length of control sequence */ 0195 0196 internal_error ("foreign_letter(): should never reach end of function"); 0197 return FALSE; /* to keep gcc -Wall happy */ 0198 0199 } /* foreign_letter */ 0200 0201 0202 /* ------------------------------------------------------------------------ 0203 @NAME : purify_special_char() 0204 @INPUT : *src, *dst - pointers into the input and output strings 0205 @OUTPUT : *src - updated to point to the closing brace of the 0206 special char 0207 *dst - updated to point to the next available spot 0208 for copying text to 0209 @RETURNS : 0210 @DESCRIPTION: "Purifies" a BibTeX special character. On input, *src should 0211 point to the opening brace of a special character (ie. the 0212 brace must be at depth 0 of the whole string, and the 0213 character immediately following it must be a backslash). 0214 *dst should point to the next spot to copy into the output 0215 (purified) string. purify_special_char() will skip over the 0216 opening brace and backslash; if the control sequence is one 0217 of LaTeX's foreign letter sequences (as determined by 0218 foreign_letter()), then it is simply copied to *dst. 0219 Otherwise the control sequence is skipped. In either case, 0220 text after the control sequence is either copied (alphabetic 0221 characters) or skipped (anything else, including hyphens, 0222 ties, and digits). 
0223 @CALLS : foreign_letter() 0224 @CALLERS : bt_purify_string() 0225 @CREATED : 1997/10/19, GPW 0226 @MODIFIED : 0227 -------------------------------------------------------------------------- */ 0228 static void 0229 purify_special_char (char *str, int * src, int * dst) 0230 { 0231 int depth; 0232 int peek; 0233 0234 assert (str[*src] == '{' && str[*src + 1] == '\\'); 0235 depth = 1; 0236 0237 *src += 2; /* jump to start of control sequence */ 0238 peek = *src; /* scan to end of control sequence */ 0239 while (isalpha (str[peek])) 0240 peek++; 0241 if (peek == *src) /* in case of single-char, non-alpha */ 0242 peek++; /* control sequence (eg. {\'e}) */ 0243 0244 if (foreign_letter (str, *src, peek, NULL)) 0245 { 0246 assert (peek - *src == 1 || peek - *src == 2); 0247 str[(*dst)++] = str[(*src)++]; /* copy first char */ 0248 if (*src < peek) /* copy second char, downcasing */ 0249 str[(*dst)++] = tolower (str[(*src)++]); 0250 } 0251 else /* not a foreign letter -- skip */ 0252 { /* the control sequence entirely */ 0253 *src = peek; 0254 } 0255 0256 while (str[*src]) 0257 { 0258 switch (str[*src]) 0259 { 0260 case '{': 0261 depth++; 0262 (*src)++; 0263 break; 0264 case '}': 0265 depth--; 0266 if (depth == 0) return; /* done with special char */ 0267 (*src)++; 0268 break; 0269 default: 0270 if (isalpha (str[*src])) /* copy alphabetic chars */ 0271 str[(*dst)++] = str[(*src)++]; 0272 else /* skip everything else */ 0273 (*src)++; 0274 } 0275 } 0276 0277 /* 0278 * If we get here, we have unbalanced braces -- the '}' case should 0279 * always hit a depth == 0 point if braces are balanced. No warning, 0280 * though, because a) BibTeX doesn't warn about purifying unbalanced 0281 * strings, and b) we (should have) already warned about it in the 0282 * lexer. 
0283 */ 0284 0285 } /* purify_special_char() */ 0286 0287 0288 /* ------------------------------------------------------------------------ 0289 @NAME : bt_purify_string() 0290 @INOUT : instr 0291 @INPUT : options 0292 @OUTPUT : 0293 @RETURNS : instr - same as input string, but modified in place 0294 @DESCRIPTION: "Purifies" a BibTeX string. This consists of copying 0295 alphanumeric characters, converting hyphens and ties to 0296 space, copying spaces, and skipping everything else. (Well, 0297 almost -- special characters are handled specially, of 0298 course. Basically, accented letters have the control 0299 sequence skipped, while foreign letters have the control 0300 sequence preserved in a reasonable manner. See 0301 purify_special_char() for details.) 0302 @CALLS : purify_special_char() 0303 @CALLERS : 0304 @CREATED : 1997/10/19, GPW 0305 @MODIFIED : 0306 -------------------------------------------------------------------------- */ 0307 void 0308 bt_purify_string (char * string, ushort options) 0309 { 0310 int src, /* both indices into string */ 0311 dst; 0312 int depth; /* brace depth in string */ 0313 unsigned orig_len; 0314 0315 /* 0316 * Since purification always copies or deletes chars, outstr will 0317 * be no longer than string -- so nothing fancy is required to put 0318 * an upper bound on its eventual size. 
0319 */ 0320 0321 depth = 0; 0322 src = 0; 0323 dst = 0; 0324 orig_len = strlen (string); 0325 0326 DBG_ACTION (1, printf ("bt_purify_string(): input = %p (%s)\n", 0327 string, string)); 0328 0329 while (string[src] != (char) 0) 0330 { 0331 DBG_ACTION (2, printf (" next: >%c<: ", string[src])); 0332 switch (string[src]) 0333 { 0334 case '~': /* "separator" characters -- */ 0335 case '-': /* replaced with space */ 0336 case ' ': /* and copy an actual space */ 0337 string[dst++] = ' '; 0338 src++; 0339 DBG_ACTION (2, printf ("replacing with space")); 0340 break; 0341 case '{': 0342 if (depth == 0 && string[src+1] == '\\') 0343 { 0344 DBG_ACTION (2, printf ("special char found")); 0345 purify_special_char (string, &src, &dst); 0346 } 0347 else 0348 { 0349 DBG_ACTION (2, printf ("ordinary open brace")); 0350 src++; 0351 } 0352 depth++; 0353 break; 0354 case '}': 0355 DBG_ACTION (2, printf ("close brace")); 0356 depth--; 0357 src++; 0358 break; 0359 default: 0360 if (isalnum (string[src])) /* any alphanumeric char -- */ 0361 { 0362 DBG_ACTION (2, printf ("alphanumeric -- copying")); 0363 string[dst++] = string[src++]; /* copy it */ 0364 } 0365 else /* anything else -- skip it */ 0366 { 0367 DBG_ACTION (2, printf ("non-separator, non-brace, non-alpha")); 0368 src++; 0369 } 0370 } /* switch string[src] */ 0371 0372 DBG_ACTION (2, printf ("\n")); 0373 0374 } /* while string[src] */ 0375 0376 DBG_ACTION (1, printf ("bt_purify_string(): depth on exit: %d\n", depth)); 0377 0378 string[dst] = (char) 0; 0379 assert (strlen (string) <= orig_len); 0380 } /* bt_purify_string() */ 0381 0382 0383 /* ====================================================================== 0384 * Case-transformation stuff 0385 */ 0386 0387 0388 /* ------------------------------------------------------------------------ 0389 @NAME : convert_special_char() 0390 @INPUT : transform 0391 @INOUT : string 0392 src 0393 dst 0394 start_sentence 0395 after_colon 0396 @RETURNS : 0397 @DESCRIPTION: Does case 
conversion on a special character. 0398 @GLOBALS : 0399 @CALLS : 0400 @CALLERS : 0401 @CREATED : 1997/11/25, GPW 0402 @MODIFIED : 0403 -------------------------------------------------------------------------- */ 0404 static void 0405 convert_special_char (char transform, 0406 char * string, 0407 int * src, 0408 int * dst, 0409 boolean * start_sentence, 0410 boolean * after_colon) 0411 { 0412 int depth; 0413 boolean done_special; 0414 int cs_end; 0415 int cs_len; /* counting the backslash */ 0416 bt_letter letter; 0417 const char * repl; 0418 int repl_len; 0419 0420 #ifndef ALLOW_WARNINGS 0421 repl = NULL; /* silence "might be used" */ 0422 /* uninitialized" warning */ 0423 #endif 0424 0425 /* First, copy just the opening brace */ 0426 string[(*dst)++] = string[(*src)++]; 0427 0428 /* 0429 * Now loop over characters inside the braces -- stop when we reach 0430 * the matching close brace, or when the string ends. 0431 */ 0432 depth = 1; /* because we're in a special char */ 0433 done_special = FALSE; 0434 0435 while (string[*src] != 0 && !done_special) 0436 { 0437 switch (string[*src]) 0438 { 0439 case '\\': /* a control sequence */ 0440 { 0441 cs_end = *src+1; /* scan over chars of c.s. */ 0442 while (isalpha (string[cs_end])) 0443 cs_end++; 0444 0445 /* 0446 * OK, now *src points to the backslash (so src+*1 points to 0447 * first char. of control sequence), and cs_end points to 0448 * character immediately following end of control sequence. 0449 * Thus we analyze [*src+1..cs_end] to determine if the control 0450 * sequence is a foreign letter, and use (cs_end - (*src+1) + 1) 0451 * = (cs_end - *src) as the length of the control sequence. 
0452 */ 0453 0454 cs_len = cs_end - *src; /* length of cs, counting backslash */ 0455 0456 if (foreign_letter (string, *src+1, cs_end, &letter)) 0457 { 0458 if (letter == L_OTHER) 0459 internal_error ("impossible foreign letter"); 0460 0461 switch (transform) 0462 { 0463 case 'u': 0464 repl = uc_version[(int) letter]; 0465 break; 0466 case 'l': 0467 repl = lc_version[(int) letter]; 0468 break; 0469 case 't': 0470 if (*start_sentence || *after_colon) 0471 { 0472 repl = uc_version[(int) letter]; 0473 *start_sentence = *after_colon = FALSE; 0474 } 0475 else 0476 { 0477 repl = lc_version[(int) letter]; 0478 } 0479 break; 0480 default: 0481 internal_error ("impossible case transform \"%c\"", 0482 transform); 0483 } 0484 0485 repl_len = strlen (repl); 0486 if (repl_len > cs_len) 0487 internal_error 0488 ("replacement text longer than original cs"); 0489 0490 strncpy (string + *dst, repl, repl_len); 0491 *src = cs_end; 0492 *dst += repl_len; 0493 } /* control sequence is a foreign letter */ 0494 else 0495 { 0496 /* not a foreign letter -- just copy the control seq. as is */ 0497 0498 0499 strncpy (string + *dst, string + *src, cs_end - *src); 0500 *src += cs_len; 0501 assert (*src == cs_end); 0502 *dst += cs_len; 0503 } /* control sequence not a foreign letter */ 0504 0505 break; 0506 } /* case: '\\' */ 0507 0508 case '{': 0509 { 0510 string[(*dst)++] = string[(*src)++]; 0511 depth++; 0512 break; 0513 } 0514 0515 case '}': 0516 { 0517 string[(*dst)++] = string[(*src)++]; 0518 depth--; 0519 if (depth == 0) 0520 done_special = TRUE; 0521 break; 0522 } 0523 0524 default: /* any other character */ 0525 { 0526 switch (transform) 0527 { 0528 /* 0529 * Inside special chars, lowercase and title caps are same. 0530 * (At least, that's bibtex's convention. I might change this 0531 * at some point to be a bit smarter.) 
0532 */ 0533 case 'l': 0534 case 't': 0535 string[(*dst)++] = tolower (string[(*src)++]); 0536 break; 0537 case 'u': 0538 string[(*dst)++] = toupper (string[(*src)++]); 0539 break; 0540 default: 0541 internal_error ("impossible case transform \"%c\"", 0542 transform); 0543 } 0544 } /* default char */ 0545 0546 } /* switch: current char */ 0547 0548 } /* while: string or special char not done */ 0549 0550 } /* convert_special_char() */ 0551 0552 0553 /* ------------------------------------------------------------------------ 0554 @NAME : bt_change_case() 0555 @INPUT : 0556 @OUTPUT : 0557 @RETURNS : 0558 @DESCRIPTION: Converts a string (in-place) to either uppercase, lowercase, 0559 or "title capitalization"> 0560 @GLOBALS : 0561 @CALLS : 0562 @CALLERS : 0563 @CREATED : 1997/11/25, GPW 0564 @MODIFIED : 0565 -------------------------------------------------------------------------- */ 0566 void 0567 bt_change_case (char transform, 0568 char * string, 0569 ushort options) 0570 { 0571 int len; 0572 int depth; 0573 int src, dst; /* indices into string */ 0574 boolean start_sentence; 0575 boolean after_colon; 0576 0577 src = dst = 0; 0578 len = strlen (string); 0579 depth = 0; 0580 0581 start_sentence = TRUE; 0582 after_colon = FALSE; 0583 0584 while (string[src] != 0) 0585 { 0586 switch (string[src]) 0587 { 0588 case '{': 0589 0590 /* 0591 * At start of special character? The entire special char. 0592 * will be handled here, as follows: 0593 * - text at any brace-depth within the s.c. is case-mangled; 0594 * punctuation (sentence endings, colons) are ignored 0595 * - control sequences are left alone, unless they are 0596 * one of the "foreign letter" control sequences, in 0597 * which case they're converted to the appropriate string 0598 * according to the uc_version or lc_version tables. 
0599 */ 0600 if (depth == 0 && string[src+1] == '\\') 0601 { 0602 convert_special_char (transform, string, &src, &dst, 0603 &start_sentence, &after_colon); 0604 } 0605 0606 /* 0607 * Otherwise, it's just something in braces. This is probably 0608 * a proper noun or something encased in braces to protect it 0609 * from case-mangling, so we do not case-mangle it. However, 0610 * we *do* switch out of start_sentence or after_colon mode if 0611 * we happen to be there (otherwise we'll do the wrong thing 0612 * once we're out of the braces). 0613 */ 0614 else 0615 { 0616 string[dst++] = string[src++]; 0617 start_sentence = after_colon = FALSE; 0618 depth++; 0619 } 0620 break; 0621 0622 case '}': 0623 string[dst++] = string[src++]; 0624 depth--; 0625 break; 0626 0627 /* 0628 * Sentence-ending punctuation and colons are handled separately 0629 * to allow for exact mimicing of BibTeX's behaviour. I happen 0630 * to think that this behaviour (capitalize first word of sentences 0631 * in a title) is better than BibTeX's, but I want to keep my 0632 * options open for a future goal of perfect compatibility. 0633 */ 0634 case '.': 0635 case '?': 0636 case '!': 0637 start_sentence = TRUE; 0638 string[dst++] = string[src++]; 0639 break; 0640 0641 case ':': 0642 after_colon = TRUE; 0643 string[dst++] = string[src++]; 0644 break; 0645 0646 default: 0647 if (isspace (string[src])) 0648 { 0649 string[dst++] = string[src++]; 0650 } 0651 else 0652 { 0653 if (depth == 0) 0654 { 0655 switch (transform) 0656 { 0657 case 'u': 0658 string[dst++] = toupper (string[src++]); 0659 break; 0660 case 'l': 0661 string[dst++] = tolower (string[src++]); 0662 break; 0663 case 't': 0664 if (start_sentence || after_colon) 0665 { 0666 /* 0667 * XXX BibTeX only preserves case of character 0668 * immediately after a colon; I do two things 0669 * differently: first, I pay attention to sentence 0670 * punctuation, and second I force uppercase 0671 * at start of sentence or after a colon. 
0672 */ 0673 string[dst++] = toupper (string[src++]); 0674 start_sentence = after_colon = FALSE; 0675 } 0676 else 0677 { 0678 string[dst++] = tolower (string[src++]); 0679 } 0680 break; 0681 default: 0682 internal_error ("impossible case transform \"%c\"", 0683 transform); 0684 } 0685 } /* depth == 0 */ 0686 else 0687 { 0688 string[dst++] = string[src++]; 0689 } 0690 } /* not blank */ 0691 } /* switch on current character */ 0692 0693 } /* while not at end of string */ 0694 0695 } /* bt_change_case */