3rdparty/btparse/string_util.c

0001 /* ------------------------------------------------------------------------
0002 @NAME       : string_util.c
0003 @DESCRIPTION: Various string-processing utility functions:
0004                 bt_purify_string()
0005                 bt_change_case()
0006
0007               and their helpers:
0008                 foreign_letter()
0009                 purify_special_char()
0010 @GLOBALS    :
0011 @CALLS      :
0012 @CALLERS    :
0013 @CREATED    : 1997/10/19, Greg Ward
0014 @MODIFIED   : 1997/11/25, GPW: renamed from purify.c to string_util.c;
0015                                added bt_change_case() and friends
0016 @VERSION    : $Id: string_util.c,v 1.10 1999/10/28 22:50:28 greg Rel $
0017 -------------------------------------------------------------------------- */
0018
0019 #include <stdlib.h>
0020 #include <ctype.h>
0021 #include <string.h>
0022 #include <assert.h>
0023 #include "error.h"
0024 #include "btparse.h"
0025 #include "bt_debug.h"
0026
0027
/*
 * Identifiers for the (La)TeX "foreign letter" control sequences known
 * to this file.  The numeric value of each enumerator is used directly
 * as an index into the uc_version[] and lc_version[] tables, so the
 * order here and the order of those tables must stay in step.
 *
 * Wish list: make these definitions consistent with HTML entities, and
 * perhaps add entries for accented letters (at least those supported
 * by both TeX and HTML).
 */
typedef enum
{
   L_OTHER     =  0,                    /* not a "foreign" letter */

   L_OSLASH_L  =  1,                    /* {\o} -- Eastern European */
   L_OSLASH_U  =  2,                    /* {\O} */

   L_LSLASH_L  =  3,                    /* {\l} */
   L_LSLASH_U  =  4,                    /* {\L} */

   L_OELIG_L   =  5,                    /* {\oe} -- Latin ligature */
   L_OELIG_U   =  6,                    /* {\OE} */

   L_AELIG_L   =  7,                    /* {\ae} ligature */
   L_AELIG_U   =  8,                    /* {\AE} */

   L_SSHARP_L  =  9,                    /* {\ss} -- German "sharp s" */
   L_SSHARP_U  = 10,                    /* {\SS} */

   L_ACIRCLE_L = 11,                    /* {\aa} -- Nordic */
   L_ACIRCLE_U = 12,                    /* {\AA} */

   L_INODOT_L  = 13,                    /* {\i} -- undotted i */
   L_JNODOT_L  = 14                     /* {\j} -- undotted j */
} bt_letter;
0051
0052
/*
 * Uppercase replacement text for each foreign letter, indexed by
 * bt_letter value.  Lower- and uppercase variants of a letter map to
 * the same replacement, except for sharp-s: the lowercase form becomes
 * plain "SS" (for LaTeX 2.09), the uppercase form stays "\SS".
 * Both the strings and the table itself are read-only.
 */
static const char * const uc_version[] =
{
   NULL,                                /* L_OTHER */
   "\\O",                               /* L_OSLASH_L */
   "\\O",                               /* L_OSLASH_U */
   "\\L",                               /* L_LSLASH_L */
   "\\L",                               /* L_LSLASH_U */
   "\\OE",                              /* L_OELIG_L */
   "\\OE",                              /* L_OELIG_U */
   "\\AE",                              /* L_AELIG_L */
   "\\AE",                              /* L_AELIG_U */
   "SS",                                /* L_SSHARP_L -- for LaTeX 2.09 */
   "\\SS",                              /* L_SSHARP_U */
   "\\AA",                              /* L_ACIRCLE_L */
   "\\AA",                              /* L_ACIRCLE_U */
   "I",                                 /* L_INODOT_L */
   "J"                                  /* L_JNODOT_L */
};
0071
/*
 * Lowercase replacement text for each foreign letter, indexed by
 * bt_letter value.  Lower- and uppercase variants of a letter map to
 * the same replacement.  Both the strings and the table itself are
 * read-only.
 */
static const char * const lc_version[] =
{
   NULL,                                /* L_OTHER */
   "\\o",                               /* L_OSLASH_L */
   "\\o",                               /* L_OSLASH_U */
   "\\l",                               /* L_LSLASH_L */
   "\\l",                               /* L_LSLASH_U */
   "\\oe",                              /* L_OELIG_L */
   "\\oe",                              /* L_OELIG_U */
   "\\ae",                              /* L_AELIG_L */
   "\\ae",                              /* L_AELIG_U */
   "\\ss",                              /* L_SSHARP_L */
   "\\ss",                              /* L_SSHARP_U */
   "\\aa",                              /* L_ACIRCLE_L */
   "\\aa",                              /* L_ACIRCLE_U */
   "\\i",                               /* L_INODOT_L */
   "\\j"                                /* L_JNODOT_L */
};
0090
0091
0092
0093 /* ------------------------------------------------------------------------
0094 @NAME       : foreign_letter()
0095 @INPUT      : str
0096               start
0097               stop
0098 @OUTPUT     : letter
0099 @RETURNS    : TRUE if the string delimited by start and stop is a foreign
0100               letter control sequence
0101 @DESCRIPTION: Determines if a character sequence is one of (La)TeX's
0102               "foreign letter" control sequences (l, o, ae, oe, aa, ss, plus
0103               uppercase versions).  If `letter' is non-NULL, returns which
0104               letter was found in it (as a bt_letter value).
0105 @CALLS      :
0106 @CALLERS    : purify_special_char()
0107 @CREATED    : 1997/10/19, GPW
0108 @MODIFIED   :
0109 -------------------------------------------------------------------------- */
0110 static boolean
0111 foreign_letter (char *str, int start, int stop, bt_letter * letter)
0112 {
0113    char      c1, c2;
0114    bt_letter dummy;
0115
0116
0117    /*
0118     * This is written for speed, not flexibility -- adding new foreign
0119     * letters would be trying and vexatious.
0120     *
0121     * N.B. my gold standard list of foreign letters is Kopka and Daly's
0122     * *A Guide to LaTeX 2e*, section 2.5.6.
0123     */
0124
0125    if (letter == NULL)                  /* so we can assign to *letter */
0126       letter = &dummy;                  /* without compunctions */
0127    *letter = L_OTHER;                   /* assume not a "foreign" letter */
0128
0129    c1 = str[start+0];                   /* only two characters that we're */
0130    c2 = str[start+1];                   /* interested in */
0131
0132    switch (stop - start)
0133    {
0134       case 1:                           /* one-character control sequences */
0135          switch (c1)                    /* (\o and \l) */
0136          {
0137             case 'o':
0138                *letter = L_OSLASH_L; return TRUE;
0139             case 'O':
0140                *letter = L_OSLASH_U; return TRUE;
0141             case 'l':
0142                *letter = L_LSLASH_L; return TRUE;
0143             case 'L':
0144                *letter = L_LSLASH_L; return TRUE;
0145             case 'i':
0146                *letter = L_INODOT_L; return TRUE;
0147             case 'j':
0148                *letter = L_JNODOT_L; return TRUE;
0149             default:
0150                return FALSE;
0151          }
0152          break;
0153       case 2:                           /* two character control sequences */
0154          switch (c1)                    /* (\oe, \ae, \aa, and \ss) */
0155          {
0156             case 'o':
0157                if (c2 == 'e') { *letter = L_OELIG_L; return TRUE; }
0158             case 'O':
0159                if (c2 == 'E') { *letter = L_OELIG_U; return TRUE; }
0160
0161             /* BibTeX 0.99 does not handle \aa and \AA -- but I do!*/
0162             case 'a':
0163                if (c2 == 'e')
0164                   { *letter = L_AELIG_L; return TRUE; }
0165                else if (c2 == 'a')
0166                   { *letter = L_ACIRCLE_L; return TRUE; }
0167                else
0168                   return FALSE;
0169             case 'A':
0170                if (c2 == 'E')
0171                   { *letter = L_AELIG_U; return TRUE; }
0172                else if (c2 == 'A')
0173                   { *letter = L_ACIRCLE_U; return TRUE; }
0174                else
0175                   return FALSE;
0176
0177             /* uppercase sharp-s -- new with LaTeX 2e (so far all I do
0178              * is recognize it as a "foreign" letter)
0179              */
0180             case 's':
0181                if (c2 == 's')
0182                   { *letter = L_SSHARP_L; return TRUE; }
0183                else
0184                   return FALSE;
0185             case 'S':
0186                if (c2 == 'S')
0187                   { *letter = L_SSHARP_U; return TRUE; }
0188                else
0189                   return FALSE;
0190          }
0191          break;
0192       default:
0193          return FALSE;
0194    } /* switch on length of control sequence */
0195
0196    internal_error ("foreign_letter(): should never reach end of function");
0197    return FALSE;                        /* to keep gcc -Wall happy */
0198
0199 } /* foreign_letter */
0200
0201
0202 /* ------------------------------------------------------------------------
0203 @NAME       : purify_special_char()
0204 @INPUT      : *src, *dst - pointers into the input and output strings
0205 @OUTPUT     : *src       - updated to point to the closing brace of the
0206                            special char
0207               *dst       - updated to point to the next available spot
0208                            for copying text to
0209 @RETURNS    :
0210 @DESCRIPTION: "Purifies" a BibTeX special character.  On input, *src should
0211               point to the opening brace of a special character (ie. the
0212               brace must be at depth 0 of the whole string, and the
0213               character immediately following it must be a backslash).
0214               *dst should point to the next spot to copy into the output
0215               (purified) string.  purify_special_char() will skip over the
0216               opening brace and backslash; if the control sequence is one
0217               of LaTeX's foreign letter sequences (as determined by
0218               foreign_letter()), then it is simply copied to *dst.
0219               Otherwise the control sequence is skipped.  In either case,
0220               text after the control sequence is either copied (alphabetic
0221               characters) or skipped (anything else, including hyphens,
0222               ties, and digits).
0223 @CALLS      : foreign_letter()
0224 @CALLERS    : bt_purify_string()
0225 @CREATED    : 1997/10/19, GPW
0226 @MODIFIED   :
0227 -------------------------------------------------------------------------- */
0228 static void
0229 purify_special_char (char *str, int * src, int * dst)
0230 {
0231    int    depth;
0232    int    peek;
0233
0234    assert (str[*src] == '{' && str[*src + 1] == '\\');
0235    depth = 1;
0236
0237    *src += 2;                           /* jump to start of control sequence */
0238    peek = *src;                         /* scan to end of control sequence */
0239    while (isalpha (str[peek]))
0240       peek++;
0241    if (peek == *src)                    /* in case of single-char, non-alpha */
0242       peek++;                           /* control sequence (eg. {\'e}) */
0243
0244    if (foreign_letter (str, *src, peek, NULL))
0245    {
0246       assert (peek - *src == 1 || peek - *src == 2);
0247       str[(*dst)++] = str[(*src)++];    /* copy first char */
0248       if (*src < peek)                  /* copy second char, downcasing */
0249          str[(*dst)++] = tolower (str[(*src)++]);
0250    }
0251    else                                 /* not a foreign letter -- skip */
0252    {                                    /* the control sequence entirely */
0253       *src = peek;
0254    }
0255
0256    while (str[*src])
0257    {
0258       switch (str[*src])
0259       {
0260          case '{':
0261             depth++;
0262             (*src)++;
0263             break;
0264          case '}':
0265             depth--;
0266             if (depth == 0) return;     /* done with special char */
0267             (*src)++;
0268             break;
0269          default:
0270             if (isalpha (str[*src]))    /* copy alphabetic chars */
0271                str[(*dst)++] = str[(*src)++];
0272             else                        /* skip everything else */
0273                (*src)++;
0274       }
0275    }
0276
0277    /*
0278     * If we get here, we have unbalanced braces -- the '}' case should
0279     * always hit a depth == 0 point if braces are balanced.  No warning,
0280     * though, because a) BibTeX doesn't warn about purifying unbalanced
0281     * strings, and b) we (should have) already warned about it in the
0282     * lexer.
0283     */
0284
0285 } /* purify_special_char() */
0286
0287
0288 /* ------------------------------------------------------------------------
0289 @NAME       : bt_purify_string()
0290 @INOUT      : instr
0291 @INPUT      : options
0292 @OUTPUT     :
0293 @RETURNS    : instr   - same as input string, but modified in place
0294 @DESCRIPTION: "Purifies" a BibTeX string.  This consists of copying
0295               alphanumeric characters, converting hyphens and ties to
0296               space, copying spaces, and skipping everything else.  (Well,
0297               almost -- special characters are handled specially, of
0298               course.  Basically, accented letters have the control
0299               sequence skipped, while foreign letters have the control
0300               sequence preserved in a reasonable manner.  See
0301               purify_special_char() for details.)
0302 @CALLS      : purify_special_char()
0303 @CALLERS    :
0304 @CREATED    : 1997/10/19, GPW
0305 @MODIFIED   :
0306 -------------------------------------------------------------------------- */
0307 void
0308 bt_purify_string (char * string, ushort options)
0309 {
0310    int    src,                          /* both indices into string */
0311           dst;
0312    int    depth;                        /* brace depth in string */
0313    unsigned orig_len;
0314
0315    /*
0316     * Since purification always copies or deletes chars, outstr will
0317     * be no longer than string -- so nothing fancy is required to put
0318     * an upper bound on its eventual size.
0319     */
0320
0321    depth = 0;
0322    src = 0;
0323    dst = 0;
0324    orig_len = strlen (string);
0325
0326    DBG_ACTION (1, printf ("bt_purify_string(): input = %p (%s)\n",
0327                           string, string));
0328
0329    while (string[src] != (char) 0)
0330    {
0331       DBG_ACTION (2, printf ("  next: >%c<: ", string[src]));
0332       switch (string[src])
0333       {
0334          case '~':                      /* "separator" characters -- */
0335          case '-':                      /* replaced with space */
0336          case ' ':                      /* and copy an actual space */
0337             string[dst++] = ' ';
0338             src++;
0339             DBG_ACTION (2, printf ("replacing with space"));
0340             break;
0341          case '{':
0342             if (depth == 0 && string[src+1] == '\\')
0343             {
0344                DBG_ACTION (2, printf ("special char found"));
0345                purify_special_char (string, &src, &dst);
0346             }
0347             else
0348             {
0349                DBG_ACTION (2, printf ("ordinary open brace"));
0350                src++;
0351             }
0352             depth++;
0353             break;
0354          case '}':
0355             DBG_ACTION (2, printf ("close brace"));
0356             depth--;
0357             src++;
0358             break;
0359          default:
0360             if (isalnum (string[src]))         /* any alphanumeric char -- */
0361             {
0362                DBG_ACTION (2, printf ("alphanumeric -- copying"));
0363                string[dst++] = string[src++]; /* copy it */
0364             }
0365             else                        /* anything else -- skip it */
0366             {
0367                DBG_ACTION (2, printf ("non-separator, non-brace, non-alpha"));
0368                src++;
0369             }
0370       } /* switch string[src] */
0371
0372       DBG_ACTION (2, printf ("\n"));
0373
0374    } /* while string[src] */
0375
0376    DBG_ACTION (1, printf ("bt_purify_string(): depth on exit: %d\n", depth));
0377
0378    string[dst] = (char) 0;
0379    assert (strlen (string) <= orig_len);
0380 } /* bt_purify_string() */
0381
0382
0383 /* ======================================================================
0384  * Case-transformation stuff
0385  */
0386
0387
0388 /* ------------------------------------------------------------------------
0389 @NAME       : convert_special_char()
0390 @INPUT      : transform
0391 @INOUT      : string
0392               src
0393               dst
0394               start_sentence
0395               after_colon
0396 @RETURNS    :
0397 @DESCRIPTION: Does case conversion on a special character.
0398 @GLOBALS    :
0399 @CALLS      :
0400 @CALLERS    :
0401 @CREATED    : 1997/11/25, GPW
0402 @MODIFIED   :
0403 -------------------------------------------------------------------------- */
0404 static void
0405 convert_special_char (char transform,
0406                       char * string,
0407                       int * src,
0408                       int * dst,
0409                       boolean * start_sentence,
0410                       boolean * after_colon)
0411 {
0412    int       depth;
0413    boolean   done_special;
0414    int       cs_end;
0415    int       cs_len;                    /* counting the backslash */
0416    bt_letter letter;
0417    const char *    repl;
0418    int       repl_len;
0419
0420 #ifndef ALLOW_WARNINGS
0421    repl = NULL;                         /* silence "might be used" */
0422                                         /* uninitialized" warning */
0423 #endif
0424
0425    /* First, copy just the opening brace */
0426    string[(*dst)++] = string[(*src)++];
0427
0428    /*
0429     * Now loop over characters inside the braces -- stop when we reach
0430     * the matching close brace, or when the string ends.
0431     */
0432    depth = 1;                           /* because we're in a special char */
0433    done_special = FALSE;
0434
0435    while (string[*src] != 0 && !done_special)
0436    {
0437       switch (string[*src])
0438       {
0439          case '\\':                     /* a control sequence */
0440          {
0441             cs_end = *src+1;            /* scan over chars of c.s. */
0442             while (isalpha (string[cs_end]))
0443                cs_end++;
0444
0445             /*
0446              * OK, now *src points to the backslash (so src+*1 points to
0447              * first char. of control sequence), and cs_end points to
0448              * character immediately following end of control sequence.
0449              * Thus we analyze [*src+1..cs_end] to determine if the control
0450              * sequence is a foreign letter, and use (cs_end - (*src+1) + 1)
0451              * = (cs_end - *src) as the length of the control sequence.
0452              */
0453
0454             cs_len = cs_end - *src;     /* length of cs, counting backslash */
0455
0456             if (foreign_letter (string, *src+1, cs_end, &letter))
0457             {
0458                if (letter == L_OTHER)
0459                   internal_error ("impossible foreign letter");
0460
0461                switch (transform)
0462                {
0463                   case 'u':
0464                      repl = uc_version[(int) letter];
0465                      break;
0466                   case 'l':
0467                      repl = lc_version[(int) letter];
0468                      break;
0469                   case 't':
0470                      if (*start_sentence || *after_colon)
0471                      {
0472                         repl = uc_version[(int) letter];
0473                         *start_sentence = *after_colon = FALSE;
0474                      }
0475                      else
0476                      {
0477                         repl = lc_version[(int) letter];
0478                      }
0479                      break;
0480                   default:
0481                      internal_error ("impossible case transform \"%c\"",
0482                                      transform);
0483                }
0484
0485                repl_len = strlen (repl);
0486                if (repl_len > cs_len)
0487                   internal_error
0488                      ("replacement text longer than original cs");
0489
0490                strncpy (string + *dst, repl, repl_len);
0491                *src = cs_end;
0492                *dst += repl_len;
0493             } /* control sequence is a foreign letter */
0494             else
0495             {
0496                /* not a foreign letter -- just copy the control seq. as is */
0497
0498
0499                strncpy (string + *dst, string + *src, cs_end - *src);
0500                *src += cs_len;
0501                assert (*src == cs_end);
0502                *dst += cs_len;
0503             } /* control sequence not a foreign letter */
0504
0505             break;
0506          } /* case: '\\' */
0507
0508          case '{':
0509          {
0510             string[(*dst)++] = string[(*src)++];
0511             depth++;
0512             break;
0513          }
0514
0515          case '}':
0516          {
0517             string[(*dst)++] = string[(*src)++];
0518             depth--;
0519             if (depth == 0)
0520                done_special = TRUE;
0521             break;
0522          }
0523
0524          default:                       /* any other character */
0525          {
0526             switch (transform)
0527             {
0528                /*
0529                 * Inside special chars, lowercase and title caps are same.
0530                 * (At least, that's bibtex's convention.  I might change this
0531                 * at some point to be a bit smarter.)
0532                 */
0533                case 'l':
0534                case 't':
0535                   string[(*dst)++] = tolower (string[(*src)++]);
0536                   break;
0537                case 'u':
0538                   string[(*dst)++] = toupper (string[(*src)++]);
0539                   break;
0540                default:
0541                   internal_error ("impossible case transform \"%c\"",
0542                                   transform);
0543             }
0544          } /* default char */
0545
0546       } /* switch: current char */
0547
0548    } /* while: string or special char not done */
0549
0550 } /* convert_special_char() */
0551
0552
0553 /* ------------------------------------------------------------------------
0554 @NAME       : bt_change_case()
0555 @INPUT      :
0556 @OUTPUT     :
0557 @RETURNS    :
0558 @DESCRIPTION: Converts a string (in-place) to either uppercase, lowercase,
0559               or "title capitalization">
0560 @GLOBALS    :
0561 @CALLS      :
0562 @CALLERS    :
0563 @CREATED    : 1997/11/25, GPW
0564 @MODIFIED   :
0565 -------------------------------------------------------------------------- */
0566 void
0567 bt_change_case (char   transform,
0568                 char * string,
0569                 ushort options)
0570 {
0571    int    len;
0572    int    depth;
0573    int    src, dst;                     /* indices into string */
0574    boolean start_sentence;
0575    boolean after_colon;
0576
0577    src = dst = 0;
0578    len = strlen (string);
0579    depth = 0;
0580
0581    start_sentence = TRUE;
0582    after_colon = FALSE;
0583
0584    while (string[src] != 0)
0585    {
0586       switch (string[src])
0587       {
0588          case '{':
0589
0590             /*
0591              * At start of special character?  The entire special char.
0592              * will be handled here, as follows:
0593              *   - text at any brace-depth within the s.c. is case-mangled;
0594              *     punctuation (sentence endings, colons) are ignored
0595              *   - control sequences are left alone, unless they are
0596              *     one of the "foreign letter" control sequences, in
0597              *     which case they're converted to the appropriate string
0598              *     according to the uc_version or lc_version tables.
0599              */
0600             if (depth == 0 && string[src+1] == '\\')
0601             {
0602                convert_special_char (transform, string, &src, &dst,
0603                                      &start_sentence, &after_colon);
0604             }
0605
0606             /*
0607              * Otherwise, it's just something in braces.  This is probably
0608              * a proper noun or something encased in braces to protect it
0609              * from case-mangling, so we do not case-mangle it.  However,
0610              * we *do* switch out of start_sentence or after_colon mode if
0611              * we happen to be there (otherwise we'll do the wrong thing
0612              * once we're out of the braces).
0613              */
0614             else
0615             {
0616                string[dst++] = string[src++];
0617                start_sentence = after_colon = FALSE;
0618                depth++;
0619             }
0620             break;
0621
0622          case '}':
0623             string[dst++] = string[src++];
0624             depth--;
0625             break;
0626
0627          /*
0628           * Sentence-ending punctuation and colons are handled separately
0629           * to allow for exact mimicing of BibTeX's behaviour.  I happen
0630           * to think that this behaviour (capitalize first word of sentences
0631           * in a title) is better than BibTeX's, but I want to keep my
0632           * options open for a future goal of perfect compatibility.
0633           */
0634          case '.':
0635          case '?':
0636          case '!':
0637             start_sentence = TRUE;
0638             string[dst++] = string[src++];
0639             break;
0640
0641          case ':':
0642             after_colon = TRUE;
0643             string[dst++] = string[src++];
0644             break;
0645
0646          default:
0647             if (isspace (string[src]))
0648             {
0649                string[dst++] = string[src++];
0650             }
0651             else
0652             {
0653                if (depth == 0)
0654                {
0655                   switch (transform)
0656                   {
0657                      case 'u':
0658                         string[dst++] = toupper (string[src++]);
0659                         break;
0660                      case 'l':
0661                         string[dst++] = tolower (string[src++]);
0662                         break;
0663                      case 't':
0664                         if (start_sentence || after_colon)
0665                         {
0666                            /*
0667                             * XXX BibTeX only preserves case of character
0668                             * immediately after a colon; I do two things
0669                             * differently: first, I pay attention to sentence
0670                             * punctuation, and second I force uppercase
0671                             * at start of sentence or after a colon.
0672                             */
0673                            string[dst++] = toupper (string[src++]);
0674                            start_sentence = after_colon = FALSE;
0675                         }
0676                         else
0677                         {
0678                            string[dst++] = tolower (string[src++]);
0679                         }
0680                         break;
0681                      default:
0682                         internal_error ("impossible case transform \"%c\"",
0683                                         transform);
0684                   }
0685                } /* depth == 0 */
0686                else
0687                {
0688                   string[dst++] = string[src++];
0689                }
0690             } /* not blank */
0691       } /* switch on current character */
0692
0693    } /* while not at end of string */
0694
0695 } /* bt_change_case */