kdiff3/src/gnudiff_io.cpp

0001 // clang-format off
0002 /* File I/O for GNU DIFF.
0003
0004     Modified for KDiff3 by Joachim Eibl <joachim.eibl at gmx.de> 2003, 2004, 2005.
0005     The original file was part of GNU DIFF.
0006
0007     Part of KDiff3 - Text Diff And Merge Tool
0008
0009     SPDX-FileCopyrightText: 1988-2002 Free Software Foundation, Inc.
0010     SPDX-FileCopyrightText: 2002-2011 Joachim Eibl, joachim.eibl at gmx.de
0011     SPDX-FileCopyrightText: 2018-2020 Michael Reeves reeves.87@gmail.com
0012     SPDX-License-Identifier: GPL-2.0-or-later
0013 */
0014 // clang-format on
0015
0016 #include "gnudiff_diff.h"
0017
0018 #include "Utils.h"
0019
0020 #include <stdlib.h>
0021 #include <type_traits>
0022
0023 /* Rotate an unsigned value to the left.  */
0024 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof(v) * CHAR_BIT - (n)))
0025
0026 /* Given a hash value and a new character, return a new hash value.  */
0027 #define HASH(h, c) ((c) + ROL(h, 7))
0028
0029 /* The type of a hash value.  */
0030 typedef size_t hash_value;
0031 static_assert(std::is_unsigned<hash_value>::value, "hash_value must be signed.");
0032
0033 /* Lines are put into equivalence classes of lines that match in lines_differ.
0034    Each equivalence class is represented by one of these structures,
0035    but only while the classes are being computed.
0036    Afterward, each class is represented by a number.  */
0037 struct equivclass {
0038     GNULineRef next;   /* Next item in this bucket.  */
0039     hash_value hash;   /* Hash of lines in this class.  */
0040     const QChar *line; /* A line that fits this class.  */
0041     size_t length;     /* That line's length, not counting its newline.  */
0042 };
0043
0044 /* Hash-table: array of buckets, each being a chain of equivalence classes.
0045    buckets[-1] is reserved for incomplete lines.  */
0046 static GNULineRef *buckets;
0047
0048 /* Number of buckets in the hash table array, not counting buckets[-1].  */
0049 static size_t nbuckets;
0050
0051 /* Array in which the equivalence classes are allocated.
0052    The bucket-chains go through the elements in this array.
0053    The number of an equivalence class is its index in this array.  */
0054 static equivclass *equivs;
0055
0056 /* Index of first free element in the array `equivs'.  */
0057 static GNULineRef equivs_index;
0058
0059 /* Number of elements allocated in the array `equivs'.  */
0060 static GNULineRef equivs_alloc;
0061
0062 /* Check for binary files and compare them for exact identity.  */
0063
0064 /* Return 1 if BUF contains a non text character.
0065    SIZE is the number of characters in BUF.  */
0066
0067 #define binary_file_p(buf, size) (memchr(buf, 0, size) != 0)
0068
0069 /* Compare two lines (typically one from each input file)
0070    according to the command line options.
0071    For efficiency, this is invoked only when the lines do not match exactly
0072    but an option like -i might cause us to ignore the difference.
0073    Return nonzero if the lines differ.  */
0074
0075 bool GnuDiff::lines_differ(const QChar *s1, size_t len1, const QChar *s2, size_t len2)
0076 {
0077     const QChar *t1 = s1;
0078     const QChar *t2 = s2;
0079     const QChar *s1end = s1 + len1;
0080     const QChar *s2end = s2 + len2;
0081
0082     for(;; ++t1, ++t2)
0083     {
0084         /* Test for exact char equality first, since it's a common case.  */
0085         if(t1 != s1end && t2 != s2end && *t1 == *t2)
0086             continue;
0087         else
0088         {
0089             while(t1 != s1end &&
0090                   ((bIgnoreWhiteSpace && isspace(t1->unicode())) ||
0091                    (bIgnoreNumbers && (t1->isDigit() || *t1 == '-' || *t1 == '.'))))
0092             {
0093                 ++t1;
0094             }
0095
0096             while(t2 != s2end &&
0097                   ((bIgnoreWhiteSpace && isspace(t2->unicode())) ||
0098                    (bIgnoreNumbers && (t2->isDigit() || *t2 == '-' || *t2 == '.'))))
0099             {
0100                 ++t2;
0101             }
0102
0103             if(t1 != s1end && t2 != s2end)
0104             {
0105                 if(ignore_case)
0106                 { /* Lowercase comparison. */
0107                     if(t1->toLower() == t2->toLower())
0108                         continue;
0109                 }
0110                 else if(*t1 == *t2)
0111                     continue;
0112                 else
0113                     return true;
0114             }
0115             else if(t1 == s1end && t2 == s2end)
0116                 return false;
0117             else
0118                 return true;
0119         }
0120     }
0121     return false;
0122 }
0123
0124 /* Split the file into lines, simultaneously computing the equivalence
0125    class for each line.  */
0126
0127 void GnuDiff::find_and_hash_each_line(file_data *current)
0128 {
0129     hash_value h;
0130     const QChar *p = current->prefix_end;
0131     QChar c;
0132     GNULineRef i, *bucket;
0133     size_t length;
0134
0135     /* Cache often-used quantities in local variables to help the compiler.  */
0136     const QChar **linbuf = current->linbuf;
0137     GNULineRef alloc_lines = current->alloc_lines;
0138     GNULineRef line = 0;
0139     GNULineRef linbuf_base = current->linbuf_base;
0140     GNULineRef *cureqs = (GNULineRef *)xmalloc(alloc_lines * sizeof(*cureqs));
0141     equivclass *eqs = equivs;
0142     GNULineRef eqs_index = equivs_index;
0143     GNULineRef eqs_alloc = equivs_alloc;
0144     const QChar *suffix_begin = current->suffix_begin;
0145     const QChar *bufend = current->buffer + current->buffered;
0146     bool diff_length_compare_anyway =
0147         ignore_white_space != IGNORE_NO_WHITE_SPACE || bIgnoreNumbers;
0148     bool same_length_diff_contents_compare_anyway =
0149         diff_length_compare_anyway | ignore_case;
0150
0151     while(p < suffix_begin)
0152     {
0153         const QChar *ip = p;
0154
0155         h = 0;
0156
0157         /* Hash this line until we find a newline or bufend is reached.  */
0158         if(ignore_case)
0159             switch(ignore_white_space)
0160             {
0161                 case IGNORE_ALL_SPACE:
0162                     while(p < bufend && !Utils::isEndOfLine(c = *p))
0163                     {
0164                         if(!(isspace(c.unicode()) || (bIgnoreNumbers && (c.isDigit() || c == '-' || c == '.'))))
0165                             h = HASH(h, c.toLower().unicode());
0166                         ++p;
0167                     }
0168                     break;
0169
0170                 default:
0171                     while(p < bufend && !Utils::isEndOfLine(c = *p))
0172                     {
0173                         h = HASH(h, c.toLower().unicode());
0174                         ++p;
0175                     }
0176                     break;
0177             }
0178         else
0179             switch(ignore_white_space)
0180             {
0181                 case IGNORE_ALL_SPACE:
0182                     while(p < bufend && !Utils::isEndOfLine(c = *p))
0183                     {
0184                         if(!(isspace(c.unicode()) || (bIgnoreNumbers && (c.isDigit() || c == '-' || c == '.'))))
0185                             h = HASH(h, c.unicode());
0186                         ++p;
0187                     }
0188                     break;
0189
0190                 default:
0191                     while(p < bufend && !Utils::isEndOfLine(c = *p))
0192                     {
0193                         h = HASH(h, c.unicode());
0194                         ++p;
0195                     }
0196                     break;
0197             }
0198
0199         bucket = &buckets[h % nbuckets];
0200         length = p - ip;
0201         ++p;
0202
0203         for(i = *bucket;; i = eqs[i].next)
0204             if(!i)
0205             {
0206                 /* Create a new equivalence class in this bucket.  */
0207                 i = eqs_index++;
0208                 if(i == eqs_alloc)
0209                 {
0210                     if((GNULineRef)(GNULINEREF_MAX / (2 * sizeof(*eqs))) <= eqs_alloc)
0211                         xalloc_die();
0212                     eqs_alloc *= 2;
0213                     eqs = (equivclass *)xrealloc(eqs, eqs_alloc * sizeof(*eqs));
0214                 }
0215                 eqs[i].next = *bucket;
0216                 eqs[i].hash = h;
0217                 eqs[i].line = ip;
0218                 eqs[i].length = length;
0219                 *bucket = i;
0220                 break;
0221             }
0222             else if(eqs[i].hash == h)
0223             {
0224                 const QChar *eqline = eqs[i].line;
0225
0226                 /* Reuse existing class if lines_differ reports the lines
0227                equal.  */
0228                 if(eqs[i].length == length)
0229                 {
0230                     /* Reuse existing equivalence class if the lines are identical.
0231            This detects the common case of exact identity
0232            faster than lines_differ would.  */
0233                     if(memcmp(eqline, ip, length * sizeof(QChar)) == 0)
0234                         break;
0235                     if(!same_length_diff_contents_compare_anyway)
0236                         continue;
0237                 }
0238                 else if(!diff_length_compare_anyway)
0239                     continue;
0240
0241                 if(!lines_differ(eqline, eqs[i].length, ip, length))
0242                     break;
0243             }
0244
0245         /* Maybe increase the size of the line table.  */
0246         if(line == alloc_lines)
0247         {
0248             /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
0249             if((GNULineRef)(GNULINEREF_MAX / 3) <= alloc_lines || (GNULineRef)(GNULINEREF_MAX / sizeof(*cureqs)) <= 2 * alloc_lines - linbuf_base || (GNULineRef)(GNULINEREF_MAX / sizeof(ptrdiff_t)) <= alloc_lines - linbuf_base)
0250                 xalloc_die();
0251             alloc_lines = 2 * alloc_lines - linbuf_base;
0252             cureqs = (GNULineRef *)xrealloc(cureqs, alloc_lines * sizeof(*cureqs));
0253             linbuf += linbuf_base;
0254             linbuf = (const QChar **)xrealloc(linbuf,
0255                                               (alloc_lines - linbuf_base) * sizeof(ptrdiff_t));
0256             linbuf -= linbuf_base;
0257         }
0258         linbuf[line] = ip;
0259         cureqs[line] = i;
0260         ++line;
0261     }
0262
0263     current->buffered_lines = line;
0264
0265     for(i = 0;; ++i)
0266     {
0267         /* Record the line start for lines in the suffix that we care about.
0268      Record one more line start than lines,
0269      so that we can compute the length of any buffered line.  */
0270         if(line == alloc_lines)
0271         {
0272             /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
0273             if((GNULineRef)(GNULINEREF_MAX / 3) <= alloc_lines || (GNULineRef)(GNULINEREF_MAX / sizeof(*cureqs)) <= 2 * alloc_lines - linbuf_base || (GNULineRef)(GNULINEREF_MAX / sizeof(ptrdiff_t)) <= alloc_lines - linbuf_base)
0274                 xalloc_die();
0275             alloc_lines = 2 * alloc_lines - linbuf_base;
0276             linbuf += linbuf_base;
0277             linbuf = (const QChar **)xrealloc(linbuf,
0278                                               (alloc_lines - linbuf_base) * sizeof(ptrdiff_t));
0279             linbuf -= linbuf_base;
0280         }
0281         linbuf[line] = p;
0282
0283         if(p >= bufend)
0284             break;
0285
0286         if(context <= i && no_diff_means_no_output)
0287             break;
0288
0289         line++;
0290
0291         while(p < bufend && !Utils::isEndOfLine(*p++))
0292             continue;
0293     }
0294
0295     /* Done with cache in local variables.  */
0296     current->linbuf = linbuf;
0297     current->valid_lines = line;
0298     current->alloc_lines = alloc_lines;
0299     current->equivs = cureqs;
0300     equivs = eqs;
0301     equivs_alloc = eqs_alloc;
0302     equivs_index = eqs_index;
0303 }
0304
0305 /* We have found N lines in a buffer of size S; guess the
0306    proportionate number of lines that will be found in a buffer of
0307    size T.  However, do not guess a number of lines so large that the
0308    resulting line table might cause overflow in size calculations.  */
0309 GNULineRef
0310 GnuDiff::guess_lines(GNULineRef n, size_t s, size_t t)
0311 {
0312     size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
0313     size_t guessed_lines = std::max((size_t)1, t / guessed_bytes_per_line);
0314     return (GNULineRef)std::min((GNULineRef)guessed_lines, (GNULineRef)(GNULINEREF_MAX / (2 * sizeof(ptrdiff_t) + 1) - 5)) + 5;
0315 }
0316
0317 /* Given a vector of two file_data objects, find the identical
0318    prefixes and suffixes of each object.  */
0319
0320 void GnuDiff::find_identical_ends(file_data filevec[])
0321 {
0322     /* Find identical prefix.  */
0323     const QChar *p0, *p1, *buffer0, *buffer1;
0324     p0 = buffer0 = filevec[0].buffer;
0325     p1 = buffer1 = filevec[1].buffer;
0326     size_t n0, n1;
0327     n0 = filevec[0].buffered;
0328     n1 = filevec[1].buffered;
0329     const QChar *const pEnd0 = p0 + n0;
0330     const QChar *const pEnd1 = p1 + n1;
0331
0332     if(p0 == p1)
0333         /* The buffers are the same; sentinels won't work.  */
0334         p0 = p1 += n1;
0335     else
0336     {
0337         /* Loop until first mismatch, or end. */
0338         while(p0 != pEnd0 && p1 != pEnd1 && *p0 == *p1)
0339         {
0340             p0++;
0341             p1++;
0342         }
0343     }
0344
0345     /* Now P0 and P1 point at the first nonmatching characters.  */
0346
0347     /* Skip back to last line-beginning in the prefix. */
0348     while(p0 != buffer0 && !Utils::isEndOfLine(p0[-1]))
0349         p0--, p1--;
0350
0351     /* Record the prefix.  */
0352     filevec[0].prefix_end = p0;
0353     filevec[1].prefix_end = p1;
0354
0355     /* Find identical suffix.  */
0356
0357     /* P0 and P1 point beyond the last chars not yet compared.  */
0358     p0 = buffer0 + n0;
0359     p1 = buffer1 + n1;
0360
0361     const QChar *end0, *beg0;
0362     end0 = p0; /* Addr of last char in file 0.  */
0363
0364     /* Get value of P0 at which we should stop scanning backward:
0365       this is when either P0 or P1 points just past the last char
0366       of the identical prefix.  */
0367     beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
0368
0369     /* Scan back until chars don't match or we reach that point.  */
0370     for(; p0 != beg0; p0--, p1--)
0371     {
0372         if(*p0 != *p1)
0373         {
0374             /* Point at the first char of the matching suffix.  */
0375             beg0 = p0;
0376             break;
0377         }
0378     }
0379
0380     // Go to the next line (skip last line with a difference)
0381     if(p0 != end0)
0382     {
0383         if(*p0 != *p1)
0384             ++p0;
0385         while(p0 < pEnd0 && !Utils::isEndOfLine(*p0++))
0386             continue;
0387     }
0388
0389     p1 += p0 - beg0;
0390
0391     /* Record the suffix.  */
0392     filevec[0].suffix_begin = p0;
0393     filevec[1].suffix_begin = p1;
0394
0395     /* Calculate number of lines of prefix to save.
0396
0397      prefix_count == 0 means save the whole prefix;
0398      we need this for options like -D that output the whole file,
0399      or for enormous contexts (to avoid worrying about arithmetic overflow).
0400      We also need it for options like -F that output some preceding line;
0401      at least we will need to find the last few lines,
0402      but since we don't know how many, it's easiest to find them all.
0403
0404      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
0405      of the line buffer; they'll be moved to the proper location later.
0406      Handle 1 more line than the context says (because we count 1 too many),
0407      rounded up to the next power of 2 to speed index computation.  */
0408
0409     const QChar **linbuf0, **linbuf1;
0410     GNULineRef alloc_lines0, alloc_lines1;
0411     GNULineRef buffered_prefix, prefix_count, prefix_mask;
0412     GNULineRef middle_guess, suffix_guess;
0413     if(no_diff_means_no_output && context < (GNULineRef)(GNULINEREF_MAX / 4) && context < (GNULineRef)(n0))
0414     {
0415         middle_guess = guess_lines(0, 0, p0 - filevec[0].prefix_end);
0416         suffix_guess = guess_lines(0, 0, buffer0 + n0 - p0);
0417         for(prefix_count = 1; prefix_count <= context; prefix_count *= 2)
0418             continue;
0419         alloc_lines0 = (prefix_count + middle_guess + std::min(context, suffix_guess));
0420     }
0421     else
0422     {
0423         prefix_count = 0;
0424         alloc_lines0 = guess_lines(0, 0, n0);
0425     }
0426
0427     prefix_mask = prefix_count - 1;
0428     GNULineRef lines = 0;
0429     linbuf0 = (const QChar **)xmalloc(alloc_lines0 * sizeof(ptrdiff_t));
0430     p0 = buffer0;
0431
0432     /* If the prefix is needed, find the prefix lines.  */
0433     if(!(no_diff_means_no_output && filevec[0].prefix_end == p0 && filevec[1].prefix_end == p1))
0434     {
0435         end0 = filevec[0].prefix_end;
0436         while(p0 != end0)
0437         {
0438             GNULineRef l = lines++ & prefix_mask;
0439             if(l == alloc_lines0)
0440             {
0441                 if((GNULineRef)(GNULINEREF_MAX / (2 * sizeof(ptrdiff_t))) <= alloc_lines0)
0442                     xalloc_die();
0443                 alloc_lines0 *= 2;
0444                 linbuf0 = (const QChar **)xrealloc(linbuf0, alloc_lines0 * sizeof(ptrdiff_t));
0445             }
0446             linbuf0[l] = p0;
0447             while(p0 < pEnd0 && !Utils::isEndOfLine(*p0++))
0448                 continue;
0449         }
0450     }
0451     buffered_prefix = prefix_count && context < lines ? context : lines;
0452
0453     /* Allocate line buffer 1.  */
0454
0455     middle_guess = guess_lines(lines, p0 - buffer0, p1 - filevec[1].prefix_end);
0456     suffix_guess = guess_lines(lines, p0 - buffer0, buffer1 + n1 - p1);
0457     alloc_lines1 = buffered_prefix + middle_guess + std::min(context, suffix_guess);
0458     if(alloc_lines1 < buffered_prefix || (GNULineRef)(GNULINEREF_MAX / sizeof(ptrdiff_t)) <= alloc_lines1)
0459         xalloc_die();
0460     linbuf1 = (const QChar **)xmalloc(alloc_lines1 * sizeof(ptrdiff_t));
0461
0462     GNULineRef i;
0463     if(buffered_prefix != lines)
0464     {
0465         /* Rotate prefix lines to proper location.  */
0466         for(i = 0; i < buffered_prefix; ++i)
0467             linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
0468         for(i = 0; i < buffered_prefix; ++i)
0469             linbuf0[i] = linbuf1[i];
0470     }
0471
0472     /* Initialize line buffer 1 from line buffer 0.  */
0473     for(i = 0; i < buffered_prefix; ++i)
0474         linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
0475
0476     /* Record the line buffer, adjusted so that
0477      linbuf[0] points at the first differing line.  */
0478     filevec[0].linbuf = linbuf0 + buffered_prefix;
0479     filevec[1].linbuf = linbuf1 + buffered_prefix;
0480     filevec[0].linbuf_base = filevec[1].linbuf_base = -buffered_prefix;
0481     filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
0482     filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
0483     filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
0484 }
0485
0486 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
0487    than 2**k.  This table is derived from Chris K. Caldwell's list
0488    <https://www.utm.edu/research/primes/lists/2small/>.  */
0489
0490 static unsigned char const prime_offset[] =
0491     {
0492         0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
0493         15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
0494         11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
0495         55, 93, 1, 57, 25};
0496
0497 /* Verify that this host's size_t is not too wide for the above table.  */
0498
0499 static_assert(sizeof(size_t) * CHAR_BIT <= sizeof prime_offset, "Not enough primes in table");
0500
0501 /* Given a vector of two file_data objects, read the file associated
0502    with each one, and build the table of equivalence classes.
0503    Return nonzero if either file appears to be a binary file.
0504    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
0505
0506 bool GnuDiff::read_files(file_data filevec[], bool /*pretend_binary*/)
0507 {
0508     GNULineRef i;
0509
0510     find_identical_ends(filevec);
0511
0512     equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
0513     if((GNULineRef)(GNULINEREF_MAX / sizeof(*equivs)) <= equivs_alloc)
0514         xalloc_die();
0515     equivs = (equivclass *)xmalloc(equivs_alloc * sizeof(*equivs));
0516     /* Equivalence class 0 is permanently safe for lines that were not
0517      hashed.  Real equivalence classes start at 1.  */
0518     equivs_index = 1;
0519
0520     /* Allocate (one plus) a prime number of hash buckets.  Use a prime
0521      number between 1/3 and 2/3 of the value of equiv_allocs,
0522      approximately.  */
0523     for(i = 9; ((GNULineRef)1 << i) < equivs_alloc / 3; ++i)
0524         continue;
0525     nbuckets = ((GNULineRef)1 << i) - prime_offset[i];
0526     if(GNULINEREF_MAX / sizeof(*buckets) <= nbuckets)
0527         xalloc_die();
0528     buckets = (GNULineRef *)zalloc((nbuckets + 1) * sizeof(*buckets));
0529     buckets++;
0530
0531     for(i = 0; i < 2; ++i)
0532         find_and_hash_each_line(&filevec[i]);
0533
0534     filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
0535
0536     free(equivs);
0537     free(buckets - 1);
0538
0539     return false;
0540 }