// clang-format off
/* Analyze file differences for GNU DIFF.

   Modified for KDiff3 by Joachim Eibl <joachim.eibl at gmx.de> 2003.
   The original file was part of GNU DIFF.

   Part of KDiff3 - Text Diff And Merge Tool

   SPDX-FileCopyrightText: 1988-2002 Free Software Foundation, Inc.
   SPDX-FileCopyrightText: 2002-2011 Joachim Eibl, joachim.eibl at gmx.de
   SPDX-FileCopyrightText: 2018-2020 Michael Reeves reeves.87@gmail.com
   SPDX-License-Identifier: GPL-2.0-or-later
*/

/* The basic algorithm is described in:
   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
   Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
   see especially section 4.2, which describes the variation used below.
   Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
   at the price of producing suboptimal output for large inputs with
   many differences.

   The basic algorithm was independently discovered as described in:
   "Algorithms for Approximate String Matching", E. Ukkonen,
   Information and Control Vol. 64, 1985, pp. 100-118. */
// clang-format on

#define GDIFF_MAIN

#include "gnudiff_diff.h"

#include <algorithm> // for max, min
#include <cassert>   // for assert
#include <stdlib.h>

static GNULineRef *xvec, *yvec;  /* Vectors being compared. */
static GNULineRef *fdiag;        /* Vector, indexed by diagonal, containing
                                    1 + the X coordinate of the point furthest
                                    along the given diagonal in the forward
                                    search of the edit matrix. */
static GNULineRef *bdiag;        /* Vector, indexed by diagonal, containing
                                    the X coordinate of the point furthest
                                    along the given diagonal in the backward
                                    search of the edit matrix. */
static GNULineRef too_expensive; /* Edit scripts longer than this are too
                                    expensive to compute. */

#define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'. */

struct partition {
    GNULineRef xmid, ymid; /* Midpoints of this partition. */
    bool lo_minimal;       /* True if low half will be analyzed minimally. */
    bool hi_minimal;       /* Likewise for high half. */
};

/* Find the midpoint of the shortest edit script for a specified
   portion of the two files.

   Scan from the beginnings of the files, and simultaneously from the ends,
   doing a breadth-first search through the space of edit-sequence.
   When the two searches meet, we have found the midpoint of the shortest
   edit sequence.

   If FIND_MINIMAL is nonzero, find the minimal edit script regardless
   of expense.  Otherwise, if the search is too expensive, use
   heuristics to stop the search and report a suboptimal answer.

   Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
   XMID - YMID equals the number of inserted lines minus the number
   of deleted lines (counting only lines before the midpoint).
   Return the approximate edit cost; this is the total number of
   lines inserted or deleted (counting only lines before the midpoint),
   unless a heuristic is used to terminate the search prematurely.

   Set PART->lo_minimal to true iff the minimal edit script for the
   left half of the partition is known; similarly for PART->hi_minimal.

   This function assumes that the first lines of the specified portions
   of the two files do not match, and likewise that the last lines do not
   match.  The caller must trim matching lines from the beginning and end
   of the portions it is going to specify.

   If we return the "wrong" partitions,
   the worst this can do is cause suboptimal diff output.
   It cannot cause incorrect diff output. */

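/* Note on indexing: diagonal number d identifies the points (x, y) with
   x - y == d, so d ranges roughly from -(number of lines of file 1) to
   +(number of lines of file 0), and the search below also touches d - 1 and
   d + 1 just outside that range.  diff_2_files therefore allocates fdiag and
   bdiag with room for both files plus three extra entries and offsets both
   pointers by file 1's nondiscarded line count + 1, which turns these
   negative diagonals into valid array indices. */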
GNULineRef GnuDiff::diag(GNULineRef xoff, GNULineRef xlim, GNULineRef yoff, GNULineRef ylim, bool find_minimal,
                         partition *part) const
{
    GNULineRef *const fd = fdiag;        /* Give the compiler a chance. */
    GNULineRef *const bd = bdiag;        /* Additional help for the compiler. */
    GNULineRef const *const xv = xvec;   /* Still more help for the compiler. */
    GNULineRef const *const yv = yvec;   /* And more and more . . . */
    GNULineRef const dmin = xoff - ylim; /* Minimum valid diagonal. */
    GNULineRef const dmax = xlim - yoff; /* Maximum valid diagonal. */
    GNULineRef const fmid = xoff - yoff; /* Center diagonal of top-down search. */
    GNULineRef const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
    GNULineRef fmin = fmid, fmax = fmid; /* Limits of top-down search. */
    GNULineRef bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */
    GNULineRef c;                        /* Cost. */
    bool odd = (fmid - bmid) & 1;        /* True if southeast corner is on an odd
                                            diagonal with respect to the northwest. */

    fd[fmid] = xoff;
    bd[bmid] = xlim;

    for(c = 1;; ++c)
    {
        GNULineRef d; /* Active diagonal. */
        bool big_snake = false;

        /* Extend the top-down search by an edit step in each diagonal. */
        fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
        fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
        for(d = fmax; d >= fmin; d -= 2)
        {
            GNULineRef x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];

            if(tlo >= thi)
                x = tlo + 1;
            else
                x = thi;
            oldx = x;
            y = x - d;
            while(x < xlim && y < ylim && xv[x] == yv[y])
                ++x, ++y;
            if(x - oldx > SNAKE_LIMIT)
                big_snake = true;
            fd[d] = x;
            if(odd && bmin <= d && d <= bmax && bd[d] <= x)
            {
                part->xmid = x;
                part->ymid = y;
                part->lo_minimal = part->hi_minimal = true;
                return 2 * c - 1;
            }
        }

        /* Similarly extend the bottom-up search. */
        bmin > dmin ? bd[--bmin - 1] = GNULINEREF_MAX : ++bmin;
        bmax < dmax ? bd[++bmax + 1] = GNULINEREF_MAX : --bmax;
        for(d = bmax; d >= bmin; d -= 2)
        {
            GNULineRef x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];

            if(tlo < thi)
                x = tlo;
            else
                x = thi - 1;
            oldx = x;
            y = x - d;
            while(x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
                --x, --y;
            if(oldx - x > SNAKE_LIMIT)
                big_snake = true;
            bd[d] = x;
            if(!odd && fmin <= d && d <= fmax && x <= fd[d])
            {
                part->xmid = x;
                part->ymid = y;
                part->lo_minimal = part->hi_minimal = true;
                return 2 * c;
            }
        }

        if(find_minimal)
            continue;

        /* Heuristic: check occasionally for a diagonal that has made
           lots of progress compared with the edit distance.
           If we have any such, find the one that has made the most
           progress and return it as if it had succeeded.

           With this heuristic, for files with a constant small density
           of changes, the algorithm is linear in the file size. */
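        /* In the test below, v is the total x + y progress already made along
           diagonal d, c counts the edit steps taken so far by the forward
           search, and |dd| penalizes diagonals far from the centre; the
           thresholds 200 and 12 are tuning constants inherited from GNU diff. */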

        if(200 < c && big_snake && speed_large_files)
        {
            GNULineRef best;

            best = 0;
            for(d = fmax; d >= fmin; d -= 2)
            {
                GNULineRef dd = d - fmid;
                GNULineRef x = fd[d];
                GNULineRef y = x - d;
                GNULineRef v = (x - xoff) * 2 - dd;
                if(v > 12 * (c + (dd < 0 ? -dd : dd)))
                {
                    if(v > best && xoff + SNAKE_LIMIT <= x && x < xlim && yoff + SNAKE_LIMIT <= y && y < ylim)
                    {
                        /* We have a good enough best diagonal;
                           now insist that it end with a significant snake. */
                        qint32 k;

                        for(k = 1; xv[x - k] == yv[y - k]; k++)
                            if(k == SNAKE_LIMIT)
                            {
                                best = v;
                                part->xmid = x;
                                part->ymid = y;
                                break;
                            }
                    }
                }
            }
            if(best > 0)
            {
                part->lo_minimal = true;
                part->hi_minimal = false;
                return 2 * c - 1;
            }

            best = 0;
            for(d = bmax; d >= bmin; d -= 2)
            {
                GNULineRef dd = d - bmid;
                GNULineRef x = bd[d];
                GNULineRef y = x - d;
                GNULineRef v = (xlim - x) * 2 + dd;
                if(v > 12 * (c + (dd < 0 ? -dd : dd)))
                {
                    if(v > best && xoff < x && x <= xlim - SNAKE_LIMIT && yoff < y && y <= ylim - SNAKE_LIMIT)
                    {
                        /* We have a good enough best diagonal;
                           now insist that it end with a significant snake. */
                        qint32 k;

                        for(k = 0; xv[x + k] == yv[y + k]; k++)
                            if(k == SNAKE_LIMIT - 1)
                            {
                                best = v;
                                part->xmid = x;
                                part->ymid = y;
                                break;
                            }
                    }
                }
            }
            if(best > 0)
            {
                part->lo_minimal = false;
                part->hi_minimal = true;
                return 2 * c - 1;
            }
        }

        /* Heuristic: if we've gone well beyond the call of duty,
           give up and report halfway between our best results so far. */
        if(c >= too_expensive)
        {
            GNULineRef fxybest, fxbest;
            GNULineRef bxybest, bxbest;

            fxbest = bxbest = 0; /* Pacify `gcc -Wall'. */

            /* Find forward diagonal that maximizes X + Y. */
            fxybest = -1;
            for(d = fmax; d >= fmin; d -= 2)
            {
                GNULineRef x = std::min(fd[d], xlim);
                GNULineRef y = x - d;
                if(ylim < y)
                    x = ylim + d, y = ylim;
                if(fxybest < x + y)
                {
                    fxybest = x + y;
                    fxbest = x;
                }
            }

            /* Find backward diagonal that minimizes X + Y. */
            bxybest = GNULINEREF_MAX;
            for(d = bmax; d >= bmin; d -= 2)
            {
                GNULineRef x = std::max(xoff, bd[d]);
                GNULineRef y = x - d;
                if(y < yoff)
                    x = yoff + d, y = yoff;
                if(x + y < bxybest)
                {
                    bxybest = x + y;
                    bxbest = x;
                }
            }

            /* Use the better of the two diagonals. */
            if((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
            {
                part->xmid = fxbest;
                part->ymid = fxybest - fxbest;
                part->lo_minimal = true;
                part->hi_minimal = false;
            }
            else
            {
                part->xmid = bxbest;
                part->ymid = bxybest - bxbest;
                part->lo_minimal = false;
                part->hi_minimal = true;
            }
            return 2 * c - 1;
        }
    }
}

/* Compare in detail contiguous subsequences of the two files
   which are known, as a whole, to match each other.

   The results are recorded in the vectors files[N].changed, by
   storing 1 in the element for each line that is an insertion or deletion.

   The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.

   Note that XLIM, YLIM are exclusive bounds.
   All line numbers are origin-0 and discarded lines are not counted.

   If FIND_MINIMAL, find a minimal difference no matter how
   expensive it is. */

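/* XOFF/XLIM and YOFF/YLIM index the compacted `undiscarded' line arrays built
   by discard_confusing_lines; the realindexes tables translate those virtual
   positions back to real line numbers when the changed flags are set. */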
void GnuDiff::compareseq(GNULineRef xoff, GNULineRef xlim, GNULineRef yoff, GNULineRef ylim, bool find_minimal)
{
    GNULineRef *const xv = xvec; /* Help the compiler. */
    GNULineRef *const yv = yvec;

    /* Slide down the bottom initial diagonal. */
    while(xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
        ++xoff, ++yoff;
    /* Slide up the top initial diagonal. */
    while(xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
        --xlim, --ylim;

    /* Handle simple cases. */
    if(xoff == xlim)
        while(yoff < ylim)
            files[1].changed[files[1].realindexes[yoff++]] = true;
    else if(yoff == ylim)
        while(xoff < xlim)
            files[0].changed[files[0].realindexes[xoff++]] = true;
    else
    {
        GNULineRef c;
        partition part;

        /* Find a point of correspondence in the middle of the files. */

        c = diag(xoff, xlim, yoff, ylim, find_minimal, &part);

        /* This should be impossible, because it implies that
           one of the two subsequences is empty,
           and that case was handled above without calling `diag'. */
        assert(c != 1);

        /* Use the partitions to split this problem into subproblems. */
        compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
        compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
    }
}

/* Discard lines from one file that have no matches in the other file.

   A line which is discarded will not be considered by the actual
   comparison algorithm; it will be as if that line were not in the file.
   The file's `realindexes' table maps virtual line numbers
   (which don't count the discarded lines) into real line numbers;
   this is how the actual comparison algorithm produces results
   that are comprehensible when the discarded lines are counted.

   When we discard a line, we also mark it as a deletion or insertion
   so that it will be printed in the output. */

void GnuDiff::discard_confusing_lines(file_data filevec[])
{
    qint32 f;
    GNULineRef i;
    char *discarded[2];
    GNULineRef *equiv_count[2];
    GNULineRef *p;

    /* Allocate our results. */
    p = (GNULineRef *)xmalloc((filevec[0].buffered_lines + filevec[1].buffered_lines) * (2 * sizeof(*p)));
    for(f = 0; f < 2; ++f)
    {
        filevec[f].undiscarded = p;
        p += filevec[f].buffered_lines;
        filevec[f].realindexes = p;
        p += filevec[f].buffered_lines;
    }

    /* Set up equiv_count[F][I] as the number of lines in file F
       that fall in equivalence class I. */

    p = (GNULineRef *)zalloc(filevec[0].equiv_max * (2 * sizeof(*p)));
    equiv_count[0] = p;
    equiv_count[1] = p + filevec[0].equiv_max;

    for(i = 0; i < filevec[0].buffered_lines; ++i)
        ++equiv_count[0][filevec[0].equivs[i]];
    for(i = 0; i < filevec[1].buffered_lines; ++i)
        ++equiv_count[1][filevec[1].equivs[i]];

    /* Set up tables of which lines are going to be discarded. */

    discarded[0] = (char *)zalloc(filevec[0].buffered_lines + filevec[1].buffered_lines);
    discarded[1] = discarded[0] + filevec[0].buffered_lines;

    /* Mark to be discarded each line that matches no line of the other file.
       If a line matches many lines, mark it as provisionally discardable. */
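    /* Illustrative numbers: for a 40000-line file, end / 64 is 625 and the
       loop below doubles MANY four times, so a line becomes provisionally
       discardable once it matches more than 80 lines of the other file. */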

    for(f = 0; f < 2; ++f)
    {
        size_t end = filevec[f].buffered_lines;
        char *discards = discarded[f];
        GNULineRef *counts = equiv_count[1 - f];
        GNULineRef *equivs = filevec[f].equivs;
        size_t many = 5;
        size_t tem = end / 64;

        /* Multiply MANY by approximate square root of number of lines.
           That is the threshold for provisionally discardable lines. */
        while((tem = tem >> 2) > 0)
            many *= 2;

        for(i = 0; i < (GNULineRef)end; ++i)
        {
            GNULineRef nmatch;
            if(equivs[i] == 0)
                continue;
            nmatch = counts[equivs[i]];
            if(nmatch == 0)
                discards[i] = 1;
            else if(nmatch > (GNULineRef)many)
                discards[i] = 2;
        }
    }

    /* Don't really discard the provisional lines except when they occur
       in a run of discardables, with nonprovisionals at the beginning
       and end. */

    for(f = 0; f < 2; ++f)
    {
        GNULineRef end = filevec[f].buffered_lines;
        char *discards = discarded[f];

        for(i = 0; i < end; ++i)
        {
            /* Cancel provisional discards not in middle of run of discards. */
            if(discards[i] == 2)
                discards[i] = 0;
            else if(discards[i] != 0)
            {
                /* We have found a nonprovisional discard. */
                GNULineRef j;
                GNULineRef length;
                GNULineRef provisional = 0;

                /* Find end of this run of discardable lines.
                   Count how many are provisionally discardable. */
                for(j = i; j < end; ++j)
                {
                    if(discards[j] == 0)
                        break;
                    if(discards[j] == 2)
                        ++provisional;
                }

                /* Cancel provisional discards at end, and shrink the run. */
                while(j > i && discards[j - 1] == 2)
                    discards[--j] = 0, --provisional;

                /* Now we have the length of a run of discardable lines
                   whose first and last are not provisional. */
                length = j - i;

                /* If more than 1/4 of the lines in the run are provisional,
                   cancel discarding of all provisional lines in the run. */
                if(provisional * 4 > length)
                {
                    while(j > i)
                        if(discards[--j] == 2)
                            discards[j] = 0;
                }
                else
                {
                    GNULineRef consec;
                    GNULineRef minimum = 1;
                    GNULineRef tem = length >> 2;

                    /* MINIMUM is approximate square root of LENGTH/4.
                       A subrun of two or more provisionals can stand
                       when LENGTH is at least 16.
                       A subrun of 4 or more can stand when LENGTH >= 64. */
                    while(0 < (tem >>= 2))
                        minimum <<= 1;
                    minimum++;

                    /* Cancel any subrun of MINIMUM or more provisionals
                       within the larger run. */
                    for(j = 0, consec = 0; j < length; ++j)
                        if(discards[i + j] != 2)
                            consec = 0;
                        else if(minimum == ++consec)
                            /* Back up to start of subrun, to cancel it all. */
                            j -= consec;
                        else if(minimum < consec)
                            discards[i + j] = 0;

                    /* Scan from beginning of run
                       until we find 3 or more nonprovisionals in a row
                       or until the first nonprovisional at least 8 lines in.
                       Until that point, cancel any provisionals. */
                    for(j = 0, consec = 0; j < length; ++j)
                    {
                        if(j >= 8 && discards[i + j] == 1)
                            break;
                        if(discards[i + j] == 2)
                            consec = 0, discards[i + j] = 0;
                        else if(discards[i + j] == 0)
                            consec = 0;
                        else
                            consec++;
                        if(consec == 3)
                            break;
                    }

                    /* I advances to the last line of the run. */
                    i += length - 1;

                    /* Same thing, from end. */
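                    /* Here i is the last line of the run, so discards[i - j]
                       below walks the same run backwards. */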
                    for(j = 0, consec = 0; j < length; ++j)
                    {
                        if(j >= 8 && discards[i - j] == 1)
                            break;
                        if(discards[i - j] == 2)
                            consec = 0, discards[i - j] = 0;
                        else if(discards[i - j] == 0)
                            consec = 0;
                        else
                            consec++;
                        if(consec == 3)
                            break;
                    }
                }
            }
        }
    }

    /* Actually discard the lines. */
    for(f = 0; f < 2; ++f)
    {
        char *discards = discarded[f];
        GNULineRef end = filevec[f].buffered_lines;
        GNULineRef j = 0;
        for(i = 0; i < end; ++i)
            if(minimal || discards[i] == 0)
            {
                filevec[f].undiscarded[j] = filevec[f].equivs[i];
                filevec[f].realindexes[j++] = i;
            }
            else
                filevec[f].changed[i] = true;
        filevec[f].nondiscarded_lines = j;
    }

    free(discarded[0]);
    free(equiv_count[0]);
}

/* Adjust inserts/deletes of identical lines to join changes
   as much as possible.

   We do something when a run of changed lines includes a
   line at one end and has an excluded, identical line at the other.
   We are free to choose which identical line is included.
   `compareseq' usually chooses the one at the beginning,
   but it is usually cleaner to consider the following identical line
   to be the "change". */

void GnuDiff::shift_boundaries(file_data filevec[])
{
    qint32 f;

    for(f = 0; f < 2; ++f)
    {
        bool *changed = filevec[f].changed;
        bool const *other_changed = filevec[1 - f].changed;
        GNULineRef const *equivs = filevec[f].equivs;
        GNULineRef i = 0;
        GNULineRef j = 0;
        GNULineRef i_end = filevec[f].buffered_lines;

        while(true)
        {
            GNULineRef runlength, start, corresponding;

            /* Scan forwards to find beginning of another run of changes.
               Also keep track of the corresponding point in the other file. */

            while(i < i_end && !changed[i])
            {
                while(other_changed[j++])
                    continue;
                i++;
            }

            if(i == i_end)
                break;

            start = i;

            /* Find the end of this run of changes. */

            while(changed[++i])
                continue;
            while(other_changed[j])
                j++;

            do
            {
                /* Record the length of this run of changes, so that
                   we can later determine whether the run has grown. */
                runlength = i - start;

                /* Move the changed region back, so long as the
                   previous unchanged line matches the last changed one.
                   This merges with previous changed regions. */

                while(start && equivs[start - 1] == equivs[i - 1])
                {
                    changed[--start] = true;
                    changed[--i] = false;
                    while(changed[start - 1])
                        start--;
                    while(other_changed[--j])
                        continue;
                }

                /* Set CORRESPONDING to the end of the changed run, at the last
                   point where it corresponds to a changed run in the other file.
                   CORRESPONDING == I_END means no such point has been found. */
                corresponding = other_changed[j - 1] ? i : i_end;

                /* Move the changed region forward, so long as the
                   first changed line matches the following unchanged one.
                   This merges with following changed regions.
                   Do this second, so that if there are no merges,
                   the changed region is moved forward as far as possible. */
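                /* For example, comparing  a b b c  against  a b c : if the run
                   of changes currently covers the first `b', the first changed
                   line equals the line just after the run, so the loop below
                   slides the change onto the second `b'. */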

                while(i != i_end && equivs[start] == equivs[i])
                {
                    changed[start++] = false;
                    changed[i++] = true;
                    while(changed[i])
                        i++;
                    while(other_changed[++j])
                        corresponding = i;
                }
            } while(runlength != i - start);

            /* If possible, move the fully-merged run of changes
               back to a corresponding run in the other file. */

            while(corresponding < i)
            {
                changed[--start] = true;
                changed[--i] = false;
                while(other_changed[--j])
                    continue;
            }
        }
    }
}

/* Cons an additional entry onto the front of an edit script OLD.
   LINE0 and LINE1 are the first affected lines in the two files (origin 0).
   DELETED is the number of lines deleted here from file 0.
   INSERTED is the number of lines inserted here in file 1.

   If DELETED is 0 then LINE0 is the number of the line before
   which the insertion was done; vice versa for INSERTED and LINE1. */

GnuDiff::change *GnuDiff::add_change(GNULineRef line0, GNULineRef line1, GNULineRef deleted, GNULineRef inserted, change *old)
{
    change *newChange = (change *)xmalloc(sizeof(*newChange));

    newChange->line0 = line0;
    newChange->line1 = line1;
    newChange->inserted = inserted;
    newChange->deleted = deleted;
    newChange->link = old;
    return newChange;
}

/* Scan the tables of which lines are inserted and deleted,
   producing an edit script in reverse order. */

GnuDiff::change *GnuDiff::build_reverse_script(file_data const filevec[])
{
    change *script = nullptr;
    bool *changed0 = filevec[0].changed;
    bool *changed1 = filevec[1].changed;
    GNULineRef len0 = filevec[0].buffered_lines;
    GNULineRef len1 = filevec[1].buffered_lines;

    /* Note that changedN[len0] does exist, and is 0. */

    GNULineRef i0 = 0, i1 = 0;

    while(i0 < len0 || i1 < len1)
    {
        if(changed0[i0] | changed1[i1])
        {
            GNULineRef line0 = i0, line1 = i1;

            /* Find # lines changed here in each file. */
            while(changed0[i0]) ++i0;
            while(changed1[i1]) ++i1;

            /* Record this change. */
            script = add_change(line0, line1, i0 - line0, i1 - line1, script);
        }

        /* We have reached lines in the two files that match each other. */
        i0++, i1++;
    }

    return script;
}

/* Scan the tables of which lines are inserted and deleted,
   producing an edit script in forward order. */

GnuDiff::change *GnuDiff::build_script(file_data const filevec[])
{
    change *script = nullptr;
    bool *changed0 = filevec[0].changed;
    bool *changed1 = filevec[1].changed;
    GNULineRef i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;

    /* Note that changedN[-1] does exist, and is 0. */

    while(i0 >= 0 || i1 >= 0)
    {
        if(changed0[i0 - 1] | changed1[i1 - 1])
        {
            GNULineRef line0 = i0, line1 = i1;

            /* Find # lines changed here in each file. */
            while(changed0[i0 - 1]) --i0;
            while(changed1[i1 - 1]) --i1;

            /* Record this change. */
            script = add_change(i0, i1, line0 - i0, line1 - i1, script);
        }

        /* We have reached lines in the two files that match each other. */
        i0--, i1--;
    }

    return script;
}

/* Report the differences of two files. */
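/* The returned script is a chain of `change' records in forward order, as
   produced by build_script above; line0/line1 in each record are origin-0
   indices into the full (buffered) line arrays of the two files. */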
GnuDiff::change *GnuDiff::diff_2_files(comparison *cmp)
{
    GNULineRef diags;
    qint32 f;
    change *script;

    read_files(cmp->file, files_can_be_treated_as_binary);

    {
        /* Allocate vectors for the results of comparison:
           a flag for each line of each file, saying whether that line
           is an insertion or deletion.
           Allocate an extra element, always 0, at each end of each vector. */

        size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
        bool *flag_space = (bool *)zalloc(s * sizeof(*flag_space));
        cmp->file[0].changed = flag_space + 1;
        cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;

        /* Some lines are obviously insertions or deletions
           because they don't match anything.  Detect them now, and
           avoid even thinking about them in the main comparison algorithm. */

        discard_confusing_lines(cmp->file);

        /* Now do the main comparison algorithm, considering just the
           undiscarded lines. */

        xvec = cmp->file[0].undiscarded;
        yvec = cmp->file[1].undiscarded;
        diags = (cmp->file[0].nondiscarded_lines + cmp->file[1].nondiscarded_lines + 3);
        fdiag = (GNULineRef *)xmalloc(diags * (2 * sizeof(*fdiag)));
        bdiag = fdiag + diags;
        fdiag += cmp->file[1].nondiscarded_lines + 1;
        bdiag += cmp->file[1].nondiscarded_lines + 1;

        /* Set TOO_EXPENSIVE to be approximate square root of input size,
           bounded below by 256. */
        too_expensive = 1;
        for(; diags != 0; diags >>= 2)
            too_expensive <<= 1;
        too_expensive = std::max((GNULineRef)256, too_expensive);

        files[0] = cmp->file[0];
        files[1] = cmp->file[1];

        compareseq(0, cmp->file[0].nondiscarded_lines,
                   0, cmp->file[1].nondiscarded_lines, minimal);

        free(fdiag - (cmp->file[1].nondiscarded_lines + 1));

        /* Modify the results slightly to make them prettier
           in cases where that can validly be done. */

        shift_boundaries(cmp->file);

        /* Get the results of comparison in the form of a chain
           of `change's -- an edit script. */

        script = build_script(cmp->file);

        free(cmp->file[0].undiscarded);

        free(flag_space);

        for(f = 0; f < 2; ++f)
        {
            free(cmp->file[f].equivs);
            free(cmp->file[f].linbuf + cmp->file[f].linbuf_base);
        }
    }

    return script;
}