// clang-format off
/* Analyze file differences for GNU DIFF.

   Modified for KDiff3 by Joachim Eibl <joachim.eibl at gmx.de> 2003.
   The original file was part of GNU DIFF.

   Part of KDiff3 - Text Diff And Merge Tool

   SPDX-FileCopyrightText: 1988-2002 Free Software Foundation, Inc.
   SPDX-FileCopyrightText: 2002-2011 Joachim Eibl, joachim.eibl at gmx.de
   SPDX-FileCopyrightText: 2018-2020 Michael Reeves reeves.87@gmail.com
   SPDX-License-Identifier: GPL-2.0-or-later
*/

/* The basic algorithm is described in:
   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
   Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
   see especially section 4.2, which describes the variation used below.
   Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
   at the price of producing suboptimal output for large inputs with
   many differences.

   The basic algorithm was independently discovered as described in:
   "Algorithms for Approximate String Matching", E. Ukkonen,
   Information and Control Vol. 64, 1985, pp. 100-118. */
// clang-format on

#define GDIFF_MAIN

#include "gnudiff_diff.h"

#include <algorithm> // for max, min
#include <cassert>   // for assert
#include <stdlib.h>

static GNULineRef *xvec, *yvec;  /* Vectors being compared. */
static GNULineRef *fdiag;        /* Vector, indexed by diagonal, containing
                                    1 + the X coordinate of the point furthest
                                    along the given diagonal in the forward
                                    search of the edit matrix. */
static GNULineRef *bdiag;        /* Vector, indexed by diagonal, containing
                                    the X coordinate of the point furthest
                                    along the given diagonal in the backward
                                    search of the edit matrix. */
static GNULineRef too_expensive; /* Edit scripts longer than this are too
                                    expensive to compute. */

#define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'. */

struct partition {
    GNULineRef xmid, ymid; /* Midpoints of this partition. */
    bool lo_minimal;       /* True if low half will be analyzed minimally. */
    bool hi_minimal;       /* Likewise for high half. */
};

/* Find the midpoint of the shortest edit script for a specified
   portion of the two files.

   Scan from the beginnings of the files, and simultaneously from the ends,
   doing a breadth-first search through the space of edit-sequence.
   When the two searches meet, we have found the midpoint of the shortest
   edit sequence.

   If FIND_MINIMAL is nonzero, find the minimal edit script regardless
   of expense.  Otherwise, if the search is too expensive, use
   heuristics to stop the search and report a suboptimal answer.

   Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
   XMID - YMID equals the number of inserted lines minus the number
   of deleted lines (counting only lines before the midpoint).
   Return the approximate edit cost; this is the total number of
   lines inserted or deleted (counting only lines before the midpoint),
   unless a heuristic is used to terminate the search prematurely.

   Set PART->lo_minimal to true iff the minimal edit script for the
   left half of the partition is known; similarly for PART->hi_minimal.

   This function assumes that the first lines of the specified portions
   of the two files do not match, and likewise that the last lines do not
   match.  The caller must trim matching lines from the beginning and end
   of the portions it is going to specify.

   If we return the "wrong" partitions,
   the worst this can do is cause suboptimal diff output.
   It cannot cause incorrect diff output. */

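/* Note on indexing: diagonal number d identifies the points (x, y) with
   x - y == d, so d ranges roughly from -(number of lines of file 1) to
   +(number of lines of file 0), and the search below also touches d - 1 and
   d + 1 just outside that range.  diff_2_files therefore allocates fdiag and
   bdiag with room for both files plus three extra entries and offsets both
   pointers by file 1's nondiscarded line count + 1, which turns these
   negative diagonals into valid array indices. */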
GNULineRef GnuDiff::diag(GNULineRef xoff, GNULineRef xlim, GNULineRef yoff, GNULineRef ylim, bool find_minimal,
                         partition *part) const
{
    GNULineRef *const fd = fdiag;        /* Give the compiler a chance. */
    GNULineRef *const bd = bdiag;        /* Additional help for the compiler. */
    GNULineRef const *const xv = xvec;   /* Still more help for the compiler. */
    GNULineRef const *const yv = yvec;   /* And more and more . . . */
    GNULineRef const dmin = xoff - ylim; /* Minimum valid diagonal. */
    GNULineRef const dmax = xlim - yoff; /* Maximum valid diagonal. */
    GNULineRef const fmid = xoff - yoff; /* Center diagonal of top-down search. */
    GNULineRef const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
    GNULineRef fmin = fmid, fmax = fmid; /* Limits of top-down search. */
    GNULineRef bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */
    GNULineRef c;                        /* Cost. */
    bool odd = (fmid - bmid) & 1;        /* True if southeast corner is on an odd
                                            diagonal with respect to the northwest. */

    fd[fmid] = xoff;
    bd[bmid] = xlim;

    for(c = 1;; ++c)
    {
        GNULineRef d; /* Active diagonal. */
        bool big_snake = false;

        /* Extend the top-down search by an edit step in each diagonal. */
        fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
        fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
        for(d = fmax; d >= fmin; d -= 2)
        {
            GNULineRef x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];

            if(tlo >= thi)
                x = tlo + 1;
            else
                x = thi;
            oldx = x;
            y = x - d;
            while(x < xlim && y < ylim && xv[x] == yv[y])
                ++x, ++y;
            if(x - oldx > SNAKE_LIMIT)
                big_snake = true;
            fd[d] = x;
            if(odd && bmin <= d && d <= bmax && bd[d] <= x)
            {
                part->xmid = x;
                part->ymid = y;
                part->lo_minimal = part->hi_minimal = true;
                return 2 * c - 1;
            }
        }

        /* Similarly extend the bottom-up search. */
        bmin > dmin ? bd[--bmin - 1] = GNULINEREF_MAX : ++bmin;
        bmax < dmax ? bd[++bmax + 1] = GNULINEREF_MAX : --bmax;
        for(d = bmax; d >= bmin; d -= 2)
        {
            GNULineRef x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];

            if(tlo < thi)
                x = tlo;
            else
                x = thi - 1;
            oldx = x;
            y = x - d;
            while(x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
                --x, --y;
            if(oldx - x > SNAKE_LIMIT)
                big_snake = true;
            bd[d] = x;
            if(!odd && fmin <= d && d <= fmax && x <= fd[d])
            {
                part->xmid = x;
                part->ymid = y;
                part->lo_minimal = part->hi_minimal = true;
                return 2 * c;
            }
        }

        if(find_minimal)
            continue;

        /* Heuristic: check occasionally for a diagonal that has made
           lots of progress compared with the edit distance.
           If we have any such, find the one that has made the most
           progress and return it as if it had succeeded.

           With this heuristic, for files with a constant small density
           of changes, the algorithm is linear in the file size. */
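        /* In the test below, v is the total x + y progress already made along
           diagonal d, c counts the edit steps taken so far by the forward
           search, and |dd| penalizes diagonals far from the centre; the
           thresholds 200 and 12 are tuning constants inherited from GNU diff. */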

        if(200 < c && big_snake && speed_large_files)
        {
            GNULineRef best;

            best = 0;
            for(d = fmax; d >= fmin; d -= 2)
            {
                GNULineRef dd = d - fmid;
                GNULineRef x = fd[d];
                GNULineRef y = x - d;
                GNULineRef v = (x - xoff) * 2 - dd;
                if(v > 12 * (c + (dd < 0 ? -dd : dd)))
                {
                    if(v > best && xoff + SNAKE_LIMIT <= x && x < xlim && yoff + SNAKE_LIMIT <= y && y < ylim)
                    {
                        /* We have a good enough best diagonal;
                           now insist that it end with a significant snake. */
                        qint32 k;

                        for(k = 1; xv[x - k] == yv[y - k]; k++)
                            if(k == SNAKE_LIMIT)
                            {
                                best = v;
                                part->xmid = x;
                                part->ymid = y;
                                break;
                            }
                    }
                }
            }
            if(best > 0)
            {
                part->lo_minimal = true;
                part->hi_minimal = false;
                return 2 * c - 1;
            }

            best = 0;
            for(d = bmax; d >= bmin; d -= 2)
            {
                GNULineRef dd = d - bmid;
                GNULineRef x = bd[d];
                GNULineRef y = x - d;
                GNULineRef v = (xlim - x) * 2 + dd;
                if(v > 12 * (c + (dd < 0 ? -dd : dd)))
                {
                    if(v > best && xoff < x && x <= xlim - SNAKE_LIMIT && yoff < y && y <= ylim - SNAKE_LIMIT)
                    {
                        /* We have a good enough best diagonal;
                           now insist that it end with a significant snake. */
                        qint32 k;

                        for(k = 0; xv[x + k] == yv[y + k]; k++)
                            if(k == SNAKE_LIMIT - 1)
                            {
                                best = v;
                                part->xmid = x;
                                part->ymid = y;
                                break;
                            }
                    }
                }
            }
            if(best > 0)
            {
                part->lo_minimal = false;
                part->hi_minimal = true;
                return 2 * c - 1;
            }
        }

        /* Heuristic: if we've gone well beyond the call of duty,
           give up and report halfway between our best results so far. */
        if(c >= too_expensive)
        {
            GNULineRef fxybest, fxbest;
            GNULineRef bxybest, bxbest;

            fxbest = bxbest = 0; /* Pacify `gcc -Wall'. */

            /* Find forward diagonal that maximizes X + Y. */
            fxybest = -1;
            for(d = fmax; d >= fmin; d -= 2)
            {
                GNULineRef x = std::min(fd[d], xlim);
                GNULineRef y = x - d;
                if(ylim < y)
                    x = ylim + d, y = ylim;
                if(fxybest < x + y)
                {
                    fxybest = x + y;
                    fxbest = x;
                }
            }

            /* Find backward diagonal that minimizes X + Y. */
            bxybest = GNULINEREF_MAX;
            for(d = bmax; d >= bmin; d -= 2)
            {
                GNULineRef x = std::max(xoff, bd[d]);
                GNULineRef y = x - d;
                if(y < yoff)
                    x = yoff + d, y = yoff;
                if(x + y < bxybest)
                {
                    bxybest = x + y;
                    bxbest = x;
                }
            }

            /* Use the better of the two diagonals. */
            if((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
            {
                part->xmid = fxbest;
                part->ymid = fxybest - fxbest;
                part->lo_minimal = true;
                part->hi_minimal = false;
            }
            else
            {
                part->xmid = bxbest;
                part->ymid = bxybest - bxbest;
                part->lo_minimal = false;
                part->hi_minimal = true;
            }
            return 2 * c - 1;
        }
    }
}

/* Compare in detail contiguous subsequences of the two files
   which are known, as a whole, to match each other.

   The results are recorded in the vectors files[N].changed, by
   storing 1 in the element for each line that is an insertion or deletion.

   The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.

   Note that XLIM, YLIM are exclusive bounds.
   All line numbers are origin-0 and discarded lines are not counted.

   If FIND_MINIMAL, find a minimal difference no matter how
   expensive it is. */

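/* XOFF/XLIM and YOFF/YLIM index the compacted `undiscarded' line arrays built
   by discard_confusing_lines; the realindexes tables translate those virtual
   positions back to real line numbers when the changed flags are set. */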
void GnuDiff::compareseq(GNULineRef xoff, GNULineRef xlim, GNULineRef yoff, GNULineRef ylim, bool find_minimal)
{
    GNULineRef *const xv = xvec; /* Help the compiler. */
    GNULineRef *const yv = yvec;

    /* Slide down the bottom initial diagonal. */
    while(xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
        ++xoff, ++yoff;
    /* Slide up the top initial diagonal. */
    while(xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
        --xlim, --ylim;

    /* Handle simple cases. */
    if(xoff == xlim)
        while(yoff < ylim)
            files[1].changed[files[1].realindexes[yoff++]] = true;
    else if(yoff == ylim)
        while(xoff < xlim)
            files[0].changed[files[0].realindexes[xoff++]] = true;
    else
    {
        GNULineRef c;
        partition part;

        /* Find a point of correspondence in the middle of the files. */

        c = diag(xoff, xlim, yoff, ylim, find_minimal, &part);

        /* This should be impossible, because it implies that
           one of the two subsequences is empty,
           and that case was handled above without calling `diag'. */
        assert(c != 1);

        /* Use the partitions to split this problem into subproblems. */
        compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
        compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
    }
}

/* Discard lines from one file that have no matches in the other file.

   A line which is discarded will not be considered by the actual
   comparison algorithm; it will be as if that line were not in the file.
   The file's `realindexes' table maps virtual line numbers
   (which don't count the discarded lines) into real line numbers;
   this is how the actual comparison algorithm produces results
   that are comprehensible when the discarded lines are counted.

   When we discard a line, we also mark it as a deletion or insertion
   so that it will be printed in the output. */

void GnuDiff::discard_confusing_lines(file_data filevec[])
{
    qint32 f;
    GNULineRef i;
    char *discarded[2];
    GNULineRef *equiv_count[2];
    GNULineRef *p;

    /* Allocate our results. */
    p = (GNULineRef *)xmalloc((filevec[0].buffered_lines + filevec[1].buffered_lines) * (2 * sizeof(*p)));
    for(f = 0; f < 2; ++f)
    {
        filevec[f].undiscarded = p;
        p += filevec[f].buffered_lines;
        filevec[f].realindexes = p;
        p += filevec[f].buffered_lines;
    }

    /* Set up equiv_count[F][I] as the number of lines in file F
       that fall in equivalence class I. */

    p = (GNULineRef *)zalloc(filevec[0].equiv_max * (2 * sizeof(*p)));
    equiv_count[0] = p;
    equiv_count[1] = p + filevec[0].equiv_max;

    for(i = 0; i < filevec[0].buffered_lines; ++i)
        ++equiv_count[0][filevec[0].equivs[i]];
    for(i = 0; i < filevec[1].buffered_lines; ++i)
        ++equiv_count[1][filevec[1].equivs[i]];

    /* Set up tables of which lines are going to be discarded. */

    discarded[0] = (char *)zalloc(filevec[0].buffered_lines + filevec[1].buffered_lines);
    discarded[1] = discarded[0] + filevec[0].buffered_lines;

    /* Mark to be discarded each line that matches no line of the other file.
       If a line matches many lines, mark it as provisionally discardable. */
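    /* Illustrative numbers: for a 40000-line file, end / 64 is 625 and the
       loop below doubles MANY four times, so a line becomes provisionally
       discardable once it matches more than 80 lines of the other file. */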

    for(f = 0; f < 2; ++f)
    {
        size_t end = filevec[f].buffered_lines;
        char *discards = discarded[f];
        GNULineRef *counts = equiv_count[1 - f];
        GNULineRef *equivs = filevec[f].equivs;
        size_t many = 5;
        size_t tem = end / 64;

        /* Multiply MANY by approximate square root of number of lines.
           That is the threshold for provisionally discardable lines. */
        while((tem = tem >> 2) > 0)
            many *= 2;

        for(i = 0; i < (GNULineRef)end; ++i)
        {
            GNULineRef nmatch;
            if(equivs[i] == 0)
                continue;
            nmatch = counts[equivs[i]];
            if(nmatch == 0)
                discards[i] = 1;
            else if(nmatch > (GNULineRef)many)
                discards[i] = 2;
        }
    }

    /* Don't really discard the provisional lines except when they occur
       in a run of discardables, with nonprovisionals at the beginning
       and end. */

    for(f = 0; f < 2; ++f)
    {
        GNULineRef end = filevec[f].buffered_lines;
        char *discards = discarded[f];

        for(i = 0; i < end; ++i)
        {
            /* Cancel provisional discards not in middle of run of discards. */
            if(discards[i] == 2)
                discards[i] = 0;
            else if(discards[i] != 0)
            {
                /* We have found a nonprovisional discard. */
                GNULineRef j;
                GNULineRef length;
                GNULineRef provisional = 0;

                /* Find end of this run of discardable lines.
                   Count how many are provisionally discardable. */
                for(j = i; j < end; ++j)
                {
                    if(discards[j] == 0)
                        break;
                    if(discards[j] == 2)
                        ++provisional;
                }

                /* Cancel provisional discards at end, and shrink the run. */
                while(j > i && discards[j - 1] == 2)
                    discards[--j] = 0, --provisional;

                /* Now we have the length of a run of discardable lines
                   whose first and last are not provisional. */
                length = j - i;

                /* If more than 1/4 of the lines in the run are provisional,
                   cancel discarding of all provisional lines in the run. */
                if(provisional * 4 > length)
                {
                    while(j > i)
                        if(discards[--j] == 2)
                            discards[j] = 0;
                }
                else
                {
                    GNULineRef consec;
                    GNULineRef minimum = 1;
                    GNULineRef tem = length >> 2;

                    /* MINIMUM is approximate square root of LENGTH/4.
                       A subrun of two or more provisionals can stand
                       when LENGTH is at least 16.
                       A subrun of 4 or more can stand when LENGTH >= 64. */
                    while(0 < (tem >>= 2))
                        minimum <<= 1;
                    minimum++;

                    /* Cancel any subrun of MINIMUM or more provisionals
                       within the larger run. */
                    for(j = 0, consec = 0; j < length; ++j)
                        if(discards[i + j] != 2)
                            consec = 0;
                        else if(minimum == ++consec)
                            /* Back up to start of subrun, to cancel it all. */
                            j -= consec;
                        else if(minimum < consec)
                            discards[i + j] = 0;

                    /* Scan from beginning of run
                       until we find 3 or more nonprovisionals in a row
                       or until the first nonprovisional at least 8 lines in.
                       Until that point, cancel any provisionals. */
                    for(j = 0, consec = 0; j < length; ++j)
                    {
                        if(j >= 8 && discards[i + j] == 1)
                            break;
                        if(discards[i + j] == 2)
                            consec = 0, discards[i + j] = 0;
                        else if(discards[i + j] == 0)
                            consec = 0;
                        else
                            consec++;
                        if(consec == 3)
                            break;
                    }

                    /* I advances to the last line of the run. */
                    i += length - 1;

                    /* Same thing, from end. */
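                    /* Here i is the last line of the run, so discards[i - j]
                       below walks the same run backwards. */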
                    for(j = 0, consec = 0; j < length; ++j)
                    {
                        if(j >= 8 && discards[i - j] == 1)
                            break;
                        if(discards[i - j] == 2)
                            consec = 0, discards[i - j] = 0;
                        else if(discards[i - j] == 0)
                            consec = 0;
                        else
                            consec++;
                        if(consec == 3)
                            break;
                    }
                }
            }
        }
    }

    /* Actually discard the lines. */
    for(f = 0; f < 2; ++f)
    {
        char *discards = discarded[f];
        GNULineRef end = filevec[f].buffered_lines;
        GNULineRef j = 0;
        for(i = 0; i < end; ++i)
            if(minimal || discards[i] == 0)
            {
                filevec[f].undiscarded[j] = filevec[f].equivs[i];
                filevec[f].realindexes[j++] = i;
            }
            else
                filevec[f].changed[i] = true;
        filevec[f].nondiscarded_lines = j;
    }

    free(discarded[0]);
    free(equiv_count[0]);
}

/* Adjust inserts/deletes of identical lines to join changes
   as much as possible.

   We do something when a run of changed lines includes a
   line at one end and has an excluded, identical line at the other.
   We are free to choose which identical line is included.
   `compareseq' usually chooses the one at the beginning,
   but it is usually cleaner to consider the following identical line
   to be the "change". */

void GnuDiff::shift_boundaries(file_data filevec[])
{
    qint32 f;

    for(f = 0; f < 2; ++f)
    {
        bool *changed = filevec[f].changed;
        bool const *other_changed = filevec[1 - f].changed;
        GNULineRef const *equivs = filevec[f].equivs;
        GNULineRef i = 0;
        GNULineRef j = 0;
        GNULineRef i_end = filevec[f].buffered_lines;

        while(true)
        {
            GNULineRef runlength, start, corresponding;

            /* Scan forwards to find beginning of another run of changes.
               Also keep track of the corresponding point in the other file. */

            while(i < i_end && !changed[i])
            {
                while(other_changed[j++])
                    continue;
                i++;
            }

            if(i == i_end)
                break;

            start = i;

            /* Find the end of this run of changes. */

            while(changed[++i])
                continue;
            while(other_changed[j])
                j++;

            do
            {
                /* Record the length of this run of changes, so that
                   we can later determine whether the run has grown. */
                runlength = i - start;

                /* Move the changed region back, so long as the
                   previous unchanged line matches the last changed one.
                   This merges with previous changed regions. */

                while(start && equivs[start - 1] == equivs[i - 1])
                {
                    changed[--start] = true;
                    changed[--i] = false;
                    while(changed[start - 1])
                        start--;
                    while(other_changed[--j])
                        continue;
                }

                /* Set CORRESPONDING to the end of the changed run, at the last
                   point where it corresponds to a changed run in the other file.
                   CORRESPONDING == I_END means no such point has been found. */
                corresponding = other_changed[j - 1] ? i : i_end;

                /* Move the changed region forward, so long as the
                   first changed line matches the following unchanged one.
                   This merges with following changed regions.
                   Do this second, so that if there are no merges,
                   the changed region is moved forward as far as possible. */
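                /* For example, comparing  a b b c  against  a b c : if the run
                   of changes currently covers the first `b', the first changed
                   line equals the line just after the run, so the loop below
                   slides the change onto the second `b'. */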

                while(i != i_end && equivs[start] == equivs[i])
                {
                    changed[start++] = false;
                    changed[i++] = true;
                    while(changed[i])
                        i++;
                    while(other_changed[++j])
                        corresponding = i;
                }
            } while(runlength != i - start);

            /* If possible, move the fully-merged run of changes
               back to a corresponding run in the other file. */

            while(corresponding < i)
            {
                changed[--start] = true;
                changed[--i] = false;
                while(other_changed[--j])
                    continue;
            }
        }
    }
}

/* Cons an additional entry onto the front of an edit script OLD.
   LINE0 and LINE1 are the first affected lines in the two files (origin 0).
   DELETED is the number of lines deleted here from file 0.
   INSERTED is the number of lines inserted here in file 1.

   If DELETED is 0 then LINE0 is the number of the line before
   which the insertion was done; vice versa for INSERTED and LINE1. */

GnuDiff::change *GnuDiff::add_change(GNULineRef line0, GNULineRef line1, GNULineRef deleted, GNULineRef inserted, change *old)
{
    change *newChange = (change *)xmalloc(sizeof(*newChange));

    newChange->line0 = line0;
    newChange->line1 = line1;
    newChange->inserted = inserted;
    newChange->deleted = deleted;
    newChange->link = old;
    return newChange;
}

/* Scan the tables of which lines are inserted and deleted,
   producing an edit script in reverse order. */

GnuDiff::change *GnuDiff::build_reverse_script(file_data const filevec[])
{
    change *script = nullptr;
    bool *changed0 = filevec[0].changed;
    bool *changed1 = filevec[1].changed;
    GNULineRef len0 = filevec[0].buffered_lines;
    GNULineRef len1 = filevec[1].buffered_lines;

    /* Note that changedN[len0] does exist, and is 0. */

    GNULineRef i0 = 0, i1 = 0;

    while(i0 < len0 || i1 < len1)
    {
        if(changed0[i0] | changed1[i1])
        {
            GNULineRef line0 = i0, line1 = i1;

            /* Find # lines changed here in each file. */
            while(changed0[i0]) ++i0;
            while(changed1[i1]) ++i1;

            /* Record this change. */
            script = add_change(line0, line1, i0 - line0, i1 - line1, script);
        }

        /* We have reached lines in the two files that match each other. */
        i0++, i1++;
    }

    return script;
}

/* Scan the tables of which lines are inserted and deleted,
   producing an edit script in forward order. */

GnuDiff::change *GnuDiff::build_script(file_data const filevec[])
{
    change *script = nullptr;
    bool *changed0 = filevec[0].changed;
    bool *changed1 = filevec[1].changed;
    GNULineRef i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;

    /* Note that changedN[-1] does exist, and is 0. */

    while(i0 >= 0 || i1 >= 0)
    {
        if(changed0[i0 - 1] | changed1[i1 - 1])
        {
            GNULineRef line0 = i0, line1 = i1;

            /* Find # lines changed here in each file. */
            while(changed0[i0 - 1]) --i0;
            while(changed1[i1 - 1]) --i1;

            /* Record this change. */
            script = add_change(i0, i1, line0 - i0, line1 - i1, script);
        }

        /* We have reached lines in the two files that match each other. */
        i0--, i1--;
    }

    return script;
}

/* Report the differences of two files. */
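/* The returned script is a chain of `change' records in forward order, as
   produced by build_script above; line0/line1 in each record are origin-0
   indices into the full (buffered) line arrays of the two files. */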
GnuDiff::change *GnuDiff::diff_2_files(comparison *cmp)
{
    GNULineRef diags;
    qint32 f;
    change *script;

    read_files(cmp->file, files_can_be_treated_as_binary);

    {
        /* Allocate vectors for the results of comparison:
           a flag for each line of each file, saying whether that line
           is an insertion or deletion.
           Allocate an extra element, always 0, at each end of each vector. */

        size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
        bool *flag_space = (bool *)zalloc(s * sizeof(*flag_space));
        cmp->file[0].changed = flag_space + 1;
        cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;

        /* Some lines are obviously insertions or deletions
           because they don't match anything.  Detect them now, and
           avoid even thinking about them in the main comparison algorithm. */

        discard_confusing_lines(cmp->file);

        /* Now do the main comparison algorithm, considering just the
           undiscarded lines. */

        xvec = cmp->file[0].undiscarded;
        yvec = cmp->file[1].undiscarded;
        diags = (cmp->file[0].nondiscarded_lines + cmp->file[1].nondiscarded_lines + 3);
        fdiag = (GNULineRef *)xmalloc(diags * (2 * sizeof(*fdiag)));
        bdiag = fdiag + diags;
        fdiag += cmp->file[1].nondiscarded_lines + 1;
        bdiag += cmp->file[1].nondiscarded_lines + 1;

        /* Set TOO_EXPENSIVE to be approximate square root of input size,
           bounded below by 256. */
        too_expensive = 1;
        for(; diags != 0; diags >>= 2)
            too_expensive <<= 1;
        too_expensive = std::max((GNULineRef)256, too_expensive);

        files[0] = cmp->file[0];
        files[1] = cmp->file[1];

        compareseq(0, cmp->file[0].nondiscarded_lines,
                   0, cmp->file[1].nondiscarded_lines, minimal);

        free(fdiag - (cmp->file[1].nondiscarded_lines + 1));

        /* Modify the results slightly to make them prettier
           in cases where that can validly be done. */

        shift_boundaries(cmp->file);

        /* Get the results of comparison in the form of a chain
           of `change's -- an edit script. */

        script = build_script(cmp->file);

        free(cmp->file[0].undiscarded);

        free(flag_space);

        for(f = 0; f < 2; ++f)
        {
            free(cmp->file[f].equivs);
            free(cmp->file[f].linbuf + cmp->file[f].linbuf_base);
        }
    }

    return script;
}