File indexing completed on 2024-10-13 05:04:09
0001 // clang-format off 0002 /* File I/O for GNU DIFF. 0003 0004 Modified for KDiff3 by Joachim Eibl <joachim.eibl at gmx.de> 2003, 2004, 2005. 0005 The original file was part of GNU DIFF. 0006 0007 Part of KDiff3 - Text Diff And Merge Tool 0008 0009 SPDX-FileCopyrightText: 1988-2002 Free Software Foundation, Inc. 0010 SPDX-FileCopyrightText: 2002-2011 Joachim Eibl, joachim.eibl at gmx.de 0011 SPDX-FileCopyrightText: 2018-2020 Michael Reeves reeves.87@gmail.com 0012 SPDX-License-Identifier: GPL-2.0-or-later 0013 */ 0014 // clang-format on 0015 0016 #include "gnudiff_diff.h" 0017 0018 #include "Utils.h" 0019 0020 #include <stdlib.h> 0021 #include <type_traits> 0022 0023 /* Rotate an unsigned value to the left. */ 0024 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof(v) * CHAR_BIT - (n))) 0025 0026 /* Given a hash value and a new character, return a new hash value. */ 0027 #define HASH(h, c) ((c) + ROL(h, 7)) 0028 0029 /* The type of a hash value. */ 0030 typedef size_t hash_value; 0031 static_assert(std::is_unsigned<hash_value>::value, "hash_value must be signed."); 0032 0033 /* Lines are put into equivalence classes of lines that match in lines_differ. 0034 Each equivalence class is represented by one of these structures, 0035 but only while the classes are being computed. 0036 Afterward, each class is represented by a number. */ 0037 struct equivclass { 0038 GNULineRef next; /* Next item in this bucket. */ 0039 hash_value hash; /* Hash of lines in this class. */ 0040 const QChar *line; /* A line that fits this class. */ 0041 size_t length; /* That line's length, not counting its newline. */ 0042 }; 0043 0044 /* Hash-table: array of buckets, each being a chain of equivalence classes. 0045 buckets[-1] is reserved for incomplete lines. */ 0046 static GNULineRef *buckets; 0047 0048 /* Number of buckets in the hash table array, not counting buckets[-1]. */ 0049 static size_t nbuckets; 0050 0051 /* Array in which the equivalence classes are allocated. 0052 The bucket-chains go through the elements in this array. 0053 The number of an equivalence class is its index in this array. */ 0054 static equivclass *equivs; 0055 0056 /* Index of first free element in the array `equivs'. */ 0057 static GNULineRef equivs_index; 0058 0059 /* Number of elements allocated in the array `equivs'. */ 0060 static GNULineRef equivs_alloc; 0061 0062 /* Check for binary files and compare them for exact identity. */ 0063 0064 /* Return 1 if BUF contains a non text character. 0065 SIZE is the number of characters in BUF. */ 0066 0067 #define binary_file_p(buf, size) (memchr(buf, 0, size) != 0) 0068 0069 /* Compare two lines (typically one from each input file) 0070 according to the command line options. 0071 For efficiency, this is invoked only when the lines do not match exactly 0072 but an option like -i might cause us to ignore the difference. 0073 Return nonzero if the lines differ. */ 0074 0075 bool GnuDiff::lines_differ(const QChar *s1, size_t len1, const QChar *s2, size_t len2) 0076 { 0077 const QChar *t1 = s1; 0078 const QChar *t2 = s2; 0079 const QChar *s1end = s1 + len1; 0080 const QChar *s2end = s2 + len2; 0081 0082 for(;; ++t1, ++t2) 0083 { 0084 /* Test for exact char equality first, since it's a common case. */ 0085 if(t1 != s1end && t2 != s2end && *t1 == *t2) 0086 continue; 0087 else 0088 { 0089 while(t1 != s1end && 0090 ((bIgnoreWhiteSpace && isspace(t1->unicode())) || 0091 (bIgnoreNumbers && (t1->isDigit() || *t1 == '-' || *t1 == '.')))) 0092 { 0093 ++t1; 0094 } 0095 0096 while(t2 != s2end && 0097 ((bIgnoreWhiteSpace && isspace(t2->unicode())) || 0098 (bIgnoreNumbers && (t2->isDigit() || *t2 == '-' || *t2 == '.')))) 0099 { 0100 ++t2; 0101 } 0102 0103 if(t1 != s1end && t2 != s2end) 0104 { 0105 if(ignore_case) 0106 { /* Lowercase comparison. */ 0107 if(t1->toLower() == t2->toLower()) 0108 continue; 0109 } 0110 else if(*t1 == *t2) 0111 continue; 0112 else 0113 return true; 0114 } 0115 else if(t1 == s1end && t2 == s2end) 0116 return false; 0117 else 0118 return true; 0119 } 0120 } 0121 return false; 0122 } 0123 0124 /* Split the file into lines, simultaneously computing the equivalence 0125 class for each line. */ 0126 0127 void GnuDiff::find_and_hash_each_line(file_data *current) 0128 { 0129 hash_value h; 0130 const QChar *p = current->prefix_end; 0131 QChar c; 0132 GNULineRef i, *bucket; 0133 size_t length; 0134 0135 /* Cache often-used quantities in local variables to help the compiler. */ 0136 const QChar **linbuf = current->linbuf; 0137 GNULineRef alloc_lines = current->alloc_lines; 0138 GNULineRef line = 0; 0139 GNULineRef linbuf_base = current->linbuf_base; 0140 GNULineRef *cureqs = (GNULineRef *)xmalloc(alloc_lines * sizeof(*cureqs)); 0141 equivclass *eqs = equivs; 0142 GNULineRef eqs_index = equivs_index; 0143 GNULineRef eqs_alloc = equivs_alloc; 0144 const QChar *suffix_begin = current->suffix_begin; 0145 const QChar *bufend = current->buffer + current->buffered; 0146 bool diff_length_compare_anyway = 0147 ignore_white_space != IGNORE_NO_WHITE_SPACE || bIgnoreNumbers; 0148 bool same_length_diff_contents_compare_anyway = 0149 diff_length_compare_anyway | ignore_case; 0150 0151 while(p < suffix_begin) 0152 { 0153 const QChar *ip = p; 0154 0155 h = 0; 0156 0157 /* Hash this line until we find a newline or bufend is reached. */ 0158 if(ignore_case) 0159 switch(ignore_white_space) 0160 { 0161 case IGNORE_ALL_SPACE: 0162 while(p < bufend && !Utils::isEndOfLine(c = *p)) 0163 { 0164 if(!(isspace(c.unicode()) || (bIgnoreNumbers && (c.isDigit() || c == '-' || c == '.')))) 0165 h = HASH(h, c.toLower().unicode()); 0166 ++p; 0167 } 0168 break; 0169 0170 default: 0171 while(p < bufend && !Utils::isEndOfLine(c = *p)) 0172 { 0173 h = HASH(h, c.toLower().unicode()); 0174 ++p; 0175 } 0176 break; 0177 } 0178 else 0179 switch(ignore_white_space) 0180 { 0181 case IGNORE_ALL_SPACE: 0182 while(p < bufend && !Utils::isEndOfLine(c = *p)) 0183 { 0184 if(!(isspace(c.unicode()) || (bIgnoreNumbers && (c.isDigit() || c == '-' || c == '.')))) 0185 h = HASH(h, c.unicode()); 0186 ++p; 0187 } 0188 break; 0189 0190 default: 0191 while(p < bufend && !Utils::isEndOfLine(c = *p)) 0192 { 0193 h = HASH(h, c.unicode()); 0194 ++p; 0195 } 0196 break; 0197 } 0198 0199 bucket = &buckets[h % nbuckets]; 0200 length = p - ip; 0201 ++p; 0202 0203 for(i = *bucket;; i = eqs[i].next) 0204 if(!i) 0205 { 0206 /* Create a new equivalence class in this bucket. */ 0207 i = eqs_index++; 0208 if(i == eqs_alloc) 0209 { 0210 if((GNULineRef)(GNULINEREF_MAX / (2 * sizeof(*eqs))) <= eqs_alloc) 0211 xalloc_die(); 0212 eqs_alloc *= 2; 0213 eqs = (equivclass *)xrealloc(eqs, eqs_alloc * sizeof(*eqs)); 0214 } 0215 eqs[i].next = *bucket; 0216 eqs[i].hash = h; 0217 eqs[i].line = ip; 0218 eqs[i].length = length; 0219 *bucket = i; 0220 break; 0221 } 0222 else if(eqs[i].hash == h) 0223 { 0224 const QChar *eqline = eqs[i].line; 0225 0226 /* Reuse existing class if lines_differ reports the lines 0227 equal. */ 0228 if(eqs[i].length == length) 0229 { 0230 /* Reuse existing equivalence class if the lines are identical. 0231 This detects the common case of exact identity 0232 faster than lines_differ would. */ 0233 if(memcmp(eqline, ip, length * sizeof(QChar)) == 0) 0234 break; 0235 if(!same_length_diff_contents_compare_anyway) 0236 continue; 0237 } 0238 else if(!diff_length_compare_anyway) 0239 continue; 0240 0241 if(!lines_differ(eqline, eqs[i].length, ip, length)) 0242 break; 0243 } 0244 0245 /* Maybe increase the size of the line table. */ 0246 if(line == alloc_lines) 0247 { 0248 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 0249 if((GNULineRef)(GNULINEREF_MAX / 3) <= alloc_lines || (GNULineRef)(GNULINEREF_MAX / sizeof(*cureqs)) <= 2 * alloc_lines - linbuf_base || (GNULineRef)(GNULINEREF_MAX / sizeof(ptrdiff_t)) <= alloc_lines - linbuf_base) 0250 xalloc_die(); 0251 alloc_lines = 2 * alloc_lines - linbuf_base; 0252 cureqs = (GNULineRef *)xrealloc(cureqs, alloc_lines * sizeof(*cureqs)); 0253 linbuf += linbuf_base; 0254 linbuf = (const QChar **)xrealloc(linbuf, 0255 (alloc_lines - linbuf_base) * sizeof(ptrdiff_t)); 0256 linbuf -= linbuf_base; 0257 } 0258 linbuf[line] = ip; 0259 cureqs[line] = i; 0260 ++line; 0261 } 0262 0263 current->buffered_lines = line; 0264 0265 for(i = 0;; ++i) 0266 { 0267 /* Record the line start for lines in the suffix that we care about. 0268 Record one more line start than lines, 0269 so that we can compute the length of any buffered line. */ 0270 if(line == alloc_lines) 0271 { 0272 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 0273 if((GNULineRef)(GNULINEREF_MAX / 3) <= alloc_lines || (GNULineRef)(GNULINEREF_MAX / sizeof(*cureqs)) <= 2 * alloc_lines - linbuf_base || (GNULineRef)(GNULINEREF_MAX / sizeof(ptrdiff_t)) <= alloc_lines - linbuf_base) 0274 xalloc_die(); 0275 alloc_lines = 2 * alloc_lines - linbuf_base; 0276 linbuf += linbuf_base; 0277 linbuf = (const QChar **)xrealloc(linbuf, 0278 (alloc_lines - linbuf_base) * sizeof(ptrdiff_t)); 0279 linbuf -= linbuf_base; 0280 } 0281 linbuf[line] = p; 0282 0283 if(p >= bufend) 0284 break; 0285 0286 if(context <= i && no_diff_means_no_output) 0287 break; 0288 0289 line++; 0290 0291 while(p < bufend && !Utils::isEndOfLine(*p++)) 0292 continue; 0293 } 0294 0295 /* Done with cache in local variables. */ 0296 current->linbuf = linbuf; 0297 current->valid_lines = line; 0298 current->alloc_lines = alloc_lines; 0299 current->equivs = cureqs; 0300 equivs = eqs; 0301 equivs_alloc = eqs_alloc; 0302 equivs_index = eqs_index; 0303 } 0304 0305 /* We have found N lines in a buffer of size S; guess the 0306 proportionate number of lines that will be found in a buffer of 0307 size T. However, do not guess a number of lines so large that the 0308 resulting line table might cause overflow in size calculations. */ 0309 GNULineRef 0310 GnuDiff::guess_lines(GNULineRef n, size_t s, size_t t) 0311 { 0312 size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1); 0313 size_t guessed_lines = std::max((size_t)1, t / guessed_bytes_per_line); 0314 return (GNULineRef)std::min((GNULineRef)guessed_lines, (GNULineRef)(GNULINEREF_MAX / (2 * sizeof(ptrdiff_t) + 1) - 5)) + 5; 0315 } 0316 0317 /* Given a vector of two file_data objects, find the identical 0318 prefixes and suffixes of each object. */ 0319 0320 void GnuDiff::find_identical_ends(file_data filevec[]) 0321 { 0322 /* Find identical prefix. */ 0323 const QChar *p0, *p1, *buffer0, *buffer1; 0324 p0 = buffer0 = filevec[0].buffer; 0325 p1 = buffer1 = filevec[1].buffer; 0326 size_t n0, n1; 0327 n0 = filevec[0].buffered; 0328 n1 = filevec[1].buffered; 0329 const QChar *const pEnd0 = p0 + n0; 0330 const QChar *const pEnd1 = p1 + n1; 0331 0332 if(p0 == p1) 0333 /* The buffers are the same; sentinels won't work. */ 0334 p0 = p1 += n1; 0335 else 0336 { 0337 /* Loop until first mismatch, or end. */ 0338 while(p0 != pEnd0 && p1 != pEnd1 && *p0 == *p1) 0339 { 0340 p0++; 0341 p1++; 0342 } 0343 } 0344 0345 /* Now P0 and P1 point at the first nonmatching characters. */ 0346 0347 /* Skip back to last line-beginning in the prefix. */ 0348 while(p0 != buffer0 && !Utils::isEndOfLine(p0[-1])) 0349 p0--, p1--; 0350 0351 /* Record the prefix. */ 0352 filevec[0].prefix_end = p0; 0353 filevec[1].prefix_end = p1; 0354 0355 /* Find identical suffix. */ 0356 0357 /* P0 and P1 point beyond the last chars not yet compared. */ 0358 p0 = buffer0 + n0; 0359 p1 = buffer1 + n1; 0360 0361 const QChar *end0, *beg0; 0362 end0 = p0; /* Addr of last char in file 0. */ 0363 0364 /* Get value of P0 at which we should stop scanning backward: 0365 this is when either P0 or P1 points just past the last char 0366 of the identical prefix. */ 0367 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1); 0368 0369 /* Scan back until chars don't match or we reach that point. */ 0370 for(; p0 != beg0; p0--, p1--) 0371 { 0372 if(*p0 != *p1) 0373 { 0374 /* Point at the first char of the matching suffix. */ 0375 beg0 = p0; 0376 break; 0377 } 0378 } 0379 0380 // Go to the next line (skip last line with a difference) 0381 if(p0 != end0) 0382 { 0383 if(*p0 != *p1) 0384 ++p0; 0385 while(p0 < pEnd0 && !Utils::isEndOfLine(*p0++)) 0386 continue; 0387 } 0388 0389 p1 += p0 - beg0; 0390 0391 /* Record the suffix. */ 0392 filevec[0].suffix_begin = p0; 0393 filevec[1].suffix_begin = p1; 0394 0395 /* Calculate number of lines of prefix to save. 0396 0397 prefix_count == 0 means save the whole prefix; 0398 we need this for options like -D that output the whole file, 0399 or for enormous contexts (to avoid worrying about arithmetic overflow). 0400 We also need it for options like -F that output some preceding line; 0401 at least we will need to find the last few lines, 0402 but since we don't know how many, it's easiest to find them all. 0403 0404 Otherwise, prefix_count != 0. Save just prefix_count lines at start 0405 of the line buffer; they'll be moved to the proper location later. 0406 Handle 1 more line than the context says (because we count 1 too many), 0407 rounded up to the next power of 2 to speed index computation. */ 0408 0409 const QChar **linbuf0, **linbuf1; 0410 GNULineRef alloc_lines0, alloc_lines1; 0411 GNULineRef buffered_prefix, prefix_count, prefix_mask; 0412 GNULineRef middle_guess, suffix_guess; 0413 if(no_diff_means_no_output && context < (GNULineRef)(GNULINEREF_MAX / 4) && context < (GNULineRef)(n0)) 0414 { 0415 middle_guess = guess_lines(0, 0, p0 - filevec[0].prefix_end); 0416 suffix_guess = guess_lines(0, 0, buffer0 + n0 - p0); 0417 for(prefix_count = 1; prefix_count <= context; prefix_count *= 2) 0418 continue; 0419 alloc_lines0 = (prefix_count + middle_guess + std::min(context, suffix_guess)); 0420 } 0421 else 0422 { 0423 prefix_count = 0; 0424 alloc_lines0 = guess_lines(0, 0, n0); 0425 } 0426 0427 prefix_mask = prefix_count - 1; 0428 GNULineRef lines = 0; 0429 linbuf0 = (const QChar **)xmalloc(alloc_lines0 * sizeof(ptrdiff_t)); 0430 p0 = buffer0; 0431 0432 /* If the prefix is needed, find the prefix lines. */ 0433 if(!(no_diff_means_no_output && filevec[0].prefix_end == p0 && filevec[1].prefix_end == p1)) 0434 { 0435 end0 = filevec[0].prefix_end; 0436 while(p0 != end0) 0437 { 0438 GNULineRef l = lines++ & prefix_mask; 0439 if(l == alloc_lines0) 0440 { 0441 if((GNULineRef)(GNULINEREF_MAX / (2 * sizeof(ptrdiff_t))) <= alloc_lines0) 0442 xalloc_die(); 0443 alloc_lines0 *= 2; 0444 linbuf0 = (const QChar **)xrealloc(linbuf0, alloc_lines0 * sizeof(ptrdiff_t)); 0445 } 0446 linbuf0[l] = p0; 0447 while(p0 < pEnd0 && !Utils::isEndOfLine(*p0++)) 0448 continue; 0449 } 0450 } 0451 buffered_prefix = prefix_count && context < lines ? context : lines; 0452 0453 /* Allocate line buffer 1. */ 0454 0455 middle_guess = guess_lines(lines, p0 - buffer0, p1 - filevec[1].prefix_end); 0456 suffix_guess = guess_lines(lines, p0 - buffer0, buffer1 + n1 - p1); 0457 alloc_lines1 = buffered_prefix + middle_guess + std::min(context, suffix_guess); 0458 if(alloc_lines1 < buffered_prefix || (GNULineRef)(GNULINEREF_MAX / sizeof(ptrdiff_t)) <= alloc_lines1) 0459 xalloc_die(); 0460 linbuf1 = (const QChar **)xmalloc(alloc_lines1 * sizeof(ptrdiff_t)); 0461 0462 GNULineRef i; 0463 if(buffered_prefix != lines) 0464 { 0465 /* Rotate prefix lines to proper location. */ 0466 for(i = 0; i < buffered_prefix; ++i) 0467 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask]; 0468 for(i = 0; i < buffered_prefix; ++i) 0469 linbuf0[i] = linbuf1[i]; 0470 } 0471 0472 /* Initialize line buffer 1 from line buffer 0. */ 0473 for(i = 0; i < buffered_prefix; ++i) 0474 linbuf1[i] = linbuf0[i] - buffer0 + buffer1; 0475 0476 /* Record the line buffer, adjusted so that 0477 linbuf[0] points at the first differing line. */ 0478 filevec[0].linbuf = linbuf0 + buffered_prefix; 0479 filevec[1].linbuf = linbuf1 + buffered_prefix; 0480 filevec[0].linbuf_base = filevec[1].linbuf_base = -buffered_prefix; 0481 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix; 0482 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix; 0483 filevec[0].prefix_lines = filevec[1].prefix_lines = lines; 0484 } 0485 0486 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less 0487 than 2**k. This table is derived from Chris K. Caldwell's list 0488 <https://www.utm.edu/research/primes/lists/2small/>. */ 0489 0490 static unsigned char const prime_offset[] = 0491 { 0492 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3, 0493 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21, 0494 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27, 0495 55, 93, 1, 57, 25}; 0496 0497 /* Verify that this host's size_t is not too wide for the above table. */ 0498 0499 static_assert(sizeof(size_t) * CHAR_BIT <= sizeof prime_offset, "Not enough primes in table"); 0500 0501 /* Given a vector of two file_data objects, read the file associated 0502 with each one, and build the table of equivalence classes. 0503 Return nonzero if either file appears to be a binary file. 0504 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */ 0505 0506 bool GnuDiff::read_files(file_data filevec[], bool /*pretend_binary*/) 0507 { 0508 GNULineRef i; 0509 0510 find_identical_ends(filevec); 0511 0512 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1; 0513 if((GNULineRef)(GNULINEREF_MAX / sizeof(*equivs)) <= equivs_alloc) 0514 xalloc_die(); 0515 equivs = (equivclass *)xmalloc(equivs_alloc * sizeof(*equivs)); 0516 /* Equivalence class 0 is permanently safe for lines that were not 0517 hashed. Real equivalence classes start at 1. */ 0518 equivs_index = 1; 0519 0520 /* Allocate (one plus) a prime number of hash buckets. Use a prime 0521 number between 1/3 and 2/3 of the value of equiv_allocs, 0522 approximately. */ 0523 for(i = 9; ((GNULineRef)1 << i) < equivs_alloc / 3; ++i) 0524 continue; 0525 nbuckets = ((GNULineRef)1 << i) - prime_offset[i]; 0526 if(GNULINEREF_MAX / sizeof(*buckets) <= nbuckets) 0527 xalloc_die(); 0528 buckets = (GNULineRef *)zalloc((nbuckets + 1) * sizeof(*buckets)); 0529 buckets++; 0530 0531 for(i = 0; i < 2; ++i) 0532 find_and_hash_each_line(&filevec[i]); 0533 0534 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index; 0535 0536 free(equivs); 0537 free(buckets - 1); 0538 0539 return false; 0540 }