// File indexing completed on 2024-10-27 05:18:19
0001 /* 0002 SPDX-FileCopyrightText: 2007 Andreas Pakulat <apaku@gmx.de> 0003 SPDX-FileCopyrightText: 2020 Jonathan Verner <jonathan.verner@matfyz.cz> 0004 0005 SPDX-License-Identifier: LGPL-2.0-or-later 0006 */ 0007 #include "gitdiff.h" 0008 0009 #include <QDebug> 0010 #include <QRegularExpression> 0011 #include <QSharedData> 0012 #include <QString> 0013 #include <QUrl> 0014 0015 /* A class representing a diff hunk (a collection of localized changes) */ 0016 class DiffHunk 0017 { 0018 public: 0019 /* Metadata for the hunk */ 0020 uint srcStart /**< the 1-based (!) start line number of the range in the source file where the hunk applies */ 0021 , 0022 srcCount /**< the size of the range (in # of lines) in the source where the hunk applies (i.e. ctx lines + deleted lines)*/ 0023 , 0024 tgtStart /**< the 1-based (!) start line number of the range in the target file where the hunk applies */ 0025 , 0026 tgtCount /**< the size of the range (in # of lines) in the target where the hunk applies (i.e. ctx lines + deleted lines)*/ 0027 , 0028 headingLineIdx /**< The 0-based line number (in the whole diff) of the hunk header line (the one starting with `@@`) */ 0029 ; 0030 QString srcFile /**< The source filename */ 0031 , 0032 tgtFile /**< The target filename */ 0033 , 0034 heading /**< The heading of the hunk (the stuff in the header line after the position spec, i.e. after the second `@@`) */ 0035 ; 0036 QStringList lines; /**< The lines comprising the hunk (excluding the header) */ 0037 0038 /** 0039 * @returns the 0-based line number (in the whole diff) of the last line contained in the hunk. 
0040 */ 0041 uint lastLineIdx() const 0042 { 0043 return headingLineIdx + lines.size(); 0044 } 0045 0046 /** 0047 * @param lineIdx the 0-based line number of the tested line in the whole diff 0048 * @returns true if the line is part of the diff and false otherwise 0049 * @note: Returns true also for the header line (the one starting with `@@`) 0050 */ 0051 bool containsDiffLine(uint lineIdx) const 0052 { 0053 return headingLineIdx <= lineIdx && lineIdx <= lastLineIdx(); 0054 } 0055 0056 /** 0057 * Returns the index of the line within the hunk 0058 * 0059 * @param diffLineIdx the 0-based indes of the line in the diff 0060 * 0061 * @note assumes that the line is contained within the hunk 0062 * @note if the line is a header line, -1 is returned; otherwise the returned 0063 * number is the index of the line in the `lines` list 0064 */ 0065 int diffLineToHunkLine(uint diffLineIdx) const 0066 { 0067 return diffLineIdx - (headingLineIdx + 1); 0068 } 0069 0070 /** 0071 * A helper method to construct a hunk header from the provided info 0072 * 0073 * A hunk header has the following form: 0074 * 0075 * @@ oldStart,oldCount newStart,newCount @@ heading 0076 * e.g. 0077 * @@ -36,14 +36,28 @@ public: 0078 * 0079 * @returns the hunk header 0080 */ 0081 static QString formatHeader(uint oldStart, uint oldCount, uint newStart, uint newCount, QString head); 0082 0083 /** 0084 * The following operators define a PARTIAL order on the hunks list. 0085 * A hunk H is strictly below a hunk K iff the endline of H is strictly below 0086 * the start line of K. In particular, the only non-overlapping hunks are 0087 * ordered. 
0088 */ 0089 bool operator<(const DiffHunk &b) const 0090 { 0091 return lastLineIdx() < b.headingLineIdx; 0092 } 0093 bool operator<(uint line) const 0094 { 0095 return lastLineIdx() < line; 0096 } 0097 bool operator<=(const DiffHunk &b) const 0098 { 0099 return lastLineIdx() <= b.headingLineIdx; 0100 } 0101 bool operator<=(uint line) const 0102 { 0103 return lastLineIdx() <= line; 0104 } 0105 bool operator>(const DiffHunk &b) const 0106 { 0107 return headingLineIdx > b.lastLineIdx(); 0108 } 0109 bool operator>(uint line) const 0110 { 0111 return headingLineIdx > line; 0112 } 0113 bool operator>=(const DiffHunk &b) const 0114 { 0115 return headingLineIdx >= b.lastLineIdx(); 0116 } 0117 bool operator>=(uint line) const 0118 { 0119 return headingLineIdx >= line; 0120 } 0121 }; 0122 0123 /* RegExp matching a hunk header line */ 0124 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, HUNK_HEADER_RE, (QLatin1String("^@@ -([0-9,]+) \\+([0-9,]+) @@(.*)"))) 0125 // static const auto HUNK_HEADER_RE = QRegularExpression(QStringLiteral("^@@ -([0-9,]+) \\+([0-9,]+) @@(.*)")); 0126 0127 /* RegExp matching a meta line containing a source of target filename */ 0128 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, DIFF_FILENAME_RE, (QLatin1String("^[-+]{3} [ab]/(.*)"))) 0129 0130 /* RegExp matching a meta line (hunk header, filename, other info) */ 0131 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, META_LINE_RE, (QLatin1String("(^[-+]{3} )|^[^-+ ]"))) 0132 0133 /* RegExps matching conflict delimiting lines */ 0134 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, CONFLICT_START_RE, (QLatin1String("^<<<<<<<"))) 0135 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, CONFLICT_MID_RE, (QLatin1String("^======="))) 0136 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, CONFLICT_END_RE, (QLatin1String("^>>>>>>>"))) 0137 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, CONFLICT_RE, (QLatin1String("(^>>>>>>>)|(^=======)|(^<<<<<<<)"))) 0138 0139 QString formatRange(uint 
start, uint count) 0140 { 0141 if (count == 1) 0142 return QString().setNum(start); 0143 return QString().setNum(start) + QLatin1Char(',') + QString().setNum(count); 0144 } 0145 0146 std::pair<uint, uint> parseRange(const QString &range) 0147 { 0148 int commaPos = range.indexOf(QLatin1Char(',')); 0149 if (commaPos > -1) { 0150 return {QStringView(range).sliced(0, commaPos).toInt(), QStringView(range).sliced(commaPos + 1).toInt()}; 0151 } 0152 return {range.toInt(), 1}; 0153 } 0154 0155 /* Creates a hunk header line (starting with @@) 0156 * 0157 * Note: The line will not end with a newline */ 0158 QString DiffHunk::formatHeader(uint oldStart, uint oldCount, uint newStart, uint newCount, QString head) 0159 { 0160 return QLatin1String("@@ -") + formatRange(oldStart, oldCount) + QLatin1String(" +") + formatRange(newStart, newCount) + QLatin1String(" @@") + head; 0161 } 0162 0163 /** 0164 * Parses a unified diff into a list of "diff hunks" (each hunk starts with a 0165 * line starting with @@ and represents a collection of localized changes). 0166 * 0167 * @param diff a diff in git's unified diff format 0168 * @returns a list of hunk structures 0169 * 0170 * The diff is assumed to be a collection of hunks, where each hunk has the 0171 * following structure: 0172 * 0173 * METADATA 0174 * --- a/SOURCE_PATH 0175 * +++ b/TARGET_PATH 0176 * HUNK HEADER 0177 * HUNK CONTENT 0178 * 0179 * All metadata lines match the @ref:META_LINE_RE regexp (starts with anything 0180 * except for a '+', '-' and ' ') and these are discarded except for the hunk 0181 * header and source/target path specifications. The path specifications 0182 * are assumed to apply to all following hunks until a new path specification 0183 * is found and are stored in the srcFileName and tgtFileName attributes of 0184 * the hunk structure. 
0185 * 0186 * 0187 * Hunk Header 0188 * ----------- 0189 * 0190 * The hunk header has the following form 0191 * 0192 * @@ -SRC_OFFSET[, SRC_CHANGES_COUNT] +TGT_OFFSET[, TGT_CHANGES_COUNT] @@ Heading 0193 * 0194 * where the SRC_OFFSET is a 1-based line index pointing to the source file where 0195 * the hunk applies and TGT_OFFSET is a 1-based line index pointing to the target 0196 * file where the hunk applies. These are parsed into the srcStart and tgtStart 0197 * attributes of the hunk structure. 0198 * 0199 * The optional SRC_CHANGES_COUNTS (assumed to be 1 if not present) specifies the 0200 * number of context lines (starting with ' ') plus the number of deleted lines 0201 * (starting with '-'). Similarly, the optional TGT_CHANGES_COUNT specifies the 0202 * number of context lines plus the number of added lines (starting with '+'). 0203 * These are parsed and stored in the srcCount and tgtCount attributes of the hunk 0204 * structure, but not checked (!). I.e. if the diff hunk has more/less changes then 0205 * specified, the returned hunk structure will have invalid src & tgt counts. 0206 * 0207 * Finally the Heading, used as a visual aid for users, is supposed to show the line 0208 * where the nearest enclosing function scope of the hunk starts. It is parsed and 0209 * stored in the heading attribute. 0210 * 0211 * Hunk Content 0212 * ------------ 0213 * 0214 * The hunk content is a collection of lines which 0215 * 0216 * 1) start with '+' (additions); or 0217 * 2) start with '-' (deletions); or 0218 * 3) start with ' ' (context lines); or 0219 * 4) are empty (context lines); or 0220 * 5) are within conflict markers 0221 * 0222 * These lines are parsed and stored in the lines attribute of the hunk structure. 0223 * The parsing of the hunk stops when a METADATA line (outside of conflict markers) 0224 * is encountered or the end of the file is reached. 
0225 * 0226 * Conflict Markers 0227 * ---------------- 0228 * 0229 * Conflict markers are collections of lines of the form: 0230 * 0231 * >>>>>>> our ref 0232 * our content 0233 * ... 0234 * ======= 0235 * their content 0236 * ... 0237 * <<<<<<< their ref 0238 * 0239 * And show content which a merge was not able to merge automatically. 0240 * Strictly speaking, these should not appear in diffs, but git diff 0241 * generates them anyway for files with unresolved conflicts. 0242 */ 0243 std::vector<DiffHunk> parseHunks(VcsDiff &diff) 0244 { 0245 std::vector<DiffHunk> ret; 0246 int lineNo = -1; 0247 QString curSrcFileName, curTgtFileName; 0248 QStringListIterator lines(diff.diff().split(QLatin1Char('\n'))); 0249 while (lines.hasNext()) { 0250 lineNo++; 0251 auto curln = lines.next(); 0252 auto m = DIFF_FILENAME_RE->match(curln); 0253 if (m.hasMatch()) { 0254 if (curln.startsWith(QLatin1Char('-'))) 0255 curSrcFileName = m.captured(1); 0256 else if (curln.startsWith(QLatin1Char('+'))) 0257 curTgtFileName = m.captured(1); 0258 continue; 0259 } 0260 m = HUNK_HEADER_RE->match(curln); 0261 if (!m.hasMatch()) 0262 continue; 0263 const auto oldRange = parseRange(m.captured(1)); 0264 const auto newRange = parseRange(m.captured(2)); 0265 const auto heading = m.captured(3); 0266 uint firstLineIdx = lineNo; 0267 QStringList hunkLines; 0268 while (lines.hasNext() && (CONFLICT_START_RE->match(lines.peekNext()).hasMatch() || !META_LINE_RE->match(lines.peekNext()).hasMatch())) { 0269 // Consume the conflict 0270 if (CONFLICT_START_RE->match(lines.peekNext()).hasMatch()) { 0271 lineNo++; 0272 hunkLines << lines.next(); 0273 while (lines.hasNext() && !CONFLICT_END_RE->match(lines.peekNext()).hasMatch()) { 0274 lineNo++; 0275 hunkLines << lines.next(); 0276 } 0277 if (!CONFLICT_END_RE->match(lines.peekNext()).hasMatch()) { 0278 qWarning() << "Invalid diff format, end of file reached before conflict finished"; 0279 qDebug() << diff.diff(); 0280 break; 0281 } 0282 } 0283 lineNo++; 0284 
hunkLines << lines.next(); 0285 } 0286 0287 // The number of filenames present in the diff should match the number 0288 // of hunks 0289 ret.push_back( 0290 DiffHunk{oldRange.first, oldRange.second, newRange.first, newRange.second, firstLineIdx, curSrcFileName, curTgtFileName, heading, hunkLines}); 0291 } 0292 0293 // If the diff ends with a newline, for the last hunk, when splitting into lines above 0294 // we will always get an empty string at the end, which we now remove 0295 if (diff.diff().endsWith(QLatin1Char('\n'))) { 0296 if (ret.size() > 0 && ret.back().lines.size() > 0) { 0297 ret.back().lines.pop_back(); 0298 } else { 0299 qWarning() << "Failed to parse a diff, produced no hunks"; 0300 qDebug() << "Failed diff:" << diff.diff(); 0301 } 0302 } 0303 0304 return ret; 0305 } 0306 0307 class VcsDiffPrivate 0308 { 0309 public: 0310 QUrl baseDiff; 0311 QString diff; 0312 uint depth = 0; 0313 std::vector<DiffHunk> hunks; 0314 0315 enum Dest { 0316 SRC = '-', 0317 TGT = '+', 0318 }; 0319 0320 /** 0321 * Maps a line position in the diff to a corresponding line position in the destination file. 0322 * 0323 * @param line a 0-based line position in the diff 0324 * @param dest specifies the destination file to map to: 0325 * either SRC (the source file, '-') or TGT (the target file, '+') 0326 * @returns the 0-based line position in the destination file or -1 if no such position exists. 0327 */ 0328 int mapDiffLine(const uint line, const Dest dest) const 0329 { 0330 const QLatin1Char skipChar = (dest == SRC) ? 
QLatin1Char(TGT) : QLatin1Char(SRC); 0331 for (const auto &h : hunks) { 0332 if (h.containsDiffLine(line)) { 0333 int hunkPos = h.diffLineToHunkLine(line); 0334 0335 // The line refers to the heading line 0336 if (hunkPos < 0) 0337 return -1; 0338 0339 // Any lines in the diff hunk which come before line and come from the opposite 0340 // of dest should not be counted (they are not present in the dest) 0341 int skipCount = 0; 0342 for (int i = 0; i < hunkPos; i++) { 0343 if (h.lines.at(i).startsWith(skipChar)) 0344 skipCount++; 0345 } 0346 0347 // Any lines in the diff hunk which come from the second part (src)/ first part (tgt) 0348 // of a conflict should not be counted either 0349 bool inConflict = false; // This is set so that a line inside a conflict is recognized as a valid line 0350 for (int i = 0; i < hunkPos; i++) { 0351 if (CONFLICT_START_RE->match(h.lines.at(i)).hasMatch()) { 0352 skipCount++; // skip the conflict marker line 0353 if (dest == TGT) { 0354 while ((++i) < hunkPos && !CONFLICT_MID_RE->match(h.lines.at(i)).hasMatch()) { 0355 skipCount++; 0356 } 0357 } else { 0358 inConflict = true; 0359 } 0360 } 0361 if (CONFLICT_MID_RE->match(h.lines.at(i)).hasMatch()) { 0362 skipCount++; // skip the conflict marker line 0363 if (dest == SRC) { 0364 while ((++i) < hunkPos && !CONFLICT_END_RE->match(h.lines.at(i)).hasMatch()) 0365 skipCount++; 0366 } else { 0367 inConflict = true; 0368 } 0369 } 0370 if (CONFLICT_END_RE->match(h.lines.at(i)).hasMatch()) { 0371 skipCount++; // skip the conflict marker line 0372 inConflict = false; 0373 } 0374 } 0375 0376 auto ln = h.lines[hunkPos]; 0377 0378 // This works around the fact that inConflict is set even if hunkPos 0379 // ends up hitting a conflict marker 0380 if (CONFLICT_RE->match(ln).hasMatch()) 0381 return -1; 0382 0383 if (ln.startsWith(QLatin1Char(dest)) || ln.startsWith(QLatin1Char(' ')) || ln.isEmpty() || inConflict) { 0384 if (dest == SRC) 0385 // The -1 accounts for the fact that srcStart is 1-based 0386 
// but we need to return 0-based line numbers 0387 return h.srcStart - 1 + hunkPos - skipCount; 0388 else 0389 // The -1 accounts for the fact that srcStart is 1-based 0390 // but we need to return 0-based line numbers 0391 return h.tgtStart - 1 + hunkPos - skipCount; 0392 } else 0393 return -1; 0394 } 0395 } 0396 return -1; 0397 } 0398 }; 0399 0400 VcsDiff VcsDiff::subDiffHunk(const uint line, DiffDirection dir) const 0401 { 0402 for (const auto &hunk : d->hunks) { 0403 if (hunk.containsDiffLine(line)) { 0404 return subDiff(hunk.headingLineIdx, hunk.lastLineIdx(), dir); 0405 } 0406 } 0407 0408 VcsDiff emptyDiff; 0409 emptyDiff.setBaseDiff(d->baseDiff); 0410 emptyDiff.setDepth(d->depth); 0411 emptyDiff.setDiff(d->diff.mid(0, d->diff.indexOf(QStringLiteral("@@")))); 0412 return emptyDiff; 0413 } 0414 0415 VcsDiff VcsDiff::subDiff(const uint startLine, const uint endLine, DiffDirection dir) const 0416 { 0417 // Code adapted from cola/diffparse.py 0418 enum LineType { ADD = '+', DEL = '-', CTX = ' ', NO_NEWLINE = '\\' }; 0419 0420 VcsDiff ret; 0421 ret.setBaseDiff(baseDiff()); 0422 ret.setDepth(depth()); 0423 0424 QStringList lines; 0425 for (const auto &hunk : d->hunks) { 0426 // Skip hunks before the first line 0427 if (hunk < startLine) 0428 continue; 0429 0430 // Skip hunks after the last line 0431 if (hunk > endLine) 0432 break; 0433 0434 std::map<LineType, int> counts = {{ADD, 0}, {DEL, 0}, {CTX, 0}, {NO_NEWLINE, 0}}; 0435 QStringList filteredLines; 0436 0437 // Will be set if the previous line in a hunk was 0438 // skipped because it was not in the selected range 0439 bool prevSkipped = false; 0440 0441 uint lnIdx = hunk.headingLineIdx; 0442 0443 // Store the number of skipped lines which start the hunk 0444 // (i.e. 
lines before a first deletion (addition in case of reverse) 0445 // so that we can adjust the start appropriately 0446 int startOffset = 0; 0447 const auto _lines = QStringList(hunk.lines.constBegin(), hunk.lines.constEnd()); 0448 for (const auto &line : _lines) { 0449 lnIdx++; 0450 LineType tp = line.length() > 0 ? (LineType)line[0].toLatin1() : (LineType)0; 0451 QString content = line.mid(1); 0452 0453 if (dir == Reverse) { 0454 if (tp == ADD) 0455 tp = DEL; 0456 else if (tp == DEL) 0457 tp = ADD; 0458 } 0459 0460 if (lnIdx < startLine || endLine < lnIdx) { 0461 // skip additions (or deletions if reverse) that are not in range 0462 if (tp == ADD) { 0463 prevSkipped = true; 0464 // If we are before the range and 0465 // so far we only encountered ADD (or DEL, in case of reverse) lines 0466 // these will not be included in the subdiff hunk so we increase the startOffset 0467 if (lnIdx < startLine && counts[CTX] == 0) 0468 startOffset++; 0469 continue; 0470 } 0471 // change deletions (or additions if reverse) that are not in range into context 0472 if (tp == DEL) 0473 tp = CTX; 0474 } 0475 0476 // If the line immediately before a "No newline" line was 0477 // skipped (because it was an unselected addition) skip 0478 // the "No newline" line as well. 
0479 if (tp == NO_NEWLINE && prevSkipped) { 0480 if (lnIdx <= endLine) 0481 startOffset++; 0482 continue; 0483 } 0484 0485 // Empty lines are context lines and we 0486 // preserve them 0487 if ((int)tp == 0) { 0488 filteredLines << content; 0489 tp = CTX; 0490 } else { 0491 filteredLines << QLatin1Char(tp) + content; 0492 } 0493 counts[tp]++; 0494 prevSkipped = false; 0495 } 0496 0497 // Skip hunks which have only context lines 0498 if (counts[ADD] + counts[DEL] == 0) 0499 continue; 0500 0501 // Compute the start & counts of the hunks 0502 uint subSrcStart, subTgtStart; 0503 if (dir == Reverse) { 0504 subSrcStart = hunk.tgtStart + startOffset; 0505 subTgtStart = hunk.srcStart + startOffset; 0506 } else { 0507 subSrcStart = hunk.srcStart + startOffset; 0508 subTgtStart = hunk.tgtStart + startOffset; 0509 } 0510 uint subSrcCount = counts[CTX] + counts[DEL]; 0511 uint subTgtCount = counts[CTX] + counts[ADD]; 0512 0513 // Prepend lines identifying the source files 0514 lines << QStringLiteral("--- a/") + ((dir == Reverse) ? hunk.tgtFile : hunk.srcFile); 0515 lines << QStringLiteral("+++ b/") + ((dir == Reverse) ? 
hunk.srcFile : hunk.tgtFile); 0516 0517 lines << DiffHunk::formatHeader(subSrcStart, subSrcCount, subTgtStart, subTgtCount, hunk.heading); 0518 lines += filteredLines; 0519 } 0520 if (lines.size() > 2) 0521 ret.setDiff(lines.join(QLatin1Char('\n')) + QLatin1Char('\n')); 0522 return ret; 0523 } 0524 0525 int VcsDiff::diffLineToSourceLine(const uint line) const 0526 { 0527 return d->mapDiffLine(line, VcsDiffPrivate::SRC); 0528 } 0529 0530 int VcsDiff::diffLineToTargetLine(const uint line) const 0531 { 0532 return d->mapDiffLine(line, VcsDiffPrivate::TGT); 0533 } 0534 0535 VcsDiff::VcsDiff() 0536 : d(new VcsDiffPrivate) 0537 { 0538 } 0539 0540 VcsDiff::~VcsDiff() = default; 0541 0542 VcsDiff::VcsDiff(VcsDiff &&rhs) 0543 { 0544 this->d = std::move(rhs.d); 0545 } 0546 0547 bool VcsDiff::isEmpty() const 0548 { 0549 return d->diff.isEmpty(); 0550 } 0551 0552 QString VcsDiff::diff() const 0553 { 0554 return d->diff; 0555 } 0556 0557 void VcsDiff::setDiff(const QString &s) 0558 { 0559 d->diff = s; 0560 d->hunks = parseHunks(*this); 0561 } 0562 0563 QUrl VcsDiff::baseDiff() const 0564 { 0565 return d->baseDiff; 0566 } 0567 0568 uint VcsDiff::depth() const 0569 { 0570 return d->depth; 0571 } 0572 0573 void VcsDiff::setBaseDiff(const QUrl &url) 0574 { 0575 d->baseDiff = url; 0576 } 0577 0578 void VcsDiff::setDepth(const uint depth) 0579 { 0580 d->depth = depth; 0581 }