File indexing completed on 2024-10-27 05:18:19

0001 /*
0002     SPDX-FileCopyrightText: 2007 Andreas Pakulat <apaku@gmx.de>
0003     SPDX-FileCopyrightText: 2020 Jonathan Verner <jonathan.verner@matfyz.cz>
0004 
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0007 #include "gitdiff.h"
0008 
0009 #include <QDebug>
0010 #include <QRegularExpression>
0011 #include <QSharedData>
0012 #include <QString>
0013 #include <QUrl>
0014 
0015 /* A class representing a diff hunk (a collection of localized changes) */
0016 class DiffHunk
0017 {
0018 public:
0019     /* Metadata for the hunk */
0020     uint srcStart /**< the 1-based (!) start line number of the range in the source file where the hunk applies */
0021         ,
0022         srcCount /**< the size of the range (in # of lines) in the source where the hunk applies  (i.e. ctx lines + deleted lines)*/
0023         ,
0024         tgtStart /**< the 1-based (!) start line number of the range in the target file where the hunk applies */
0025         ,
0026         tgtCount /**< the size of the range (in # of lines) in the target where the hunk applies  (i.e. ctx lines + deleted lines)*/
0027         ,
0028         headingLineIdx /**< The 0-based line number (in the whole diff) of the hunk header line (the one starting with `@@`) */
0029         ;
0030     QString srcFile /**< The source filename */
0031         ,
0032         tgtFile /**< The target filename */
0033         ,
0034         heading /**< The heading of the hunk (the stuff in the header line after the position spec, i.e. after the second `@@`) */
0035         ;
0036     QStringList lines; /**< The lines comprising the hunk (excluding the header) */
0037 
0038     /**
0039      * @returns the 0-based line number (in the whole diff) of the last line contained in the hunk.
0040      */
0041     uint lastLineIdx() const
0042     {
0043         return headingLineIdx + lines.size();
0044     }
0045 
0046     /**
0047      * @param lineIdx the 0-based line number of the tested line in the whole diff
0048      * @returns true if the line is part of the diff and false otherwise
0049      * @note: Returns true also for the header line (the one starting with `@@`)
0050      */
0051     bool containsDiffLine(uint lineIdx) const
0052     {
0053         return headingLineIdx <= lineIdx && lineIdx <= lastLineIdx();
0054     }
0055 
0056     /**
0057      * Returns the index of the line within the hunk
0058      *
0059      * @param diffLineIdx the 0-based indes of the line in the diff
0060      *
0061      * @note assumes that the line is contained within the hunk
0062      * @note if the line is a header line, -1 is returned; otherwise the returned
0063      * number is the index of the line in the `lines` list
0064      */
0065     int diffLineToHunkLine(uint diffLineIdx) const
0066     {
0067         return diffLineIdx - (headingLineIdx + 1);
0068     }
0069 
0070     /**
0071      * A helper method to construct a hunk header from the provided info
0072      *
0073      * A hunk header has the following form:
0074      *
0075      *      @@ oldStart,oldCount newStart,newCount @@ heading
0076      * e.g.
0077      *      @@ -36,14 +36,28 @@ public:
0078      *
0079      * @returns the hunk header
0080      */
0081     static QString formatHeader(uint oldStart, uint oldCount, uint newStart, uint newCount, QString head);
0082 
0083     /**
0084      * The following operators define a PARTIAL order on the hunks list.
0085      * A hunk H is strictly below a hunk K iff the endline of H is strictly below
0086      * the start line of K. In particular, the only non-overlapping hunks are
0087      * ordered.
0088      */
0089     bool operator<(const DiffHunk &b) const
0090     {
0091         return lastLineIdx() < b.headingLineIdx;
0092     }
0093     bool operator<(uint line) const
0094     {
0095         return lastLineIdx() < line;
0096     }
0097     bool operator<=(const DiffHunk &b) const
0098     {
0099         return lastLineIdx() <= b.headingLineIdx;
0100     }
0101     bool operator<=(uint line) const
0102     {
0103         return lastLineIdx() <= line;
0104     }
0105     bool operator>(const DiffHunk &b) const
0106     {
0107         return headingLineIdx > b.lastLineIdx();
0108     }
0109     bool operator>(uint line) const
0110     {
0111         return headingLineIdx > line;
0112     }
0113     bool operator>=(const DiffHunk &b) const
0114     {
0115         return headingLineIdx >= b.lastLineIdx();
0116     }
0117     bool operator>=(uint line) const
0118     {
0119         return headingLineIdx >= line;
0120     }
0121 };
0122 
0123 /* RegExp matching a hunk header line */
0124 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, HUNK_HEADER_RE, (QLatin1String("^@@ -([0-9,]+) \\+([0-9,]+) @@(.*)")))
0125 // static const auto HUNK_HEADER_RE = QRegularExpression(QStringLiteral("^@@ -([0-9,]+) \\+([0-9,]+) @@(.*)"));
0126 
0127 /* RegExp matching a meta line containing a source of target filename */
0128 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, DIFF_FILENAME_RE, (QLatin1String("^[-+]{3} [ab]/(.*)")))
0129 
0130 /* RegExp matching a meta line (hunk header, filename, other info) */
0131 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, META_LINE_RE, (QLatin1String("(^[-+]{3} )|^[^-+ ]")))
0132 
0133 /* RegExps matching conflict delimiting lines */
0134 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, CONFLICT_START_RE, (QLatin1String("^<<<<<<<")))
0135 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, CONFLICT_MID_RE, (QLatin1String("^=======")))
0136 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, CONFLICT_END_RE, (QLatin1String("^>>>>>>>")))
0137 Q_GLOBAL_STATIC_WITH_ARGS(const QRegularExpression, CONFLICT_RE, (QLatin1String("(^>>>>>>>)|(^=======)|(^<<<<<<<)")))
0138 
0139 QString formatRange(uint start, uint count)
0140 {
0141     if (count == 1)
0142         return QString().setNum(start);
0143     return QString().setNum(start) + QLatin1Char(',') + QString().setNum(count);
0144 }
0145 
0146 std::pair<uint, uint> parseRange(const QString &range)
0147 {
0148     int commaPos = range.indexOf(QLatin1Char(','));
0149     if (commaPos > -1) {
0150         return {QStringView(range).sliced(0, commaPos).toInt(), QStringView(range).sliced(commaPos + 1).toInt()};
0151     }
0152     return {range.toInt(), 1};
0153 }
0154 
0155 /* Creates a hunk header line (starting with @@)
0156  *
0157  * Note: The line will not end with a newline */
0158 QString DiffHunk::formatHeader(uint oldStart, uint oldCount, uint newStart, uint newCount, QString head)
0159 {
0160     return QLatin1String("@@ -") + formatRange(oldStart, oldCount) + QLatin1String(" +") + formatRange(newStart, newCount) + QLatin1String(" @@") + head;
0161 }
0162 
0163 /**
0164  * Parses a unified diff into a list of "diff hunks" (each hunk starts with a
0165  * line starting with @@ and represents a collection of localized changes).
0166  *
0167  * @param diff a diff in git's unified diff format
0168  * @returns a list of hunk structures
0169  *
0170  * The diff is assumed to be a collection of hunks, where each hunk has the
0171  * following structure:
0172  *
0173  *   METADATA
0174  *   --- a/SOURCE_PATH
0175  *   +++ b/TARGET_PATH
0176  *   HUNK HEADER
0177  *   HUNK CONTENT
0178  *
0179  * All metadata lines match the @ref:META_LINE_RE regexp (starts with anything
0180  * except for a '+', '-' and ' ') and these are discarded except for the hunk
0181  * header and source/target path specifications. The path specifications
0182  * are assumed to apply to all following hunks until a new path specification
0183  * is found and are stored in the srcFileName and tgtFileName attributes of
0184  * the hunk structure.
0185  *
0186  *
0187  * Hunk Header
0188  * -----------
0189  *
0190  * The hunk header has the following form
0191  *
0192  *   @@ -SRC_OFFSET[, SRC_CHANGES_COUNT] +TGT_OFFSET[, TGT_CHANGES_COUNT] @@ Heading
0193  *
0194  * where the SRC_OFFSET is a 1-based line index pointing to the source file where
0195  * the hunk applies and TGT_OFFSET is a 1-based line index pointing to the target
0196  * file where the hunk applies. These are parsed into the srcStart and tgtStart
0197  * attributes of the hunk structure.
0198  *
0199  * The optional SRC_CHANGES_COUNTS (assumed to be 1 if not present) specifies the
0200  * number of context lines (starting with ' ') plus the number of deleted lines
0201  * (starting with '-'). Similarly, the optional TGT_CHANGES_COUNT specifies the
0202  * number of context lines plus the number of added lines (starting with '+').
0203  * These are parsed and stored in the srcCount and tgtCount attributes of the hunk
0204  * structure, but not checked (!). I.e. if the diff hunk has more/less changes then
0205  * specified, the returned hunk structure will have invalid src & tgt counts.
0206  *
0207  * Finally the Heading, used as a visual aid for users, is supposed to show the line
0208  * where the nearest enclosing function scope of the hunk starts. It is parsed and
0209  * stored in the heading attribute.
0210  *
0211  * Hunk Content
0212  * ------------
0213  *
0214  * The hunk content is a collection of lines which
0215  *
0216  *   1) start with '+' (additions); or
0217  *   2) start with '-' (deletions); or
0218  *   3) start with ' ' (context lines); or
0219  *   4) are empty (context lines); or
0220  *   5) are within conflict markers
0221  *
0222  * These lines are parsed and stored in the lines attribute of the hunk structure.
0223  * The parsing of the hunk stops when a METADATA line (outside of conflict markers)
0224  * is encountered or the end of the file is reached.
0225  *
0226  * Conflict Markers
0227  * ----------------
0228  *
0229  * Conflict markers are collections of lines of the form:
0230  *
0231  *   >>>>>>> our ref
0232  *   our content
0233  *   ...
0234  *   =======
0235  *   their content
0236  *   ...
0237  *   <<<<<<< their ref
0238  *
0239  * And show content which a merge was not able to merge automatically.
0240  * Strictly speaking, these should not appear in diffs, but git diff
0241  * generates them anyway for files with unresolved conflicts.
0242  */
0243 std::vector<DiffHunk> parseHunks(VcsDiff &diff)
0244 {
0245     std::vector<DiffHunk> ret;
0246     int lineNo = -1;
0247     QString curSrcFileName, curTgtFileName;
0248     QStringListIterator lines(diff.diff().split(QLatin1Char('\n')));
0249     while (lines.hasNext()) {
0250         lineNo++;
0251         auto curln = lines.next();
0252         auto m = DIFF_FILENAME_RE->match(curln);
0253         if (m.hasMatch()) {
0254             if (curln.startsWith(QLatin1Char('-')))
0255                 curSrcFileName = m.captured(1);
0256             else if (curln.startsWith(QLatin1Char('+')))
0257                 curTgtFileName = m.captured(1);
0258             continue;
0259         }
0260         m = HUNK_HEADER_RE->match(curln);
0261         if (!m.hasMatch())
0262             continue;
0263         const auto oldRange = parseRange(m.captured(1));
0264         const auto newRange = parseRange(m.captured(2));
0265         const auto heading = m.captured(3);
0266         uint firstLineIdx = lineNo;
0267         QStringList hunkLines;
0268         while (lines.hasNext() && (CONFLICT_START_RE->match(lines.peekNext()).hasMatch() || !META_LINE_RE->match(lines.peekNext()).hasMatch())) {
0269             // Consume the conflict
0270             if (CONFLICT_START_RE->match(lines.peekNext()).hasMatch()) {
0271                 lineNo++;
0272                 hunkLines << lines.next();
0273                 while (lines.hasNext() && !CONFLICT_END_RE->match(lines.peekNext()).hasMatch()) {
0274                     lineNo++;
0275                     hunkLines << lines.next();
0276                 }
0277                 if (!CONFLICT_END_RE->match(lines.peekNext()).hasMatch()) {
0278                     qWarning() << "Invalid diff format, end of file reached before conflict finished";
0279                     qDebug() << diff.diff();
0280                     break;
0281                 }
0282             }
0283             lineNo++;
0284             hunkLines << lines.next();
0285         }
0286 
0287         // The number of filenames present in the diff should match the number
0288         // of hunks
0289         ret.push_back(
0290             DiffHunk{oldRange.first, oldRange.second, newRange.first, newRange.second, firstLineIdx, curSrcFileName, curTgtFileName, heading, hunkLines});
0291     }
0292 
0293     // If the diff ends with a newline, for the last hunk, when splitting into lines above
0294     // we will always get an empty string at the end, which we now remove
0295     if (diff.diff().endsWith(QLatin1Char('\n'))) {
0296         if (ret.size() > 0 && ret.back().lines.size() > 0) {
0297             ret.back().lines.pop_back();
0298         } else {
0299             qWarning() << "Failed to parse a diff, produced no hunks";
0300             qDebug() << "Failed diff:" << diff.diff();
0301         }
0302     }
0303 
0304     return ret;
0305 }
0306 
0307 class VcsDiffPrivate
0308 {
0309 public:
0310     QUrl baseDiff;
0311     QString diff;
0312     uint depth = 0;
0313     std::vector<DiffHunk> hunks;
0314 
0315     enum Dest {
0316         SRC = '-',
0317         TGT = '+',
0318     };
0319 
0320     /**
0321      * Maps a line position in the diff to a corresponding line position in the destination file.
0322      *
0323      * @param line a 0-based line position in the diff
0324      * @param dest specifies the destination file to map to:
0325      *             either SRC (the source file, '-') or TGT (the target file, '+')
0326      * @returns the 0-based line position in the destination file or -1 if no such position exists.
0327      */
0328     int mapDiffLine(const uint line, const Dest dest) const
0329     {
0330         const QLatin1Char skipChar = (dest == SRC) ? QLatin1Char(TGT) : QLatin1Char(SRC);
0331         for (const auto &h : hunks) {
0332             if (h.containsDiffLine(line)) {
0333                 int hunkPos = h.diffLineToHunkLine(line);
0334 
0335                 // The line refers to the heading line
0336                 if (hunkPos < 0)
0337                     return -1;
0338 
0339                 // Any lines in the diff hunk which come before line and come from the opposite
0340                 // of dest should not be counted (they are not present in the dest)
0341                 int skipCount = 0;
0342                 for (int i = 0; i < hunkPos; i++) {
0343                     if (h.lines.at(i).startsWith(skipChar))
0344                         skipCount++;
0345                 }
0346 
0347                 // Any lines in the diff hunk which come from the second part (src)/ first part (tgt)
0348                 // of a conflict should not be counted either
0349                 bool inConflict = false; // This is set so that a line inside a conflict is recognized as a valid line
0350                 for (int i = 0; i < hunkPos; i++) {
0351                     if (CONFLICT_START_RE->match(h.lines.at(i)).hasMatch()) {
0352                         skipCount++; // skip the conflict marker line
0353                         if (dest == TGT) {
0354                             while ((++i) < hunkPos && !CONFLICT_MID_RE->match(h.lines.at(i)).hasMatch()) {
0355                                 skipCount++;
0356                             }
0357                         } else {
0358                             inConflict = true;
0359                         }
0360                     }
0361                     if (CONFLICT_MID_RE->match(h.lines.at(i)).hasMatch()) {
0362                         skipCount++; // skip the conflict marker line
0363                         if (dest == SRC) {
0364                             while ((++i) < hunkPos && !CONFLICT_END_RE->match(h.lines.at(i)).hasMatch())
0365                                 skipCount++;
0366                         } else {
0367                             inConflict = true;
0368                         }
0369                     }
0370                     if (CONFLICT_END_RE->match(h.lines.at(i)).hasMatch()) {
0371                         skipCount++; // skip the conflict marker line
0372                         inConflict = false;
0373                     }
0374                 }
0375 
0376                 auto ln = h.lines[hunkPos];
0377 
0378                 // This works around the fact that inConflict is set even if hunkPos
0379                 // ends up hitting a conflict marker
0380                 if (CONFLICT_RE->match(ln).hasMatch())
0381                     return -1;
0382 
0383                 if (ln.startsWith(QLatin1Char(dest)) || ln.startsWith(QLatin1Char(' ')) || ln.isEmpty() || inConflict) {
0384                     if (dest == SRC)
0385                         // The -1 accounts for the fact that srcStart is 1-based
0386                         // but we need to return 0-based line numbers
0387                         return h.srcStart - 1 + hunkPos - skipCount;
0388                     else
0389                         // The -1 accounts for the fact that srcStart is 1-based
0390                         // but we need to return 0-based line numbers
0391                         return h.tgtStart - 1 + hunkPos - skipCount;
0392                 } else
0393                     return -1;
0394             }
0395         }
0396         return -1;
0397     }
0398 };
0399 
0400 VcsDiff VcsDiff::subDiffHunk(const uint line, DiffDirection dir) const
0401 {
0402     for (const auto &hunk : d->hunks) {
0403         if (hunk.containsDiffLine(line)) {
0404             return subDiff(hunk.headingLineIdx, hunk.lastLineIdx(), dir);
0405         }
0406     }
0407 
0408     VcsDiff emptyDiff;
0409     emptyDiff.setBaseDiff(d->baseDiff);
0410     emptyDiff.setDepth(d->depth);
0411     emptyDiff.setDiff(d->diff.mid(0, d->diff.indexOf(QStringLiteral("@@"))));
0412     return emptyDiff;
0413 }
0414 
0415 VcsDiff VcsDiff::subDiff(const uint startLine, const uint endLine, DiffDirection dir) const
0416 {
0417     // Code adapted from cola/diffparse.py
0418     enum LineType { ADD = '+', DEL = '-', CTX = ' ', NO_NEWLINE = '\\' };
0419 
0420     VcsDiff ret;
0421     ret.setBaseDiff(baseDiff());
0422     ret.setDepth(depth());
0423 
0424     QStringList lines;
0425     for (const auto &hunk : d->hunks) {
0426         // Skip hunks before the first line
0427         if (hunk < startLine)
0428             continue;
0429 
0430         // Skip hunks after the last line
0431         if (hunk > endLine)
0432             break;
0433 
0434         std::map<LineType, int> counts = {{ADD, 0}, {DEL, 0}, {CTX, 0}, {NO_NEWLINE, 0}};
0435         QStringList filteredLines;
0436 
0437         // Will be set if the previous line in a hunk was
0438         // skipped because it was not in the selected range
0439         bool prevSkipped = false;
0440 
0441         uint lnIdx = hunk.headingLineIdx;
0442 
0443         // Store the number of skipped lines which start the hunk
0444         // (i.e. lines before a first deletion (addition in case of reverse)
0445         // so that we can adjust the start appropriately
0446         int startOffset = 0;
0447         const auto _lines = QStringList(hunk.lines.constBegin(), hunk.lines.constEnd());
0448         for (const auto &line : _lines) {
0449             lnIdx++;
0450             LineType tp = line.length() > 0 ? (LineType)line[0].toLatin1() : (LineType)0;
0451             QString content = line.mid(1);
0452 
0453             if (dir == Reverse) {
0454                 if (tp == ADD)
0455                     tp = DEL;
0456                 else if (tp == DEL)
0457                     tp = ADD;
0458             }
0459 
0460             if (lnIdx < startLine || endLine < lnIdx) {
0461                 // skip additions (or deletions if reverse) that are not in range
0462                 if (tp == ADD) {
0463                     prevSkipped = true;
0464                     // If we are before the range and
0465                     // so far we only encountered ADD (or DEL, in case of reverse) lines
0466                     // these will not be included in the subdiff hunk so we increase the startOffset
0467                     if (lnIdx < startLine && counts[CTX] == 0)
0468                         startOffset++;
0469                     continue;
0470                 }
0471                 // change deletions (or additions if reverse) that are not in range into context
0472                 if (tp == DEL)
0473                     tp = CTX;
0474             }
0475 
0476             // If the line immediately before a "No newline" line was
0477             // skipped (because it was an unselected addition) skip
0478             // the "No newline" line as well.
0479             if (tp == NO_NEWLINE && prevSkipped) {
0480                 if (lnIdx <= endLine)
0481                     startOffset++;
0482                 continue;
0483             }
0484 
0485             // Empty lines are context lines and we
0486             // preserve them
0487             if ((int)tp == 0) {
0488                 filteredLines << content;
0489                 tp = CTX;
0490             } else {
0491                 filteredLines << QLatin1Char(tp) + content;
0492             }
0493             counts[tp]++;
0494             prevSkipped = false;
0495         }
0496 
0497         // Skip hunks which have only context lines
0498         if (counts[ADD] + counts[DEL] == 0)
0499             continue;
0500 
0501         // Compute the start & counts of the hunks
0502         uint subSrcStart, subTgtStart;
0503         if (dir == Reverse) {
0504             subSrcStart = hunk.tgtStart + startOffset;
0505             subTgtStart = hunk.srcStart + startOffset;
0506         } else {
0507             subSrcStart = hunk.srcStart + startOffset;
0508             subTgtStart = hunk.tgtStart + startOffset;
0509         }
0510         uint subSrcCount = counts[CTX] + counts[DEL];
0511         uint subTgtCount = counts[CTX] + counts[ADD];
0512 
0513         // Prepend lines identifying the source files
0514         lines << QStringLiteral("--- a/") + ((dir == Reverse) ? hunk.tgtFile : hunk.srcFile);
0515         lines << QStringLiteral("+++ b/") + ((dir == Reverse) ? hunk.srcFile : hunk.tgtFile);
0516 
0517         lines << DiffHunk::formatHeader(subSrcStart, subSrcCount, subTgtStart, subTgtCount, hunk.heading);
0518         lines += filteredLines;
0519     }
0520     if (lines.size() > 2)
0521         ret.setDiff(lines.join(QLatin1Char('\n')) + QLatin1Char('\n'));
0522     return ret;
0523 }
0524 
0525 int VcsDiff::diffLineToSourceLine(const uint line) const
0526 {
0527     return d->mapDiffLine(line, VcsDiffPrivate::SRC);
0528 }
0529 
0530 int VcsDiff::diffLineToTargetLine(const uint line) const
0531 {
0532     return d->mapDiffLine(line, VcsDiffPrivate::TGT);
0533 }
0534 
0535 VcsDiff::VcsDiff()
0536     : d(new VcsDiffPrivate)
0537 {
0538 }
0539 
0540 VcsDiff::~VcsDiff() = default;
0541 
0542 VcsDiff::VcsDiff(VcsDiff &&rhs)
0543 {
0544     this->d = std::move(rhs.d);
0545 }
0546 
0547 bool VcsDiff::isEmpty() const
0548 {
0549     return d->diff.isEmpty();
0550 }
0551 
0552 QString VcsDiff::diff() const
0553 {
0554     return d->diff;
0555 }
0556 
0557 void VcsDiff::setDiff(const QString &s)
0558 {
0559     d->diff = s;
0560     d->hunks = parseHunks(*this);
0561 }
0562 
0563 QUrl VcsDiff::baseDiff() const
0564 {
0565     return d->baseDiff;
0566 }
0567 
0568 uint VcsDiff::depth() const
0569 {
0570     return d->depth;
0571 }
0572 
0573 void VcsDiff::setBaseDiff(const QUrl &url)
0574 {
0575     d->baseDiff = url;
0576 }
0577 
0578 void VcsDiff::setDepth(const uint depth)
0579 {
0580     d->depth = depth;
0581 }