File indexing completed on 2024-04-14 05:37:44

0001 /*
0002  *  SPDX-FileCopyrightText: 2019  Andreas Cord-Landwehr <cordlandwehr@kde.org>
0003  *
0004  *  SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
0005  */
0006 
0007 #include "directoryparser.h"
0008 #include "skipparser.h"
0009 #include <QDebug>
0010 #include <QDirIterator>
0011 #include <QTextStream>
0012 #include <QVector>
0013 
0014 const QStringList DirectoryParser::s_supportedExtensions = {".cpp",  ".cc", ".c", ".h",  ".css",  ".hpp", ".qml", ".cmake", "CMakeLists.txt", ".in",  ".py", ".frag", ".vert",
0015                                                             ".glsl", "php", "sh", ".mm", ".java", ".kt",  ".js",  ".xml",   ".xsd",           ".xsl", ".pl", ".rb",   ".docbook"};
0016 
0017 bool shallIgnoreFile(const QDirIterator &iterator, const QRegularExpression &fileToIgnorePattern)
0018 {
0019     QFileInfo fileInfo(iterator.fileInfo());
0020     return !fileInfo.isFile() or (!fileToIgnorePattern.pattern().isEmpty() && fileToIgnorePattern.match(fileInfo.filePath()).hasMatch());
0021 }
0022 
0023 void DirectoryParser::setLicenseHeaderParser(LicenseParser parser)
0024 {
0025     m_parserType = parser;
0026 }
0027 
0028 QRegularExpression DirectoryParser::spdxStatementRegExp() const
0029 {
0030     static auto regexp = QRegularExpression("(SPDX-License-Identifier: (?<expression>(.*)))");
0031     return regexp;
0032 }
0033 
0034 QRegularExpression DirectoryParser::copyrightRegExp() const
0035 {
0036     static auto regexp = QRegularExpression(
0037         "(?<!\")" // negative lookahead for quotation marks, to skip string statements
0038         "(SPDX-FileCopyrightText:|[cC]opyright(\\s*:?\\s+\\([cC]\\))|(?<![cC]opyright )\\([cC]\\)|[cC]opyright\\s+©|(?<![cC]opyright )©|[cC]opyright(\\s*:)?)"
0039         "[, ]+"
0040         "(?<years>([0-9]+(-[0-9]+| - [0-9]+| to [0-9]+|,[ ]?[0-9]+)*|%{CURRENT_YEAR}))"
0041         "[, ]+"
0042         "([bB]y[ ]+)?"
0043         "(?<name>([\u00C0-\u017Fa-zA-Z\\-\\.]+( [\u00C0-\u017Fa-zA-Z\\-\\.]+)*|%{AUTHOR}))"
0044         "[, ]*"
0045         "(?<contact>.*|%{EMAIL})");
0046     return regexp;
0047 }
0048 
0049 QString DirectoryParser::cleanupSpaceInCopyrightYearList(const QString &originalYearText) const
0050 {
0051     QString cleanedYearText = originalYearText;
0052 
0053     static auto missingWhitespaceAfterCommaRegex = QRegularExpression(QStringLiteral(",(?=[0-9])"));
0054     static auto unneededWhitespaceAroundRangeRegex = QRegularExpression(QStringLiteral(" - (?=[0-9])"));
0055     static auto writtenRangeStatementRegex = QRegularExpression(QStringLiteral(" to (?=[0-9])"));
0056 
0057     cleanedYearText.replace(missingWhitespaceAfterCommaRegex, QStringLiteral(", "));
0058     cleanedYearText.replace(unneededWhitespaceAroundRangeRegex, QStringLiteral("-"));
0059     cleanedYearText.replace(writtenRangeStatementRegex, QStringLiteral("-"));
0060 
0061     return cleanedYearText;
0062 }
0063 
0064 QString DirectoryParser::unifyCopyrightStatements(const QString &originalText) const
0065 {
0066     QString header = originalText;
0067     QRegularExpression regExp = copyrightRegExp();
0068     auto match = regExp.match(header);
0069 
0070     while (match.hasMatch()) {
0071         QString years = match.captured("years");
0072         years = cleanupSpaceInCopyrightYearList(years);
0073         QString name = match.captured("name");
0074         QString contact = match.captured("contact");
0075 
0076         QString unifiedCopyright = QString("SPDX-FileCopyrightText: %1 %2 %3").arg(years).arg(name).arg(contact).trimmed();
0077         header.replace(match.capturedStart(), match.capturedLength(), unifiedCopyright);
0078         match = regExp.match(header, match.capturedStart() + unifiedCopyright.length());
0079     }
0080     return header;
0081 }
0082 
0083 QString DirectoryParser::unifyCopyrightCommentHeader(const QString &originalText) const
0084 {
0085     // restrict conversion to top-file comments
0086     if (!originalText.startsWith("/*")) {
0087         qWarning() << "\tFile not starting with a comment.";
0088         return originalText;
0089     }
0090     auto lines = originalText.split("\n");
0091     for (int i = 0; i < lines.size(); ++i) {
0092         lines[i].replace(QRegularExpression("/(\\*)+"), "/*"); // initial comment line
0093         if (lines[i].startsWith("/*")) {
0094             continue; // do not further modify first line
0095         }
0096         lines[i].replace(QRegularExpression("[ ]*(\\*)+/"), "*/"); // final comment line
0097         if (lines[i].startsWith("*/")) {
0098             break;
0099         }
0100         // invariant: the following line is guaranteed to be a port of multiline comment
0101         lines[i].replace(QRegularExpression("^[ \\*]+(?!(\\\\))"), "    "); // in-between line
0102         lines[i].replace(QRegularExpression("[ \\*]+$"), ""); // in-between line
0103     }
0104     QString text = lines.join("\n");
0105     //qDebug() << text;
0106     return text;
0107 }
0108 
0109 QString DirectoryParser::replaceHeaderText(const QString &fileContent, const QString &spdxExpression) const
0110 {
0111     auto regexps = m_registry.headerTextRegExps(spdxExpression);
0112     QString outputExpression = spdxExpression;
0113     outputExpression.replace('_', ' ');
0114     QString spdxOutputString = "SPDX-License-Identifier: " + outputExpression;
0115     QString newContent = fileContent;
0116 
0117     // replace by longest match
0118     QRegularExpression bestMatchingExpr = regexps.first();
0119     int bestCapturedLength = 0;
0120     for (auto regexp : regexps) {
0121         QRegularExpressionMatch match;
0122         if (newContent.contains(regexp, &match) && match.capturedLength() > bestCapturedLength) {
0123             bestMatchingExpr = regexp;
0124             bestCapturedLength = match.capturedLength();
0125         }
0126     }
0127     newContent.replace(bestMatchingExpr, spdxOutputString);
0128     return newContent;
0129 }
0130 
0131 LicenseRegistry::SpdxExpression DirectoryParser::detectSpdxLicenseStatement(const QString &fileContent) const
0132 {
0133     QRegularExpression regExp = spdxStatementRegExp();
0134     auto match = regExp.match(fileContent);
0135     if (match.hasMatch()) {
0136         // TODO this very simple solution only works for SPDX expressions in our database
0137         // should be made more general
0138         return match.captured("expression").replace(' ', '_');
0139     }
0140     return QString();
0141 }
0142 
0143 QVector<LicenseRegistry::SpdxExpression> DirectoryParser::pruneLicenseList(const QVector<LicenseRegistry::SpdxExpression> &inputLicenses) const
0144 {
0145     // TODO
0146     // - handle AND combinations
0147     // - handle complex OR combinations
0148     // - revisit operator preference order in SPDX and implement it here
0149 
0150     if (inputLicenses.length() == 1) {
0151         return inputLicenses;
0152     }
0153 
0154     auto licenses = inputLicenses;
0155     std::sort(licenses.begin(), licenses.end());
0156 
0157     // pruning step: remove duplicates
0158     licenses.erase(std::unique(licenses.begin(), licenses.end()), licenses.end());
0159 
0160     // pruning step: compute which licenses are supported with SPDX expression (splitting at "OR")
0161     // TODO this a very simple initial version and only works yet with simple license statements, not with multiple OR combinations
0162     QMap<LicenseRegistry::SpdxExpression, QVector<LicenseRegistry::SpdxExpression>> licenseClosure;
0163     for (const auto &license : qAsConst(licenses)) {
0164         QVector<LicenseRegistry::SpdxExpression> licenseChoice = license.split("_OR_").toVector();
0165 
0166         // remove all "WITH" statements
0167         for (int i = 0; i < licenseChoice.size(); ++i) {
0168             licenseChoice[i].remove(QRegularExpression("_WITH.*"));
0169         }
0170         licenseClosure[license] = licenseChoice;
0171     }
0172     QMutableVectorIterator<LicenseRegistry::SpdxExpression> iter(licenses);
0173     while (iter.hasNext()) {
0174         bool licensedContainedInClosure {false};
0175         LicenseRegistry::SpdxExpression expression = iter.next();
0176         for (auto iter = licenseClosure.begin(); iter != licenseClosure.end(); ++iter) {
0177             if (expression != iter.key() && iter.value().contains(expression)) {
0178                 licensedContainedInClosure = true;
0179             }
0180         }
0181         if (licensedContainedInClosure) {
0182             iter.remove();
0183         }
0184     }
0185     return licenses;
0186 }
0187 
0188 QVector<LicenseRegistry::SpdxExpression> DirectoryParser::detectLicenses(const QString &fileContent) const
0189 {
0190     switch (m_parserType) {
0191     case DirectoryParser::LicenseParser::REGEXP_PARSER:
0192         return detectLicensesRegexpParser(fileContent);
0193     case DirectoryParser::LicenseParser::SKIP_PARSER:
0194         return detectLicensesSkipParser(fileContent);
0195     }
0196     return {};
0197 }
0198 
0199 QVector<LicenseRegistry::SpdxExpression> DirectoryParser::detectLicensesSkipParser(const QString &fileContent) const
0200 {
0201     SkipParser parser;
0202     QVector<LicenseRegistry::SpdxExpression> testExpressions = m_registry.expressions();
0203     QVector<LicenseRegistry::SpdxExpression> detectedLicenses;
0204     for (auto expression : testExpressions) {
0205         auto match = parser.findMatch(fileContent, m_registry.headerTexts(expression));
0206         if (match) {
0207             detectedLicenses << expression;
0208         }
0209     }
0210     LicenseRegistry::SpdxExpression spdxStatement = detectSpdxLicenseStatement(fileContent);
0211     if (!spdxStatement.isEmpty()) {
0212         detectedLicenses << spdxStatement;
0213     }
0214     return detectedLicenses;
0215 }
0216 
0217 QVector<LicenseRegistry::SpdxExpression> DirectoryParser::detectLicensesRegexpParser(const QString &fileContent) const
0218 {
0219     QVector<LicenseRegistry::SpdxExpression> testExpressions = m_registry.expressions();
0220     QVector<LicenseRegistry::SpdxExpression> detectedLicenses;
0221     for (auto expression : testExpressions) {
0222         auto regexps = m_registry.headerTextRegExps(expression);
0223         for (auto regexp : regexps) {
0224             if (fileContent.contains(regexp)) {
0225                 detectedLicenses << expression;
0226             }
0227         }
0228     }
0229     LicenseRegistry::SpdxExpression spdxStatement = detectSpdxLicenseStatement(fileContent);
0230     if (!spdxStatement.isEmpty()) {
0231         detectedLicenses << spdxStatement;
0232     }
0233     return detectedLicenses;
0234 }
0235 
0236 QMap<QString, LicenseRegistry::SpdxExpression> DirectoryParser::parseAll(const QString &directory, bool convertMode, const QString &ignorePattern) const
0237 {
0238     QVector<LicenseRegistry::SpdxExpression> expressions = m_registry.expressions();
0239     QMap<QString, LicenseRegistry::SpdxExpression> results;
0240 
0241     if (convertMode) {
0242         qInfo() << "Running parser in CONVERT mode: every found license will be replaced with SPDX identifiers";
0243     }
0244 
0245     QStringList missingLicenseHeaderBlacklist;
0246     {
0247         QFile file(":/annotations/missing-headers-blacklist.txt");
0248         file.open(QIODevice::ReadOnly);
0249         QTextStream in(&file);
0250         QString line;
0251         while (in.readLineInto(&line)) {
0252             missingLicenseHeaderBlacklist.append(line);
0253         }
0254     }
0255     QStringList missingLicenseHeaderGeneratedFileBlacklist;
0256     {
0257         QFile file(":/annotations/generated-files.txt");
0258         file.open(QIODevice::ReadOnly);
0259         QTextStream in(&file);
0260         QString line;
0261         while (in.readLineInto(&line)) {
0262             missingLicenseHeaderGeneratedFileBlacklist.append(line);
0263         }
0264     }
0265 
0266     QRegularExpression ignoreFile(ignorePattern);
0267 
0268     QDirIterator iterator(directory, QDirIterator::Subdirectories);
0269     while (iterator.hasNext()) {
0270         QFile file(iterator.next());
0271         if (shallIgnoreFile(iterator, ignoreFile)) {
0272             continue;
0273         }
0274 
0275         bool skip = true;
0276         for (const auto &ending : DirectoryParser::s_supportedExtensions) {
0277             if (file.fileName().endsWith(ending)) {
0278                 skip = false;
0279                 break;
0280             }
0281         }
0282         if (skip) {
0283             continue;
0284         }
0285 
0286         file.open(QIODevice::ReadOnly);
0287         const QString fileContent = file.readAll();
0288         file.close();
0289 
0290         //        qDebug() << "checking:" << iterator.fileInfo();
0291         QVector<LicenseRegistry::SpdxExpression> licenses = detectLicenses(fileContent);
0292         licenses = pruneLicenseList(licenses);
0293 
0294         if (licenses.count() == 1) {
0295             results.insert(iterator.fileInfo().filePath(), licenses.first());
0296             //            qDebug() << "---> " << iterator.fileInfo().filePath() << identifier;
0297         } else if (licenses.count() > 1) {
0298             qCritical() << "UNHANDLED MULTI-LICENSE CASE" << iterator.fileInfo().filePath() << "-->" << licenses;
0299             results[iterator.fileInfo().filePath()] = LicenseRegistry::AmbigiousLicense;
0300         } else {
0301             // if nothing matches, report error
0302             results.insert(iterator.fileInfo().filePath(), LicenseRegistry::UnknownLicense);
0303 
0304             // check for blacklisted file because of missing license header only when no license was detected
0305             for (auto backlistPath : missingLicenseHeaderBlacklist) {
0306                 if (iterator.fileInfo().filePath().endsWith(backlistPath)) {
0307                     results.insert(iterator.fileInfo().filePath(), LicenseRegistry::MissingLicense);
0308                     break;
0309                 }
0310             }
0311             for (auto backlistPath : missingLicenseHeaderGeneratedFileBlacklist) {
0312                 if (iterator.fileInfo().filePath().endsWith(backlistPath)) {
0313                     results.insert(iterator.fileInfo().filePath(), LicenseRegistry::MissingLicenseForGeneratedFile);
0314                     break;
0315                 }
0316             }
0317         }
0318 
0319         const QString expression = results.value(iterator.fileInfo().filePath());
0320         if (convertMode && !m_registry.isFakeLicenseMarker(expression)) {
0321             QString newContent = replaceHeaderText(fileContent, expression);
0322             // qDebug() << newContent;
0323             file.open(QIODevice::WriteOnly);
0324             file.write(newContent.toUtf8());
0325             file.close();
0326         }
0327     }
0328 
0329     if (convertMode) {
0330         // compute needed licenses
0331         QSet<QString> identifiers;
0332         for (const auto &expression : results.values()) {
0333             auto expressionSplit = expression.split('_');
0334             for (const auto &identifier : expressionSplit) {
0335                 // remove SPDX syntax attributes
0336                 if (identifier == "OR" || identifier == "AND" || identifier == "WITH") {
0337                     continue;
0338                 }
0339                 // remove special placeholders
0340                 if (m_registry.isFakeLicenseMarker(identifier)) {
0341                     continue;
0342                 }
0343                 identifiers.insert(identifier);
0344             }
0345         }
0346         // create licenses directory and put license files therein
0347         QString licenseDir = directory + "/LICENSES/";
0348         QDir().mkdir(licenseDir);
0349         const auto licenseFiles = m_registry.licenseFiles();
0350         for (const auto &identifier : identifiers) {
0351             qDebug() << "Deploy license file" << identifier << licenseFiles.value(identifier);
0352             QFile::copy(licenseFiles.value(identifier), licenseDir + identifier + ".txt");
0353         }
0354     }
0355 
0356     return results;
0357 }
0358 
0359 void DirectoryParser::convertCopyright(const QString &directory, ConvertOptions options, const QString &ignorePattern) const
0360 {
0361     QRegularExpression ignoreFile(ignorePattern);
0362 
0363     QDirIterator iterator(directory, QDirIterator::Subdirectories);
0364     while (iterator.hasNext()) {
0365         QFile file(iterator.next());
0366 
0367         qInfo() << "Processing file:" << file.fileName();
0368 
0369         if (shallIgnoreFile(iterator, ignoreFile)) {
0370             qInfo() << "\tAsked to be ignored, skipping.";
0371             continue;
0372         }
0373         bool skip = true;
0374         for (const auto &ending : DirectoryParser::s_supportedExtensions) {
0375             if (file.fileName().endsWith(ending)) {
0376                 skip = false;
0377                 break;
0378             }
0379         }
0380         if (skip) {
0381             qInfo() << "\tUnsupported extension, skipping.";
0382             continue;
0383         }
0384 
0385         file.open(QIODevice::ReadOnly);
0386         QString content = file.readAll();
0387         file.close();
0388         if (options & ConvertOption::COPYRIGHT_TEXT) {
0389             content = unifyCopyrightStatements(content);
0390         }
0391         if (options & ConvertOption::PRETTY) {
0392             content = unifyCopyrightCommentHeader(content);
0393         }
0394         file.open(QIODevice::WriteOnly);
0395         file.write(content.toUtf8());
0396         file.close();
0397     }
0398 }