File indexing completed on 2024-05-19 05:42:01

0001 // ct_lvtclp_filesystemscanner.cpp                                   -*-C++-*-
0002 
0003 /*
0004 // Copyright 2023 Codethink Ltd <codethink@codethink.co.uk>
0005 // SPDX-License-Identifier: Apache-2.0
0006 //
0007 // Licensed under the Apache License, Version 2.0 (the "License");
0008 // you may not use this file except in compliance with the License.
0009 // You may obtain a copy of the License at
0010 //
0011 //     http://www.apache.org/licenses/LICENSE-2.0
0012 //
0013 // Unless required by applicable law or agreed to in writing, software
0014 // distributed under the License is distributed on an "AS IS" BASIS,
0015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0016 // See the License for the specific language governing permissions and
0017 // limitations under the License.
0018 */
0019 
#include <ct_lvtclp_filesystemscanner.h>

#include <ct_lvtmdb_componentobject.h>
#include <ct_lvtmdb_errorobject.h>
#include <ct_lvtmdb_fileobject.h>
#include <ct_lvtmdb_objectstore.h>
#include <ct_lvtmdb_packageobject.h>
#include <ct_lvtmdb_repositoryobject.h>
#include <ct_lvtmdb_util.h>

#include <ct_lvtshr_stringhelpers.h>

#include <ct_lvtclp_clputil.h>
#include <ct_lvtclp_componentutil.h>
#include <ct_lvtclp_fileutil.h>

#include <QDebug>
#include <QDir>

#include <llvm/Support/FileSystem.h>

#include <algorithm>
#include <cassert>
#include <system_error>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
0046 
0047 namespace {
0048 
// A package discovered during a filesystem scan, waiting to be written to the
// in-memory database. Members are initialised positionally elsewhere in this
// file, so their order must not change.
struct FoundPackage {
    std::string parent;
    // qualified name of the enclosing package (group); empty if there is none

    std::string qualifiedName;
    // qualified name of this package

    std::string filePath;
    // path recorded for the package; may be empty

    std::string repositoryName;
    // name of the repository the package was attributed to; may be empty
};
0055 
struct FoundThing {
    // something with a qualified name and a parent (referred to by qualified name)
    std::string parent;
    std::string qualifiedName;
    std::string filePath;
};

// Hash functor for FoundThing keyed on (parent, qualifiedName); filePath does
// not take part in the hash.
struct FoundThingHash {
    std::size_t operator()(FoundThing const& thing) const
    {
        // Hash each field on its own and mix the results. Hashing the
        // concatenation "parent + qualifiedName" would make ("ab", "c") and
        // ("a", "bc") collide by construction, and would allocate a temporary
        // string for every hash computation.
        const std::size_t parentHash = std::hash<std::string>{}(thing.parent);
        const std::size_t nameHash = std::hash<std::string>{}(thing.qualifiedName);

        // same mixing step as the well-known boost::hash_combine recipe
        return parentHash ^ (nameHash + static_cast<std::size_t>(0x9e3779b9U) + (parentHash << 6) + (parentHash >> 2));
    }
};
0069 
0070 } // namespace
0071 
0072 namespace Codethink::lvtclp {
0073 
0074 struct FilesystemScanner::Private {
0075     lvtmdb::ObjectStore& memDb;
0076     // Memory database
0077     std::filesystem::path prefix;
0078     // common prefix of paths
0079     const LvtCompilationDatabase& cdb;
0080     // compilation database used so that we don't add things deliberately
0081     // configured out of the build
0082     std::vector<std::filesystem::path> nonLakosianDirs;
0083 
0084     std::vector<FoundThing> foundFiles;
0085 
0086     // from comparing the files we found to the files in the database we can
0087     // figure out which files are new and which are deleted
0088     std::vector<FoundPackage> foundPkgs;
0089     std::unordered_set<std::string> foundPkgNames;
0090     std::unordered_map<std::string, PackageHelper> foundPkgGrps;
0091     std::vector<RepositoryHelper> foundRepositories;
0092 
0093     // compare with the set of packages in the database to see which packages
0094     // were added and removed
0095     // Package groups and packages are separated so we can add all of the
0096     // package groups first and then be sure we have all the parents in place
0097     // when adding packages
0098 
0099     std::function<void(const std::string&, long)> messageCallback;
0100     bool catchCodeAnalysisOutput;
0101 
0102     std::vector<llvm::GlobPattern> ignoreGlobs;
0103 
0104     explicit Private(lvtmdb::ObjectStore& memDb,
0105                      std::filesystem::path prefix,
0106                      const LvtCompilationDatabase& cdb,
0107                      std::function<void(const std::string, long)> messageCallback,
0108                      std::vector<std::filesystem::path> nonLakosianDirs,
0109                      bool catchCodeAnalysisOutput,
0110                      std::vector<llvm::GlobPattern> ignoreGlobs):
0111         memDb(memDb),
0112         prefix(std::move(prefix)),
0113         cdb(cdb),
0114         nonLakosianDirs(std::move(nonLakosianDirs)),
0115         messageCallback(std::move(messageCallback)),
0116         catchCodeAnalysisOutput(catchCodeAnalysisOutput),
0117         ignoreGlobs(std::move(ignoreGlobs))
0118     {
0119     }
0120 };
0121 
0122 FilesystemScanner::FilesystemScanner(lvtmdb::ObjectStore& memDb,
0123                                      const std::filesystem::path& prefix,
0124                                      const LvtCompilationDatabase& cdb,
0125                                      std::function<void(const std::string&, long)> messageCallback,
0126                                      bool catchCodeAnalysisOutput,
0127                                      std::vector<std::filesystem::path> nonLakosianDirs,
0128                                      std::vector<llvm::GlobPattern> ignoreGlobs):
0129     d(std::make_unique<FilesystemScanner::Private>(memDb,
0130                                                    prefix,
0131                                                    cdb,
0132                                                    std::move(messageCallback),
0133                                                    std::move(nonLakosianDirs),
0134                                                    catchCodeAnalysisOutput,
0135                                                    std::move(ignoreGlobs)))
0136 {
0137 }
0138 
0139 FilesystemScanner::~FilesystemScanner() noexcept = default;
0140 
0141 FilesystemScanner::IncrementalResult FilesystemScanner::scanCompilationDb()
0142 {
0143     for (const std::string& string : d->cdb.getAllFiles()) {
0144         const std::filesystem::path path(string);
0145         scanPath(path);
0146         scanHeader(path);
0147     }
0148 
0149     return addToDatabase();
0150 }
0151 
0152 void FilesystemScanner::scanHeader(const std::filesystem::path& path)
0153 {
0154     // the compilation database only contains .cpp files
0155     // but we need to discover changes to headers as well so we can calculate,
0156     // the incrmental update to-do list before doing a real physical scan (which
0157     // actually processes the #include directives in .cpp files). To always be
0158     // correct for non-lakosian code we would have to do a full (non-incremental)
0159     // physical scan to find all headers, then use all of these to calculate the
0160     // incremental update todo list. That would dramatically slow down tool usage
0161     // on lakosian code. Instead we will try to guess header paths here without
0162     // looking at #include directives.
0163     //
0164     // For lakosian style code, headers will be in the same directory as the
0165     // .cpp file, but lots of other code will put headers in another include
0166     // directory. We can discover include directories from the compiler arguments
0167     // in the compilation database.
0168     //
0169     // We will assume that the header file has the same name as the .cpp file
0170     // but with a header extension. Most non-lakosian code will follow this
0171     // convention anyway, and using components is the bare minimum of lakosian
0172     // design.
0173     static const std::vector<std::string> headerExtensions({".h", ".hh", ".h++", ".hpp"});
0174 
0175     const std::filesystem::path parent = std::filesystem::weakly_canonical(path.parent_path());
0176     const std::filesystem::path stem = path.stem();
0177 
0178     const std::vector<clang::tooling::CompileCommand> compileCommands = d->cdb.getCompileCommands(path.string());
0179 
0180     std::vector<std::filesystem::path> includeDirectories;
0181     includeDirectories.emplace_back(parent); // for lakosian code
0182 
0183     // collect include directories for this file
0184     for (const clang::tooling::CompileCommand& cmd : compileCommands) {
0185         for (const std::string& arg : cmd.CommandLine) {
0186             // check for arguments like -Isome/path
0187             if (arg.size() > 2 && arg[0] == '-' && arg[1] == 'I') {
0188                 // remove the -I
0189                 const std::string includeDirStr = arg.substr(2);
0190 
0191                 std::filesystem::path includeDir(includeDirStr);
0192                 if (includeDir.is_relative()) {
0193                     // the include path is relative to the source file e.g.
0194                     // -I../../groups/foo/foobar
0195                     includeDir = parent / includeDir;
0196                 }
0197 
0198                 if (std::filesystem::is_directory(includeDir)) {
0199                     includeDir = std::filesystem::canonical(includeDir);
0200 
0201                     auto it = std::find(includeDirectories.begin(), includeDirectories.end(), includeDir);
0202                     if (it == includeDirectories.end()) {
0203                         includeDirectories.emplace_back(std::move(includeDir));
0204                     }
0205                 }
0206             }
0207         }
0208     }
0209 
0210     // look in the include directories for a matching header
0211     for (const std::filesystem::path& includeDir : includeDirectories) {
0212         for (const std::string& ext : headerExtensions) {
0213             std::filesystem::path headerPath = (includeDir / stem).concat(ext);
0214             if (std::filesystem::exists(headerPath)) {
0215                 // we found the header!
0216                 scanPath(headerPath);
0217                 return;
0218             }
0219         }
0220     }
0221 }
0222 
0223 void FilesystemScanner::scanPath(const std::filesystem::path& path)
0224 {
0225     if (!std::filesystem::is_regular_file(path)) {
0226         return;
0227     }
0228 
0229     if (ClpUtil::isFileIgnored(path.filename().string(), d->ignoreGlobs)) {
0230         return;
0231     }
0232 
0233     auto addPkg = [&](std::string const& qualifiedName,
0234                       std::optional<std::string> parentQualifiedName = std::nullopt,
0235                       std::optional<std::string> repositoryName = std::nullopt,
0236                       std::optional<std::string> path = std::nullopt) {
0237         if (d->foundPkgNames.count(qualifiedName) > 0) {
0238             return;
0239         }
0240 
0241         if (repositoryName) {
0242             d->foundRepositories.emplace_back(RepositoryHelper{*repositoryName, ""});
0243         }
0244         if (parentQualifiedName) {
0245             if (d->foundPkgNames.count(*parentQualifiedName) == 0) {
0246                 d->foundPkgs.emplace_back(
0247                     FoundPackage{"", *parentQualifiedName, "", repositoryName ? *repositoryName : ""});
0248                 d->foundPkgNames.insert(*parentQualifiedName);
0249             }
0250         }
0251         d->foundPkgs.emplace_back(FoundPackage{parentQualifiedName ? *parentQualifiedName : "",
0252                                                qualifiedName,
0253                                                path ? *path : "",
0254                                                repositoryName ? *repositoryName : ""});
0255         d->foundPkgNames.insert(qualifiedName);
0256     };
0257     auto filePathQString = QString::fromStdString(path.string());
0258     auto fullFilePath = QDir::fromNativeSeparators(filePathQString).toStdString();
0259     for (auto&& semanticPackingRule : ClpUtil::getAllSemanticPackingRules()) {
0260         if (semanticPackingRule->accept(fullFilePath)) {
0261             auto pkg = semanticPackingRule->process(fullFilePath, addPkg);
0262             addSourceFile(path, pkg);
0263             return;
0264         }
0265     }
0266 
0267     if (ClpUtil::isComponentOnStandalonePackage(path)) {
0268         const auto pkgPath = path.parent_path();
0269         const auto pkg = addSourcePackage(pkgPath, "", true);
0270         addSourceFile((pkgPath / path.filename()).string(), pkg);
0271     } else if (ClpUtil::isComponentOnPackageGroup(path)) {
0272         const auto pkgPath = path.parent_path();
0273         const auto pkgGrpPath = pkgPath.parent_path();
0274         const auto pkgGrp = addSourcePackage(pkgGrpPath, "", false);
0275         const auto pkg = addSourcePackage(pkgPath, pkgGrp, false);
0276         addSourceFile((pkgPath / path.filename()).string(), pkg);
0277     } else {
0278         const static std::string nonLakosianGroup(ClpUtil::NON_LAKOSIAN_GROUP_NAME);
0279         if (!d->foundPkgGrps.count(nonLakosianGroup)) {
0280             d->foundPkgGrps[nonLakosianGroup] = PackageHelper{"", nonLakosianGroup, std::string{}};
0281         }
0282         const auto pkgPath = path.parent_path();
0283         const auto pkg = addSourcePackage(pkgPath, ClpUtil::NON_LAKOSIAN_GROUP_NAME, false);
0284         if (!pkg.empty()) {
0285             addSourceFile(path, pkg);
0286         } else {
0287             addSourceFile(path, nonLakosianGroup);
0288         }
0289     }
0290 }
0291 
0292 void FilesystemScanner::addSourceFile(const std::filesystem::path& path, const std::string& package)
0293 {
0294     d->foundFiles.push_back({package, path.string(), std::string{}});
0295 }
0296 
0297 std::string
0298 FilesystemScanner::addSourcePackage(const std::filesystem::path& path, const std::string& inParent, bool isStandalone)
0299 {
0300     std::string parent = inParent;
0301 
0302     if (!std::filesystem::is_directory(path)) {
0303         return {};
0304     }
0305 
0306     const std::filesystem::path normalisedPath = ClpUtil::normalisePath(path, d->prefix);
0307     if (normalisedPath.empty()) {
0308         return {};
0309     }
0310 
0311     std::filesystem::path fullPath;
0312     if (normalisedPath.is_relative()) {
0313         fullPath = d->prefix / normalisedPath;
0314     } else {
0315         fullPath = normalisedPath;
0316     }
0317 
0318     // check if this path is explicitly flagged as non-lakosian
0319     for (const std::filesystem::path& nonLakosianDir : d->nonLakosianDirs) {
0320         if (FileUtil::pathStartsWith(nonLakosianDir, fullPath)) {
0321             const static std::string nonLakosianGroup(ClpUtil::NON_LAKOSIAN_GROUP_NAME);
0322             d->foundPkgGrps[nonLakosianGroup] = PackageHelper{"", nonLakosianGroup, std::string{}};
0323             parent = nonLakosianGroup;
0324             break;
0325         }
0326     }
0327 
0328     std::string qualifiedName = normalisedPath.string();
0329     if (!isStandalone && parent.empty()) {
0330         if (!d->foundPkgNames.count(qualifiedName)) {
0331             auto filePath = QString::fromStdString(path.string());
0332             auto projectSource = QString::fromStdString(d->prefix.string());
0333             if (filePath.startsWith(projectSource)) {
0334                 filePath.replace(projectSource, "${SOURCE_DIR}/");
0335             }
0336             d->foundPkgGrps[qualifiedName] = PackageHelper{"", qualifiedName, filePath.toStdString()};
0337         }
0338     } else {
0339         if (d->foundPkgGrps.count(qualifiedName)) {
0340             // we already added this without a parent. Get rid of that because
0341             // we now have a parent
0342             d->foundPkgGrps.erase(qualifiedName);
0343         }
0344         auto filePath = QString::fromStdString(path.string());
0345         auto projectSource = QString::fromStdString(d->prefix.string());
0346         if (filePath.startsWith(projectSource)) {
0347             filePath.replace(projectSource, "${SOURCE_DIR}/");
0348         }
0349         d->foundPkgs.emplace_back(FoundPackage{parent, qualifiedName, filePath.toStdString(), ""});
0350         d->foundPkgNames.insert(std::move(qualifiedName));
0351     }
0352 
0353     return normalisedPath.string();
0354 }
0355 
0356 lvtmdb::PackageObject *FilesystemScanner::addPackage(IncrementalResult& out,
0357                                                      std::unordered_set<lvtmdb::PackageObject *>& existingPkgs,
0358                                                      const std::string& qualifiedName,
0359                                                      const std::string& parentName,
0360                                                      const std::string& filePath,
0361                                                      const std::string& repositoryName)
0362 {
0363     lvtmdb::PackageObject *pkg = d->memDb.getPackage(qualifiedName);
0364     if (pkg) {
0365         existingPkgs.insert(pkg);
0366         return pkg;
0367     }
0368 
0369     const std::filesystem::path path(qualifiedName);
0370     std::string name = path.filename().string();
0371 
0372     lvtmdb::PackageObject *parent = nullptr;
0373     if (!parentName.empty()) {
0374         // we can't recurse in the common case, otherwise existingPkgs
0375         // would get false positives
0376         parent = d->memDb.getPackage(parentName);
0377         if (!parent) {
0378             // assumes parent has no parent (it is a package group)
0379             parent = addPackage(out, existingPkgs, parentName, std::string{}, filePath, repositoryName);
0380         }
0381         assert(parent);
0382     }
0383 
0384     lvtmdb::RepositoryObject *repo = d->memDb.getOrAddRepository(repositoryName, "");
0385 
0386     out.newPkgs.push_back(qualifiedName);
0387     lvtmdb::PackageObject *thisPkg = d->memDb.getOrAddPackage(qualifiedName, std::move(name), filePath, parent, repo);
0388 
0389     if (repo) {
0390         repo->withRWLock([&] {
0391             repo->addChild(thisPkg);
0392         });
0393     }
0394 
0395     if (parent) {
0396         parent->withRWLock([&] {
0397             parent->addChild(thisPkg);
0398         });
0399     }
0400 
0401     return thisPkg;
0402 }
0403 
0404 FilesystemScanner::IncrementalResult FilesystemScanner::addToDatabase()
0405 {
0406     auto lock = d->memDb.rwLock();
0407 
0408     IncrementalResult out;
0409 
0410     // track what already existed so we can find deleted things
0411     const std::vector<lvtmdb::PackageObject *> allDbPkgs = d->memDb.getAllPackages();
0412     const std::vector<lvtmdb::FileObject *> allDbFiles = d->memDb.getAllFiles();
0413     std::unordered_set<lvtmdb::PackageObject *> existingPkgs;
0414     std::unordered_set<lvtmdb::FileObject *> existingFiles;
0415 
0416     // add repositories
0417     for (const auto& helper : d->foundRepositories) {
0418         (void) d->memDb.getOrAddRepository(helper.qualifiedName, helper.path);
0419     }
0420 
0421     // add package groups
0422     for (const auto& [name, helper] : d->foundPkgGrps) {
0423         addPackage(out, existingPkgs, name, std::string{}, helper.filePath, helper.parentRepositoryName);
0424     }
0425 
0426     // add packages
0427     for (auto&& pkg : d->foundPkgs) {
0428         addPackage(out, existingPkgs, pkg.qualifiedName, pkg.parent, pkg.filePath, pkg.repositoryName);
0429     }
0430 
0431     // add files
0432     for (const FoundThing& file : d->foundFiles) {
0433         lvtmdb::PackageObject *parent = d->memDb.getPackage(file.parent);
0434         assert(parent || file.parent.empty());
0435 
0436         const std::filesystem::path path = ClpUtil::normalisePath(file.qualifiedName, d->prefix).string();
0437 
0438         std::filesystem::path fullPath = d->prefix / path;
0439         auto hash = [&fullPath]() -> std::string {
0440             auto result = llvm::sys::fs::md5_contents(fullPath.string());
0441             if (result) {
0442                 return result.get().digest().str().str();
0443             }
0444 
0445             // allow failure to hash file contents because we use memory mapped files in tests
0446             return "";
0447         }();
0448 
0449         lvtmdb::FileObject *filePtr = d->memDb.getFile(path.string());
0450         if (!filePtr) {
0451             const FileType type = ClpUtil::categorisePath(path.string());
0452             bool isHeader;
0453             if (type == FileType::e_Header) {
0454                 isHeader = true;
0455             } else if (type == FileType::e_Source) {
0456                 isHeader = false;
0457             } else if (type == FileType::e_KnownUnknown) {
0458                 continue;
0459             } else { // type == FileType::e_UnknownUnknown
0460                 d->memDb.getOrAddError(lvtmdb::MdbUtil::ErrorKind::ParserError,
0461                                        "",
0462                                        "Unknown file extension",
0463                                        path.string());
0464                 continue;
0465             }
0466 
0467             // create or fetch the component for this file
0468             lvtmdb::ComponentObject *comp = ComponentUtil::addComponent(path, parent, d->memDb);
0469 
0470             filePtr =
0471                 d->memDb.getOrAddFile(path.string(), path.filename().string(), isHeader, std::move(hash), parent, comp);
0472             out.newFiles.push_back(path.string());
0473             comp->withRWLock([&] {
0474                 comp->addFile(filePtr);
0475             });
0476             parent->withRWLock([&] {
0477                 parent->addComponent(comp);
0478             });
0479 
0480         } else {
0481             existingFiles.insert(filePtr);
0482 
0483             // allowing !inCdb when it ended up in the database anyway
0484             // (for example via an #include)
0485 
0486             filePtr->withRWLock([&] {
0487                 if (hash != filePtr->hash()) {
0488                     if (d->catchCodeAnalysisOutput) {
0489                         qDebug() << "Found modified file " << path.string();
0490                     }
0491                     out.modifiedFiles.push_back(filePtr->qualifiedName());
0492                     filePtr->setHash(std::move(hash));
0493                 }
0494             });
0495         }
0496     }
0497 
0498     // which packages and package groups were deleted?
0499     for (lvtmdb::PackageObject *pkg : allDbPkgs) {
0500         std::string qualifiedName;
0501         pkg->withROLock([&] {
0502             qualifiedName = pkg->qualifiedName();
0503         });
0504         if (!d->cdb.containsPackage(qualifiedName)) {
0505             // that package isn't in the code database so we will never find it
0506             // in the scan anyway
0507             continue;
0508         }
0509         const auto it = existingPkgs.find(pkg);
0510         if (it == existingPkgs.end()) {
0511             if (d->catchCodeAnalysisOutput) {
0512                 qDebug() << "Package deleted: " << qualifiedName;
0513             }
0514             out.deletedPkgs.push_back(qualifiedName);
0515         }
0516     }
0517 
0518     // which files were deleted?
0519     for (lvtmdb::FileObject *file : allDbFiles) {
0520         std::string qualifiedName;
0521         file->withROLock([&] {
0522             qualifiedName = file->qualifiedName();
0523         });
0524         if (!d->cdb.containsFile(qualifiedName)) {
0525             // we never find system paths in existing files because this
0526             // filesystem visitor does not traverse them (causing us to always
0527             // think system files have been deleted)
0528             continue;
0529         }
0530 
0531         const auto it = existingFiles.find(file);
0532         if (it == existingFiles.end()) {
0533             file->withROLock([&] {
0534                 if (d->catchCodeAnalysisOutput) {
0535                     qDebug() << "File deleted: " << file->qualifiedName();
0536                 }
0537                 out.deletedFiles.push_back(file->qualifiedName());
0538             });
0539         }
0540     }
0541 
0542     // we've processed everything we added. Clear everything so we are ready for
0543     // the next scan
0544     d->foundFiles.clear();
0545     d->foundPkgs.clear();
0546     d->foundPkgGrps.clear();
0547 
0548     return out;
0549 }
0550 
0551 } // namespace Codethink::lvtclp