File indexing completed on 2024-05-19 05:42:01
0001 // ct_lvtclp_filesystemscanner.cpp -*-C++-*- 0002 0003 /* 0004 // Copyright 2023 Codethink Ltd <codethink@codethink.co.uk> 0005 // SPDX-License-Identifier: Apache-2.0 0006 // 0007 // Licensed under the Apache License, Version 2.0 (the "License"); 0008 // you may not use this file except in compliance with the License. 0009 // You may obtain a copy of the License at 0010 // 0011 // http://www.apache.org/licenses/LICENSE-2.0 0012 // 0013 // Unless required by applicable law or agreed to in writing, software 0014 // distributed under the License is distributed on an "AS IS" BASIS, 0015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 0016 // See the License for the specific language governing permissions and 0017 // limitations under the License. 0018 */ 0019 0020 #include <ct_lvtclp_filesystemscanner.h> 0021 0022 #include <ct_lvtmdb_componentobject.h> 0023 #include <ct_lvtmdb_errorobject.h> 0024 #include <ct_lvtmdb_fileobject.h> 0025 #include <ct_lvtmdb_objectstore.h> 0026 #include <ct_lvtmdb_packageobject.h> 0027 #include <ct_lvtmdb_repositoryobject.h> 0028 #include <ct_lvtmdb_util.h> 0029 0030 #include <ct_lvtshr_stringhelpers.h> 0031 0032 #include <ct_lvtclp_clputil.h> 0033 #include <ct_lvtclp_componentutil.h> 0034 #include <ct_lvtclp_fileutil.h> 0035 0036 #include <QDebug> 0037 #include <QDir> 0038 0039 #include <llvm/Support/FileSystem.h> 0040 0041 #include <cassert> 0042 #include <unordered_map> 0043 #include <unordered_set> 0044 #include <utility> 0045 #include <vector> 0046 0047 namespace { 0048 0049 struct FoundPackage { 0050 std::string parent; 0051 std::string qualifiedName; 0052 std::string filePath; 0053 std::string repositoryName; 0054 }; 0055 0056 struct FoundThing { 0057 // something with a qualified name and a parent (referred to by qualified name) 0058 std::string parent; 0059 std::string qualifiedName; 0060 std::string filePath; 0061 }; 0062 0063 struct FoundThingHash { 0064 std::size_t operator()(FoundThing const& thing) const 0065 { 0066 return std::hash<std::string>{}(thing.parent + thing.qualifiedName); 0067 } 0068 }; 0069 0070 } // namespace 0071 0072 namespace Codethink::lvtclp { 0073 0074 struct FilesystemScanner::Private { 0075 lvtmdb::ObjectStore& memDb; 0076 // Memory database 0077 std::filesystem::path prefix; 0078 // common prefix of paths 0079 const LvtCompilationDatabase& cdb; 0080 // compilation database used so that we don't add things deliberately 0081 // configured out of the build 0082 std::vector<std::filesystem::path> nonLakosianDirs; 0083 0084 std::vector<FoundThing> foundFiles; 0085 0086 // from comparing the files we found to the files in the database we can 0087 // figure out which files are new and which are deleted 0088 std::vector<FoundPackage> foundPkgs; 0089 std::unordered_set<std::string> foundPkgNames; 0090 std::unordered_map<std::string, PackageHelper> foundPkgGrps; 0091 std::vector<RepositoryHelper> foundRepositories; 0092 0093 // compare with the set of packages in the database to see which packages 0094 // were added and removed 0095 // Package groups and packages are separated so we can add all of the 0096 // package groups first and then be sure we have all the parents in place 0097 // when adding packages 0098 0099 std::function<void(const std::string&, long)> messageCallback; 0100 bool catchCodeAnalysisOutput; 0101 0102 std::vector<llvm::GlobPattern> ignoreGlobs; 0103 0104 explicit Private(lvtmdb::ObjectStore& memDb, 0105 std::filesystem::path prefix, 0106 const LvtCompilationDatabase& cdb, 0107 std::function<void(const std::string, long)> messageCallback, 0108 std::vector<std::filesystem::path> nonLakosianDirs, 0109 bool catchCodeAnalysisOutput, 0110 std::vector<llvm::GlobPattern> ignoreGlobs): 0111 memDb(memDb), 0112 prefix(std::move(prefix)), 0113 cdb(cdb), 0114 nonLakosianDirs(std::move(nonLakosianDirs)), 0115 messageCallback(std::move(messageCallback)), 0116 catchCodeAnalysisOutput(catchCodeAnalysisOutput), 0117 ignoreGlobs(std::move(ignoreGlobs)) 0118 { 0119 } 0120 }; 0121 0122 FilesystemScanner::FilesystemScanner(lvtmdb::ObjectStore& memDb, 0123 const std::filesystem::path& prefix, 0124 const LvtCompilationDatabase& cdb, 0125 std::function<void(const std::string&, long)> messageCallback, 0126 bool catchCodeAnalysisOutput, 0127 std::vector<std::filesystem::path> nonLakosianDirs, 0128 std::vector<llvm::GlobPattern> ignoreGlobs): 0129 d(std::make_unique<FilesystemScanner::Private>(memDb, 0130 prefix, 0131 cdb, 0132 std::move(messageCallback), 0133 std::move(nonLakosianDirs), 0134 catchCodeAnalysisOutput, 0135 std::move(ignoreGlobs))) 0136 { 0137 } 0138 0139 FilesystemScanner::~FilesystemScanner() noexcept = default; 0140 0141 FilesystemScanner::IncrementalResult FilesystemScanner::scanCompilationDb() 0142 { 0143 for (const std::string& string : d->cdb.getAllFiles()) { 0144 const std::filesystem::path path(string); 0145 scanPath(path); 0146 scanHeader(path); 0147 } 0148 0149 return addToDatabase(); 0150 } 0151 0152 void FilesystemScanner::scanHeader(const std::filesystem::path& path) 0153 { 0154 // the compilation database only contains .cpp files 0155 // but we need to discover changes to headers as well so we can calculate, 0156 // the incrmental update to-do list before doing a real physical scan (which 0157 // actually processes the #include directives in .cpp files). To always be 0158 // correct for non-lakosian code we would have to do a full (non-incremental) 0159 // physical scan to find all headers, then use all of these to calculate the 0160 // incremental update todo list. That would dramatically slow down tool usage 0161 // on lakosian code. Instead we will try to guess header paths here without 0162 // looking at #include directives. 0163 // 0164 // For lakosian style code, headers will be in the same directory as the 0165 // .cpp file, but lots of other code will put headers in another include 0166 // directory. We can discover include directories from the compiler arguments 0167 // in the compilation database. 0168 // 0169 // We will assume that the header file has the same name as the .cpp file 0170 // but with a header extension. Most non-lakosian code will follow this 0171 // convention anyway, and using components is the bare minimum of lakosian 0172 // design. 0173 static const std::vector<std::string> headerExtensions({".h", ".hh", ".h++", ".hpp"}); 0174 0175 const std::filesystem::path parent = std::filesystem::weakly_canonical(path.parent_path()); 0176 const std::filesystem::path stem = path.stem(); 0177 0178 const std::vector<clang::tooling::CompileCommand> compileCommands = d->cdb.getCompileCommands(path.string()); 0179 0180 std::vector<std::filesystem::path> includeDirectories; 0181 includeDirectories.emplace_back(parent); // for lakosian code 0182 0183 // collect include directories for this file 0184 for (const clang::tooling::CompileCommand& cmd : compileCommands) { 0185 for (const std::string& arg : cmd.CommandLine) { 0186 // check for arguments like -Isome/path 0187 if (arg.size() > 2 && arg[0] == '-' && arg[1] == 'I') { 0188 // remove the -I 0189 const std::string includeDirStr = arg.substr(2); 0190 0191 std::filesystem::path includeDir(includeDirStr); 0192 if (includeDir.is_relative()) { 0193 // the include path is relative to the source file e.g. 0194 // -I../../groups/foo/foobar 0195 includeDir = parent / includeDir; 0196 } 0197 0198 if (std::filesystem::is_directory(includeDir)) { 0199 includeDir = std::filesystem::canonical(includeDir); 0200 0201 auto it = std::find(includeDirectories.begin(), includeDirectories.end(), includeDir); 0202 if (it == includeDirectories.end()) { 0203 includeDirectories.emplace_back(std::move(includeDir)); 0204 } 0205 } 0206 } 0207 } 0208 } 0209 0210 // look in the include directories for a matching header 0211 for (const std::filesystem::path& includeDir : includeDirectories) { 0212 for (const std::string& ext : headerExtensions) { 0213 std::filesystem::path headerPath = (includeDir / stem).concat(ext); 0214 if (std::filesystem::exists(headerPath)) { 0215 // we found the header! 0216 scanPath(headerPath); 0217 return; 0218 } 0219 } 0220 } 0221 } 0222 0223 void FilesystemScanner::scanPath(const std::filesystem::path& path) 0224 { 0225 if (!std::filesystem::is_regular_file(path)) { 0226 return; 0227 } 0228 0229 if (ClpUtil::isFileIgnored(path.filename().string(), d->ignoreGlobs)) { 0230 return; 0231 } 0232 0233 auto addPkg = [&](std::string const& qualifiedName, 0234 std::optional<std::string> parentQualifiedName = std::nullopt, 0235 std::optional<std::string> repositoryName = std::nullopt, 0236 std::optional<std::string> path = std::nullopt) { 0237 if (d->foundPkgNames.count(qualifiedName) > 0) { 0238 return; 0239 } 0240 0241 if (repositoryName) { 0242 d->foundRepositories.emplace_back(RepositoryHelper{*repositoryName, ""}); 0243 } 0244 if (parentQualifiedName) { 0245 if (d->foundPkgNames.count(*parentQualifiedName) == 0) { 0246 d->foundPkgs.emplace_back( 0247 FoundPackage{"", *parentQualifiedName, "", repositoryName ? *repositoryName : ""}); 0248 d->foundPkgNames.insert(*parentQualifiedName); 0249 } 0250 } 0251 d->foundPkgs.emplace_back(FoundPackage{parentQualifiedName ? *parentQualifiedName : "", 0252 qualifiedName, 0253 path ? *path : "", 0254 repositoryName ? *repositoryName : ""}); 0255 d->foundPkgNames.insert(qualifiedName); 0256 }; 0257 auto filePathQString = QString::fromStdString(path.string()); 0258 auto fullFilePath = QDir::fromNativeSeparators(filePathQString).toStdString(); 0259 for (auto&& semanticPackingRule : ClpUtil::getAllSemanticPackingRules()) { 0260 if (semanticPackingRule->accept(fullFilePath)) { 0261 auto pkg = semanticPackingRule->process(fullFilePath, addPkg); 0262 addSourceFile(path, pkg); 0263 return; 0264 } 0265 } 0266 0267 if (ClpUtil::isComponentOnStandalonePackage(path)) { 0268 const auto pkgPath = path.parent_path(); 0269 const auto pkg = addSourcePackage(pkgPath, "", true); 0270 addSourceFile((pkgPath / path.filename()).string(), pkg); 0271 } else if (ClpUtil::isComponentOnPackageGroup(path)) { 0272 const auto pkgPath = path.parent_path(); 0273 const auto pkgGrpPath = pkgPath.parent_path(); 0274 const auto pkgGrp = addSourcePackage(pkgGrpPath, "", false); 0275 const auto pkg = addSourcePackage(pkgPath, pkgGrp, false); 0276 addSourceFile((pkgPath / path.filename()).string(), pkg); 0277 } else { 0278 const static std::string nonLakosianGroup(ClpUtil::NON_LAKOSIAN_GROUP_NAME); 0279 if (!d->foundPkgGrps.count(nonLakosianGroup)) { 0280 d->foundPkgGrps[nonLakosianGroup] = PackageHelper{"", nonLakosianGroup, std::string{}}; 0281 } 0282 const auto pkgPath = path.parent_path(); 0283 const auto pkg = addSourcePackage(pkgPath, ClpUtil::NON_LAKOSIAN_GROUP_NAME, false); 0284 if (!pkg.empty()) { 0285 addSourceFile(path, pkg); 0286 } else { 0287 addSourceFile(path, nonLakosianGroup); 0288 } 0289 } 0290 } 0291 0292 void FilesystemScanner::addSourceFile(const std::filesystem::path& path, const std::string& package) 0293 { 0294 d->foundFiles.push_back({package, path.string(), std::string{}}); 0295 } 0296 0297 std::string 0298 FilesystemScanner::addSourcePackage(const std::filesystem::path& path, const std::string& inParent, bool isStandalone) 0299 { 0300 std::string parent = inParent; 0301 0302 if (!std::filesystem::is_directory(path)) { 0303 return {}; 0304 } 0305 0306 const std::filesystem::path normalisedPath = ClpUtil::normalisePath(path, d->prefix); 0307 if (normalisedPath.empty()) { 0308 return {}; 0309 } 0310 0311 std::filesystem::path fullPath; 0312 if (normalisedPath.is_relative()) { 0313 fullPath = d->prefix / normalisedPath; 0314 } else { 0315 fullPath = normalisedPath; 0316 } 0317 0318 // check if this path is explicitly flagged as non-lakosian 0319 for (const std::filesystem::path& nonLakosianDir : d->nonLakosianDirs) { 0320 if (FileUtil::pathStartsWith(nonLakosianDir, fullPath)) { 0321 const static std::string nonLakosianGroup(ClpUtil::NON_LAKOSIAN_GROUP_NAME); 0322 d->foundPkgGrps[nonLakosianGroup] = PackageHelper{"", nonLakosianGroup, std::string{}}; 0323 parent = nonLakosianGroup; 0324 break; 0325 } 0326 } 0327 0328 std::string qualifiedName = normalisedPath.string(); 0329 if (!isStandalone && parent.empty()) { 0330 if (!d->foundPkgNames.count(qualifiedName)) { 0331 auto filePath = QString::fromStdString(path.string()); 0332 auto projectSource = QString::fromStdString(d->prefix.string()); 0333 if (filePath.startsWith(projectSource)) { 0334 filePath.replace(projectSource, "${SOURCE_DIR}/"); 0335 } 0336 d->foundPkgGrps[qualifiedName] = PackageHelper{"", qualifiedName, filePath.toStdString()}; 0337 } 0338 } else { 0339 if (d->foundPkgGrps.count(qualifiedName)) { 0340 // we already added this without a parent. Get rid of that because 0341 // we now have a parent 0342 d->foundPkgGrps.erase(qualifiedName); 0343 } 0344 auto filePath = QString::fromStdString(path.string()); 0345 auto projectSource = QString::fromStdString(d->prefix.string()); 0346 if (filePath.startsWith(projectSource)) { 0347 filePath.replace(projectSource, "${SOURCE_DIR}/"); 0348 } 0349 d->foundPkgs.emplace_back(FoundPackage{parent, qualifiedName, filePath.toStdString(), ""}); 0350 d->foundPkgNames.insert(std::move(qualifiedName)); 0351 } 0352 0353 return normalisedPath.string(); 0354 } 0355 0356 lvtmdb::PackageObject *FilesystemScanner::addPackage(IncrementalResult& out, 0357 std::unordered_set<lvtmdb::PackageObject *>& existingPkgs, 0358 const std::string& qualifiedName, 0359 const std::string& parentName, 0360 const std::string& filePath, 0361 const std::string& repositoryName) 0362 { 0363 lvtmdb::PackageObject *pkg = d->memDb.getPackage(qualifiedName); 0364 if (pkg) { 0365 existingPkgs.insert(pkg); 0366 return pkg; 0367 } 0368 0369 const std::filesystem::path path(qualifiedName); 0370 std::string name = path.filename().string(); 0371 0372 lvtmdb::PackageObject *parent = nullptr; 0373 if (!parentName.empty()) { 0374 // we can't recurse in the common case, otherwise existingPkgs 0375 // would get false positives 0376 parent = d->memDb.getPackage(parentName); 0377 if (!parent) { 0378 // assumes parent has no parent (it is a package group) 0379 parent = addPackage(out, existingPkgs, parentName, std::string{}, filePath, repositoryName); 0380 } 0381 assert(parent); 0382 } 0383 0384 lvtmdb::RepositoryObject *repo = d->memDb.getOrAddRepository(repositoryName, ""); 0385 0386 out.newPkgs.push_back(qualifiedName); 0387 lvtmdb::PackageObject *thisPkg = d->memDb.getOrAddPackage(qualifiedName, std::move(name), filePath, parent, repo); 0388 0389 if (repo) { 0390 repo->withRWLock([&] { 0391 repo->addChild(thisPkg); 0392 }); 0393 } 0394 0395 if (parent) { 0396 parent->withRWLock([&] { 0397 parent->addChild(thisPkg); 0398 }); 0399 } 0400 0401 return thisPkg; 0402 } 0403 0404 FilesystemScanner::IncrementalResult FilesystemScanner::addToDatabase() 0405 { 0406 auto lock = d->memDb.rwLock(); 0407 0408 IncrementalResult out; 0409 0410 // track what already existed so we can find deleted things 0411 const std::vector<lvtmdb::PackageObject *> allDbPkgs = d->memDb.getAllPackages(); 0412 const std::vector<lvtmdb::FileObject *> allDbFiles = d->memDb.getAllFiles(); 0413 std::unordered_set<lvtmdb::PackageObject *> existingPkgs; 0414 std::unordered_set<lvtmdb::FileObject *> existingFiles; 0415 0416 // add repositories 0417 for (const auto& helper : d->foundRepositories) { 0418 (void) d->memDb.getOrAddRepository(helper.qualifiedName, helper.path); 0419 } 0420 0421 // add package groups 0422 for (const auto& [name, helper] : d->foundPkgGrps) { 0423 addPackage(out, existingPkgs, name, std::string{}, helper.filePath, helper.parentRepositoryName); 0424 } 0425 0426 // add packages 0427 for (auto&& pkg : d->foundPkgs) { 0428 addPackage(out, existingPkgs, pkg.qualifiedName, pkg.parent, pkg.filePath, pkg.repositoryName); 0429 } 0430 0431 // add files 0432 for (const FoundThing& file : d->foundFiles) { 0433 lvtmdb::PackageObject *parent = d->memDb.getPackage(file.parent); 0434 assert(parent || file.parent.empty()); 0435 0436 const std::filesystem::path path = ClpUtil::normalisePath(file.qualifiedName, d->prefix).string(); 0437 0438 std::filesystem::path fullPath = d->prefix / path; 0439 auto hash = [&fullPath]() -> std::string { 0440 auto result = llvm::sys::fs::md5_contents(fullPath.string()); 0441 if (result) { 0442 return result.get().digest().str().str(); 0443 } 0444 0445 // allow failure to hash file contents because we use memory mapped files in tests 0446 return ""; 0447 }(); 0448 0449 lvtmdb::FileObject *filePtr = d->memDb.getFile(path.string()); 0450 if (!filePtr) { 0451 const FileType type = ClpUtil::categorisePath(path.string()); 0452 bool isHeader; 0453 if (type == FileType::e_Header) { 0454 isHeader = true; 0455 } else if (type == FileType::e_Source) { 0456 isHeader = false; 0457 } else if (type == FileType::e_KnownUnknown) { 0458 continue; 0459 } else { // type == FileType::e_UnknownUnknown 0460 d->memDb.getOrAddError(lvtmdb::MdbUtil::ErrorKind::ParserError, 0461 "", 0462 "Unknown file extension", 0463 path.string()); 0464 continue; 0465 } 0466 0467 // create or fetch the component for this file 0468 lvtmdb::ComponentObject *comp = ComponentUtil::addComponent(path, parent, d->memDb); 0469 0470 filePtr = 0471 d->memDb.getOrAddFile(path.string(), path.filename().string(), isHeader, std::move(hash), parent, comp); 0472 out.newFiles.push_back(path.string()); 0473 comp->withRWLock([&] { 0474 comp->addFile(filePtr); 0475 }); 0476 parent->withRWLock([&] { 0477 parent->addComponent(comp); 0478 }); 0479 0480 } else { 0481 existingFiles.insert(filePtr); 0482 0483 // allowing !inCdb when it ended up in the database anyway 0484 // (for example via an #include) 0485 0486 filePtr->withRWLock([&] { 0487 if (hash != filePtr->hash()) { 0488 if (d->catchCodeAnalysisOutput) { 0489 qDebug() << "Found modified file " << path.string(); 0490 } 0491 out.modifiedFiles.push_back(filePtr->qualifiedName()); 0492 filePtr->setHash(std::move(hash)); 0493 } 0494 }); 0495 } 0496 } 0497 0498 // which packages and package groups were deleted? 0499 for (lvtmdb::PackageObject *pkg : allDbPkgs) { 0500 std::string qualifiedName; 0501 pkg->withROLock([&] { 0502 qualifiedName = pkg->qualifiedName(); 0503 }); 0504 if (!d->cdb.containsPackage(qualifiedName)) { 0505 // that package isn't in the code database so we will never find it 0506 // in the scan anyway 0507 continue; 0508 } 0509 const auto it = existingPkgs.find(pkg); 0510 if (it == existingPkgs.end()) { 0511 if (d->catchCodeAnalysisOutput) { 0512 qDebug() << "Package deleted: " << qualifiedName; 0513 } 0514 out.deletedPkgs.push_back(qualifiedName); 0515 } 0516 } 0517 0518 // which files were deleted? 0519 for (lvtmdb::FileObject *file : allDbFiles) { 0520 std::string qualifiedName; 0521 file->withROLock([&] { 0522 qualifiedName = file->qualifiedName(); 0523 }); 0524 if (!d->cdb.containsFile(qualifiedName)) { 0525 // we never find system paths in existing files because this 0526 // filesystem visitor does not traverse them (causing us to always 0527 // think system files have been deleted) 0528 continue; 0529 } 0530 0531 const auto it = existingFiles.find(file); 0532 if (it == existingFiles.end()) { 0533 file->withROLock([&] { 0534 if (d->catchCodeAnalysisOutput) { 0535 qDebug() << "File deleted: " << file->qualifiedName(); 0536 } 0537 out.deletedFiles.push_back(file->qualifiedName()); 0538 }); 0539 } 0540 } 0541 0542 // we've processed everything we added. Clear everything so we are ready for 0543 // the next scan 0544 d->foundFiles.clear(); 0545 d->foundPkgs.clear(); 0546 d->foundPkgGrps.clear(); 0547 0548 return out; 0549 } 0550 0551 } // namespace Codethink::lvtclp