# Origin: git patch 7d8b624a71d454a155e852c2eacfd08f748d11a4
# From: Calum Grant, Fri, 27 Jan 2023 18:49:14 +0000
# Subject: [PATCH] Basic script to generate shared code metrics
# Adds misc/scripts/shared-code-metrics.py (new file, 329 lines)

# Generates a report on the amount of code sharing in this repo
#
# The purpose of this is
# a) To be able to understand the structure and dependencies
# b) To provide a metric that measures the amount of shared vs non-shared code

import datetime
import json
from pathlib import Path

# PyYAML is the only third-party dependency. A missing install is reported
# when the first qlpack.yml is parsed (see Package.__init__), so the path
# helpers and plain data classes below remain importable without it.
try:
    import yaml
except ImportError:
    yaml = None

# To add more languages, add them to this list:
languages = ['cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ql', 'ruby', 'swift']

# Three levels above this file (the script is added as misc/scripts/<name>.py)
repo_location = Path(__file__).parent.parent.parent


# Gets the total number of lines in a file.
# The file is streamed instead of being read into a list, and it is decoded
# as UTF-8 with replacement so that the count depends neither on the platform
# default encoding nor on a stray undecodable byte.
def linecount(file):
    with open(file, 'r', encoding='utf-8', errors='replace') as fp:
        return sum(1 for _ in fp)


# Gets the language name from the path: the first path component below
# `repo_location`. `path` must therefore be located under `repo_location`.
def get_language(path):
    return path.parts[len(repo_location.parts)]


# Is this path a CodeQL query file
def is_query(path):
    return path.suffix == '.ql'


# Is this path a CodeQL library file
def is_library(path):
    return path.suffix == '.qll'


# Is this path a relevant CodeQL file
def is_ql(path):
    return is_query(path) or is_library(path)


# Is this file a CodeQL package file
def is_package(path):
    return path.name == 'qlpack.yml'


# A CodeQL source file
class QlFile:
    def __init__(self, path):
        self.path = path
        self.lines = linecount(path)
        # Both attributes are read by isOnlyInLanguage() and Package.__init__,
        # so they must exist on every instance, including files that are
        # neither listed in identical-files.json nor located inside a package.
        self.shared = False
        self.package = None

    def language(self):
        return get_language(self.path)

    def query(self):
        return is_query(self.path)

    def library(self):
        return is_library(self.path)

    # Returns if this qlfile is not shared, and is in a pack that is only in one language
    def isOnlyInLanguage(self, language):
        return not self.shared and (self.package is None or self.package.languages == {language}) and self.language() == language


# Represents a language folder
class Language:
    def __init__(self, name):
        self.name = name
        self.packs = []
        self.nonshared_files = 0
        self.nonshared_lines = 0
        self.imported_files = 0
        self.imported_lines = 0

    # Counts `qlfile` towards the non-shared totals; shared files are ignored.
    def addQlFile(self, qlfile):
        if not qlfile.shared:
            self.nonshared_files += 1
            self.nonshared_lines += qlfile.lines

    # Adds the size of a shared package or identical-file set to the imported totals.
    def addSharedAsset(self, package):
        self.imported_files += package.files
        self.imported_lines += package.lines


# A shared package or file
class SharedAsset:
    def __init__(self, name):
        self.name = name


# A file shared using identical-files.json
class IdenticalFileSet(SharedAsset):
    def __init__(self, name, ql_files):
        super().__init__(name)
        self.languages = set()
        self.files = 0
        self.lines = 0
        for file in ql_files:
            file.package = self
            file.shared = True
            # The whole set is counted as a single file, sized by the last
            # member seen; an empty set keeps files == lines == 0.
            self.files = 1
            self.lines = file.lines
            self.languages.add(file.language())

    # Gets a pretty-printed markdown link
    def link(self):
        return self.name


# Represents all files shared in `identical-files.json`
# Reads the file and builds a list of assets
class IdenticalFiles:
    def __init__(self, repo_location, ql_file_index):
        identical_files = repo_location/'config'/'identical-files.json'
        with open(identical_files, "r", encoding='utf-8') as fp:
            identical_files_json = json.load(fp)
        # Create a list of assets, one per group; only the .ql/.qll members
        # of a group are tracked. A listed CodeQL file that is missing from
        # `ql_file_index` raises KeyError.
        self.assets = []
        for group in identical_files_json:
            paths = []
            for file in identical_files_json[group]:
                path = repo_location / file
                if is_ql(path):
                    ql_file_index[path].shared = True
                    paths.append(ql_file_index[path])
            self.assets.append(IdenticalFileSet(group, paths))


# A package created from a `qlpack.yml` file
class Package(SharedAsset):
    def __init__(self, path, ql_file_index):
        if yaml is None:
            raise ModuleNotFoundError(
                "PyYAML is required to read qlpack.yml files (pip install pyyaml)")
        self.path = path
        self.language = get_language(path)
        self.lines = 0
        self.files = 0
        self.languages = {self.language}
        self.identical_files_dependencies = set()
        with open(path, 'r', encoding='utf-8') as fp:
            # An empty qlpack.yml parses to None; treat it as an empty mapping.
            y = yaml.safe_load(fp) or {}
        # Fall back to the directory name when the pack does not name itself.
        self.name = y.get('name', path.parent.name)
        deps = y.get('dependencies')
        self.deps = {} if deps is None else deps
        # Mark all relevant files with their package
        for file_path, ql_file in ql_file_index.items():
            if not self.containsDirectory(file_path):
                continue
            if ql_file.shared:
                # Shared files stay owned by their identical-file set; the
                # package merely records that it uses that set.
                self.identical_files_dependencies.add(ql_file.package)
            else:
                ql_file.package = self
                self.lines += ql_file.lines
                self.files += 1
        # as_posix() keeps forward slashes in the URL on every platform.
        self.url = "https://github.com/github/codeql/blob/main/" + path.relative_to(repo_location).as_posix()

    # Gets a pretty-printed markdown link
    def link(self):
        return '[' + self.name + '](' + self.url + ')'

    # Holds if `dir` is located in (or below) the directory of this qlpack.yml,
    # compared component-wise so that sibling directories sharing a name
    # prefix do not match.
    def containsDirectory(self, dir):
        return self.path.parent.parts == dir.parts[:len(self.path.parent.parts)]

    # Constructs a list of transitive dependencies of this package.
    # `packageNameMap` maps package names to Package objects; a dependency
    # name that is missing from it raises KeyError.
    # Sets `transitive_dependencies` (names), `all_dependencies` (objects,
    # including identical-file sets), `total_imported_files` and
    # `total_imported_lines`, and registers this package's language on every
    # dependency.
    def calculateDependencies(self, packageNameMap):
        self.transitive_dependencies = set(self.deps)
        pending = list(self.deps)
        while pending:
            item = pending.pop()
            for dep2 in packageNameMap[item].deps:
                if dep2 not in self.transitive_dependencies:
                    self.transitive_dependencies.add(dep2)
                    pending.append(dep2)
        # Calculate the amount of imported code
        self.total_imported_files = 0
        self.total_imported_lines = 0
        self.all_dependencies = set(self.identical_files_dependencies)
        for dep in self.transitive_dependencies:
            self.all_dependencies.add(packageNameMap[dep])
        for dep in self.all_dependencies:
            self.total_imported_files += dep.files
            self.total_imported_lines += dep.lines
            dep.languages.add(self.language)

# Create a big index of all files and their line counts.

# Walks `root` and returns `(ql_file_index, package_files)`:
#  - ql_file_index: map from the path of every .ql/.qll file to its QlFile
#  - package_files: list of the paths of every qlpack.yml
# Directories are visited from an explicit stack, so arbitrarily deep trees
# do not hit the recursion limit.
def _scan_repository(root):
    ql_file_index = {}
    package_files = []

    # Stack of directories still to read
    directories_to_scan = [root]

    while directories_to_scan:
        current = directories_to_scan.pop()
        for p in current.iterdir():
            if p.is_dir():
                directories_to_scan.append(p)
            elif is_ql(p):
                ql_file_index[p] = QlFile(p)
            elif is_package(p):
                package_files.append(p)
    return ql_file_index, package_files


# Functions to output the results

# Prints a markdown table with one row per shared asset (its link, file count
# and line count) and one column per language, marked `yes` when the asset is
# used by that language.
def list_assets(shared_assets, language_info):
    print('| Asset | Files | Lines |' + ''.join(' ' + str(lang) + ' |' for lang in language_info))
    print('| ----- | ----- | ----- |' + ''.join(' ---- |' for _ in language_info))
    for asset in shared_assets:
        cells = ''.join('yes | ' if lang in asset.languages else ' | ' for lang in language_info)
        print('|', asset.link(), '|', asset.files, '|', asset.lines, '| ' + cells)
    print()


# Debugging aid: prints one package followed by each of its dependencies.
def list_package_dependencies(package):
    print("Package", package.path, package.name, package.files, package.lines, package.total_imported_files, package.total_imported_lines)
    for dep in package.all_dependencies:
        print(" ", dep.name, dep.files, dep.lines)


# Prints a markdown table with the own and imported code size of each package.
# The shared percentage is imported lines / (own + imported lines), or 0 for
# a package without any code.
def print_package_dependencies(packages):
    print('| Package name | Non-shared files | Non-shared lines of code | Imported files | Imported lines of code | Shared code % |')
    print('| ------------ | ---------------- | ------------------------ | -------------- | ---------------------- | ------------- |')
    for package in packages:
        nlines = package.lines + package.total_imported_lines
        shared_percentage = 100 * package.total_imported_lines / nlines if nlines > 0 else 0
        print('|', package.link(), '|', package.files, '|', package.lines, '|', package.total_imported_files, '|', package.total_imported_lines, '|',
              "%.2f" % shared_percentage, '|')
    print()


# Prints the package table restricted to packages whose name ends in `-all`
# and contains exactly one `-`.
def print_language_dependencies(packages):
    print_package_dependencies([p for p in packages if p.name.endswith('-all') and p.name.count('-') == 1])


def list_shared_code_by_language(language_info):
    # For each language directory, list the files that are (1) inside the directory and not shared,
    # (2) packages from outside the directory, plus identical files
    print('| Language | Non-shared files | Non-shared lines of code | Imported files | Imported lines of code | Shared code % |')
    print('| -------- | ---------------- | ------------------------ | -------------- | ---------------------- | ------------- |')
    for lang, info in language_info.items():
        total = info.imported_lines + info.nonshared_lines
        shared_percentage = 100 * info.imported_lines / total if total > 0 else 0
        print('|', lang, '|', info.nonshared_files, '|', info.nonshared_lines, '|', info.imported_files, '|', info.imported_lines, '|', "%.2f" % shared_percentage, '|')
    print()


# Explanatory text printed between the per-language table and the asset table.
_COLUMN_GLOSSARY = '''
* *Non-shared files*: The number of CodeQL files (`.ql`/`.qll`) that are only used within this language folder. Excludes `identical-files.json` that are shared between multiple languages.
* *Non-shared lines of code*: The number of lines of code in the non-shared files.
* *Imported files*: All CodeQL files (`.ql`/`.qll`) files that are transitively used in this language folder, either via packages or `identical-files.json`
* *Imported lines of code*: The number of lines of code in the imported files
* *Shared code %*: The proportion of imported lines / total lines (nonshared + imported).

## Shared packages use by language

A package is *used* if it is a direct or indirect dependency, or a file shared via `identical-files.json`.

'''


# Scans the repository, computes the sharing statistics and prints the
# markdown report to standard output.
def main():
    ql_file_index, package_files = _scan_repository(repo_location)

    # Create identical_files_json
    identical_files = IdenticalFiles(repo_location, ql_file_index)

    # Create packages
    # Do this after identical_files so that we can figure out the package sizes
    # Do this after getting the ql_file_index fully built
    packages = [Package(file, ql_file_index) for file in package_files]

    # List all shared assets
    shared_assets = packages + identical_files.assets

    # Construct statistics for each language
    language_info = {name: Language(name) for name in languages}

    for qlfile in ql_file_index.values():
        lang = qlfile.language()
        if lang in language_info and qlfile.isOnlyInLanguage(lang):
            language_info[lang].addQlFile(qlfile)

    # Determine all package dependencies
    packageNameMap = {package.name: package for package in packages}

    for package in packages:
        package.calculateDependencies(packageNameMap)

    # Only assets used by more than one language count as imported code.
    for asset in shared_assets:
        if len(asset.languages) > 1:
            for lang in asset.languages:
                if lang in language_info:
                    language_info[lang].addSharedAsset(asset)

    # Output reports

    print('# Report on CodeQL code sharing\n')
    print('Generated on', datetime.datetime.now())
    print()

    print('## Shared code by language\n')

    list_shared_code_by_language(language_info)

    print(_COLUMN_GLOSSARY)

    list_assets(shared_assets, language_info)

    print('## Shared code by language pack\n')

    print_language_dependencies(packages)

    print('## Shared code by package\n')

    print_package_dependencies(packages)


# Only produce the report when executed as a script, so that importing this
# file neither scans the repository nor prints anything.
if __name__ == "__main__":
    main()