Basic script to generate shared code metrics

This commit is contained in:
Calum Grant 2023-01-27 18:49:14 +00:00
Родитель f4cb920624
Коммит 7d8b624a71
1 изменённых файлов: 329 добавлений и 0 удалений

Просмотреть файл

@ -0,0 +1,329 @@
# Generates a report on the amount of code sharing in this repo
#
# The purpose of this is
# a) To be able to understand the structure and dependencies
# b) To provide a metric that measures the amount of shared vs non-shared code
import datetime
from pathlib import Path
import json
import yaml
# Names of the top-level folders that get their own row/column in the report.
# To add more languages, add them to this list:
languages = ['cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ql', 'ruby', 'swift']
# Directory three levels above this script. get_language() uses the number of
# components in this path to pick the "language" component out of a file path.
# NOTE(review): presumably this is the repository root - confirm against the
# location this script is checked in at.
repo_location = Path(__file__).parent.parent.parent
# Gets the total number of lines in a file
def linecount(file):
    """Return the total number of lines in a text file.

    Args:
        file: Path (``str`` or path-like) of the file to count.

    Returns:
        The number of lines; ``0`` for an empty file. A final line without a
        trailing newline still counts as a line.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding, and replace undecodable bytes instead of
    # letting a single stray byte abort the whole report. Replacement never
    # adds or removes line terminators, so the count is unaffected.
    with open(file, 'r', encoding='utf-8', errors='replace') as fp:
        # Stream the file instead of materialising every line in a list.
        return sum(1 for _ in fp)
# Gets the language name from the path
def get_language(path):
    """Return the component of *path* that sits directly below ``repo_location``.

    *path* is expected to start with the same components as ``repo_location``;
    the component immediately after that prefix is returned as the language
    name.
    """
    depth = len(repo_location.parts)
    return path.parts[depth]
# Is this path a CodeQL query file
def is_query(path):
    """Tell whether *path* carries the CodeQL query extension ``.ql``."""
    extension = path.suffix
    return extension == '.ql'
# Is this path a CodeQL library file
def is_library(path):
    """Tell whether *path* carries the CodeQL library extension ``.qll``."""
    extension = path.suffix
    return extension == '.qll'
# Is this path a relevant CodeQL file
def is_ql(path):
    """Tell whether *path* is a CodeQL source file: a query or a library."""
    # any() stops at the first check that succeeds, so the query test is
    # still tried before the library test.
    return any(check(path) for check in (is_query, is_library))
# Is this file a CodeQL package file
def is_package(path):
    """Tell whether *path* is a CodeQL pack definition, i.e. a file named ``qlpack.yml``."""
    filename = path.name
    return filename == 'qlpack.yml'
# A CodeQL source file
class QlFile:
    """A CodeQL source file (``.ql`` or ``.qll``) and its line count.

    Attributes:
        path: Location of the file.
        lines: Number of lines in the file, counted when the object is built.
        shared: True once the file has been claimed by an identical-files
            group; False otherwise.
        package: The asset that owns the file (a pack or an identical-files
            group), or None while no asset has claimed it.
    """

    # Defaults for the attributes that other parts of the script assign
    # later. Declaring both here guarantees that reading `shared` or
    # `package` on a file that nothing has claimed yields a value instead of
    # raising AttributeError.
    shared = False
    package = None

    def __init__(self, path):
        self.path = path
        self.lines = linecount(path)

    def language(self):
        """Return the language folder this file lives in."""
        return get_language(self.path)

    def query(self):
        """Return True if this file is a query (``.ql``)."""
        return is_query(self.path)

    def library(self):
        """Return True if this file is a library (``.qll``)."""
        return is_library(self.path)

    def isOnlyInLanguage(self, language):
        """Return True if this file belongs exclusively to *language*.

        That requires all of: the file is not shared, it has either no
        owning package or one whose language set is exactly ``{language}``,
        and the file itself lives in the *language* folder.
        """
        return (
            not self.shared
            and (self.package is None or self.package.languages == {language})
            and self.language() == language
        )
# Represents a language folder
class Language:
    """Running totals of non-shared and imported code for one language folder."""

    def __init__(self, name):
        self.name = name
        self.packs = []
        # Files (and their lines) counted through addQlFile()
        self.nonshared_files = self.nonshared_lines = 0
        # Files (and their lines) counted through addSharedAsset()
        self.imported_files = self.imported_lines = 0

    def addQlFile(self, qlfile):
        """Count *qlfile* towards the non-shared totals; shared files are ignored."""
        if qlfile.shared:
            return
        self.nonshared_files += 1
        self.nonshared_lines += qlfile.lines

    def addSharedAsset(self, package):
        """Add the file and line counts of *package* to the imported totals."""
        self.imported_files += package.files
        self.imported_lines += package.lines
# A shared package or file
class SharedAsset:
    """Base class for anything that can be shared: a pack or a group of identical files."""
    def __init__(self, name):
        # Display name of the asset
        self.name = name
# A file shared using identical-files.json
class IdenticalFileSet(SharedAsset):
    """One group of files from identical-files.json, treated as a single shared asset.

    Attributes:
        name: Name of the group.
        languages: Languages of all member files.
        files: 1 if the group has at least one member, otherwise 0.
        lines: Line count of the last member visited (0 for an empty group).
    """

    def __init__(self, name, ql_files):
        self.name = name
        self.languages = set()
        self.files = 0
        self.lines = 0
        for member in ql_files:
            # Claim the member for this group
            member.package = self
            member.shared = True
            # The group counts as one file however many members it has
            self.files, self.lines = 1, member.lines
            self.languages.add(member.language())

    def link(self):
        """Return the text shown for this asset in the markdown tables (just the name)."""
        return self.name
# Represents all files shared in `identical-files.json`
# Reads the file and builds a list of assets
class IdenticalFiles:
    """All file groups declared in ``config/identical-files.json``.

    Attributes:
        assets: One IdenticalFileSet per group in the JSON file, in file order.
    """

    def __init__(self, repo_location, ql_file_index):
        """Read the JSON file below *repo_location* and build the asset list.

        Every CodeQL file named in a group is looked up in *ql_file_index*
        and flagged as shared; entries that are not CodeQL files are skipped.
        A CodeQL path missing from *ql_file_index* raises KeyError.
        """
        config_path = repo_location/'config'/'identical-files.json'
        with open(config_path, "r") as fp:
            groups = json.load(fp)
        # Create a list of assets
        self.assets = []
        for group_name, members in groups.items():
            shared_files = []
            for relative in members:
                candidate = repo_location / relative
                if not is_ql(candidate):
                    continue
                entry = ql_file_index[candidate]
                entry.shared = True
                shared_files.append(entry)
            self.assets.append(IdenticalFileSet(group_name, shared_files))
# A package created from a `qlpack.yml` file
class Package(SharedAsset):
    """A CodeQL pack described by a ``qlpack.yml`` file.

    Attributes:
        path: Location of the ``qlpack.yml`` file.
        language: Language folder that contains the pack.
        languages: Languages that use the pack; starts as ``{language}`` and
            grows when calculateDependencies() runs on packs depending on it.
        name: The ``name`` entry of the pack file, or the name of the
            containing directory when the entry is absent.
        deps: The ``dependencies`` entry of the pack file (empty if absent).
        files, lines: Number and total size of the non-shared CodeQL files
            inside the pack directory.
        identical_files_dependencies: Owners of the shared files found inside
            the pack directory.
        url: Link to the pack file on github.com.
    """

    def __init__(self, path, ql_file_index):
        """Load the pack at *path* and claim the files it contains.

        Args:
            path: Path of the ``qlpack.yml`` file.
            ql_file_index: Mapping from file path to QlFile for every CodeQL
                file; files already flagged as shared keep their owner.
        """
        self.path = path
        self.language = get_language(path)
        self.lines = 0
        self.files = 0
        self.languages = {self.language}
        self.identical_files_dependencies = set()

        with open(path, 'r', encoding='utf-8') as fp:
            # safe_load() yields None for an empty document; fall back to an
            # empty mapping so the lookups below do not fail on it.
            y = yaml.safe_load(fp) or {}
        self.name = y.get('name', path.parent.name)
        # A bare `dependencies:` key parses to None; normalise it to empty.
        declared = y.get('dependencies')
        self.deps = {} if declared is None else declared

        # Mark all relevant files with their package
        for file_path, ql_file in ql_file_index.items():
            if not self.containsDirectory(file_path):
                continue
            if ql_file.shared:
                # Shared files stay with their owner; the pack merely uses it
                self.identical_files_dependencies.add(ql_file.package)
            else:
                ql_file.package = self
                self.lines += ql_file.lines
                self.files += 1

        # as_posix() keeps forward slashes in the URL on every platform;
        # str() of a path would produce backslashes on Windows.
        self.url = ("https://github.com/github/codeql/blob/main/"
                    + path.relative_to(repo_location).as_posix())

    def link(self):
        """Return a markdown link to the pack file, labelled with the pack name."""
        return '[' + self.name + '](' + self.url + ')'

    def containsDirectory(self, dir):
        """Return True if *dir* lies inside the directory holding this pack's ``qlpack.yml``.

        The comparison is by whole path components.
        """
        root_parts = self.path.parent.parts
        return root_parts == dir.parts[:len(root_parts)]

    def calculateDependencies(self, packageNameMap):
        """Compute the transitive dependencies of this pack and the code they import.

        Sets ``transitive_dependencies`` (pack names), ``all_dependencies``
        (pack objects plus owners of shared files inside this pack),
        ``total_imported_files`` and ``total_imported_lines``, and records
        this pack's language on every dependency.

        Args:
            packageNameMap: Mapping from pack name to Package.

        Raises:
            KeyError: If a dependency name is missing from *packageNameMap*.
        """
        # Walk the dependency graph, visiting each pack name once
        self.transitive_dependencies = set(self.deps)
        pending = list(self.deps)
        while pending:
            current = pending.pop()
            for dep_name in packageNameMap[current].deps:
                if dep_name not in self.transitive_dependencies:
                    self.transitive_dependencies.add(dep_name)
                    pending.append(dep_name)

        self.all_dependencies = set(self.identical_files_dependencies)
        self.all_dependencies.update(
            packageNameMap[dep_name] for dep_name in self.transitive_dependencies)

        # Calculate the amount of imported code
        self.total_imported_files = 0
        self.total_imported_lines = 0
        for dep in self.all_dependencies:
            self.total_imported_files += dep.files
            self.total_imported_lines += dep.lines
            # The dependency is now known to be used from this pack's language
            dep.languages.add(self.language)
# Create a big index of all CodeQL files.
# Map from path to the QlFile describing it (which carries the line count)
ql_file_index = {}
# Paths of every qlpack.yml encountered during the scan
package_files = []
# Stack of directories still to be read (the most recently found is read next)
directories_to_scan = [repo_location]
while directories_to_scan:
    # Named current_dir so the builtin dir() is not shadowed at module level
    current_dir = directories_to_scan.pop()
    for entry in current_dir.iterdir():
        if entry.is_dir():
            directories_to_scan.append(entry)
        elif is_ql(entry):
            ql_file_index[entry] = QlFile(entry)
        elif is_package(entry):
            package_files.append(entry)
# Load the groups from identical-files.json first: this flags the shared
# files, which Package checks when it sizes each pack.
identical_files = IdenticalFiles(repo_location, ql_file_index)
# Create one Package per qlpack.yml, in discovery order.
# This needs ql_file_index to be complete and the shared files to be flagged.
packages = [Package(pack_path, ql_file_index) for pack_path in package_files]
# List all shared assets: packs first, then the identical-file groups
shared_assets = [*packages, *identical_files.assets]
# Construct statistics for each language.
# This runs before the package dependencies are calculated, so at this point
# every pack's `languages` set still holds only the pack's own language.
language_info = {name: Language(name) for name in languages}
for qlfile in ql_file_index.values():
    lang = qlfile.language()
    info = language_info.get(lang)
    # Files outside the configured language folders are not counted
    if info is not None and qlfile.isOnlyInLanguage(lang):
        info.addQlFile(qlfile)
# Determine all package dependencies
packageNameMap = {package.name: package for package in packages}
for package in packages:
    package.calculateDependencies(packageNameMap)
# Credit every asset used by more than one language to each configured
# language that uses it
for asset in shared_assets:
    if len(asset.languages) <= 1:
        continue
    for lang in asset.languages:
        if lang in language_info:
            language_info[lang].addSharedAsset(asset)
# Functions to output the results
def list_assets(shared_assets, language_info):
    """Print a markdown table showing which languages use each shared asset.

    Args:
        shared_assets: Assets to list; each provides ``link()``, ``files``,
            ``lines`` and ``languages``.
        language_info: Mapping whose keys are the language names; one column
            is printed per key, in iteration order.
    """
    names = list(language_info)
    # Header row and the separator row below it
    print('| Asset | Files | Lines |' + ''.join(' '.join(('', str(name), '|')) for name in names))
    print('| ----- | ----- | ----- |' + ' ---- |' * len(names))
    for asset in shared_assets:
        cells = ['|', asset.link(), '|', asset.files, '|', asset.lines, '|']
        cells.extend('yes |' if name in asset.languages else ' |' for name in names)
        # Each row ends with a single space before the newline
        print(*cells, end=' \n')
    print()
def list_package_dependencies(package):
    """Print a plain-text summary of *package*, then one line per entry of its ``all_dependencies``."""
    summary = (
        package.path,
        package.name,
        package.files,
        package.lines,
        package.total_imported_files,
        package.total_imported_lines,
    )
    print("Package", *summary)
    for dependency in package.all_dependencies:
        details = (dependency.name, dependency.files, dependency.lines)
        print(" ", *details)
def print_package_dependencies(packages):
    """Print a markdown table with own and imported code sizes for each package.

    The last column is imported lines as a percentage of own plus imported
    lines, shown as 0.00 when a package has no lines at all.
    """
    print('| Package name | Non-shared files | Non-shared lines of code | Imported files | Imported lines of code | Shared code % |')
    print('| ------------ | ---------------- | ------------------------ | -------------- | ---------------------- | ------------- |')
    for package in packages:
        nlines = package.lines + package.total_imported_lines
        if nlines > 0:
            shared_percentage = 100 * package.total_imported_lines / nlines
        else:
            shared_percentage = 0
        values = (
            package.link(),
            package.files,
            package.lines,
            package.total_imported_files,
            package.total_imported_lines,
            "%.2f" % shared_percentage,
        )
        # Interleave the values with the column separators
        row = ['|']
        for value in values:
            row += [value, '|']
        print(*row)
    print()
def print_language_dependencies(packages):
    """Print the package table restricted to packs whose name ends in ``-all`` and contains exactly one hyphen."""
    language_packs = []
    for pack in packages:
        if pack.name.endswith('-all') and pack.name.count('-') == 1:
            language_packs.append(pack)
    print_package_dependencies(language_packs)
def list_shared_code_by_language(language_info):
    """Print a markdown table of non-shared versus imported code per language.

    Args:
        language_info: Mapping from language name to an object providing
            ``nonshared_files``, ``nonshared_lines``, ``imported_files`` and
            ``imported_lines``.
    """
    # For each language directory, list the files that are (1) inside the directory and not shared,
    # (2) packages from outside the directory, plus identical files
    print('| Language | Non-shared files | Non-shared lines of code | Imported files | Imported lines of code | Shared code % |')
    print('| -------- | ---------------- | ------------------------ | -------------- | ---------------------- | ------------- |')
    for lang, info in language_info.items():
        total = info.imported_lines + info.nonshared_lines
        if total > 0:
            shared_percentage = 100 * info.imported_lines / total
        else:
            # No code at all for this language: report 0 rather than divide by zero
            shared_percentage = 0
        values = (
            lang,
            info.nonshared_files,
            info.nonshared_lines,
            info.imported_files,
            info.imported_lines,
            "%.2f" % shared_percentage,
        )
        row = ['|']
        for value in values:
            row += [value, '|']
        print(*row)
    print()
# Output reports
# Everything below writes the report, formatted as markdown, to standard output.
print('# Report on CodeQL code sharing\n')
# Timestamp as returned by datetime.now(): local time with no timezone attached
print('Generated on', datetime.datetime.now())
print()
# Section 1: per-language totals, followed by a legend explaining the columns
# and the heading/introduction of the next section.
print('## Shared code by language\n')
list_shared_code_by_language(language_info)
print('''
* *Non-shared files*: The number of CodeQL files (`.ql`/`.qll`) that are only used within this language folder. Excludes `identical-files.json` that are shared between multiple languages.
* *Non-shared lines of code*: The number of lines of code in the non-shared files.
* *Imported files*: All CodeQL files (`.ql`/`.qll`) files that are transitively used in this language folder, either via packages or `identical-files.json`
* *Imported lines of code*: The number of lines of code in the imported files
* *Shared code %*: The proportion of imported lines / total lines (nonshared + imported).
## Shared packages use by language
A package is *used* if it is a direct or indirect dependency, or a file shared via `identical-files.json`.
''')
# Section 2: one row per shared asset (packs and identical-file groups),
# one column per language.
list_assets(shared_assets, language_info)
# Section 3: only the packs selected by print_language_dependencies()
print('## Shared code by language pack\n')
print_language_dependencies(packages)
# Section 4: every pack found in the repository
print('## Shared code by package\n')
print_package_dependencies(packages)