Derived dataset docs generation - refactor and support READMEs (#2559)

* Refactor derived dataset generation script - more pathlib functions

* Support View READMEs in generated docs
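Below is a minimal sketch of how the refactored entry point might be driven. The import path and directory arguments are illustrative assumptions (the module location is inferred from the `bigquery_etl/docs/derived_datasets/templates` loader path in this diff), not something this commit specifies:

```python
from pathlib import Path

# Assumed module location; inferred from this diff, not confirmed by it.
from bigquery_etl.docs.derived_datasets.generate_derived_dataset_docs import (
    generate_derived_dataset_docs,
)

out_dir = Path("generated_docs")  # hypothetical output root
# The script writes pages into <out_dir>/mozdata and expects it to exist.
(out_dir / "mozdata").mkdir(parents=True, exist_ok=True)

# "sql/mozdata" stands in for a project directory of user-facing datasets.
generate_derived_dataset_docs(out_dir=out_dir, project_dir="sql/mozdata")
```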
Alexander Nicholson 2021-12-08 19:31:40 -05:00 committed by GitHub
Parent 1a241bfde8
Commit d2ebd03755
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 104 additions and 77 deletions

View file

@@ -1,6 +1,5 @@
 """Generate documentation for derived datasets."""
-import os
 from pathlib import Path
 import yaml
@@ -10,6 +9,7 @@ from bigquery_etl.dependency import extract_table_references
 VIEW_FILE = "view.sql"
 METADATA_FILE = "metadata.yaml"
+README_FILE = "README.md"
 NON_USER_FACING_DATASET_SUFFIXES = (
     "_derived",
     "_external",
@@ -19,54 +19,29 @@ NON_USER_FACING_DATASET_SUFFIXES = (
 SOURCE_URL = "https://github.com/mozilla/bigquery-etl/blob/generated-sql"
-def generate_derived_dataset_docs(out_dir, project_dir):
-    """Generate documentation for derived datasets."""
-    project_doc_dir = Path(out_dir) / "mozdata"
-    # get a list of all user-facing datasets
-    data_sets = [
-        item
-        for item in os.listdir(project_dir)
-        if os.path.isdir(os.path.join(project_dir, item))
-        and all(name not in item for name in NON_USER_FACING_DATASET_SUFFIXES)
-    ]
-    for table in data_sets:
-        output = []
-        source_urls = {}
-        with open(project_doc_dir / f"{table}.md", "w") as dataset_doc:
-            # Manually set title to prevent Mkdocs from removing
-            # underscores and capitalizing file names
-            # https://github.com/mkdocs/mkdocs/issues/1915#issuecomment-561311801
-            dataset_doc.write(f"---\ntitle: {table}\n---\n\n")
-            for root, dirs, files in os.walk(Path(project_dir) / table):
-                # show views in an alphabetical order
-                dirs.sort()
-                if dirs:
-                    continue
-                dataset_name = root.split("/")[-1]
-                source_urls["Source Directory"] = f"{SOURCE_URL}/{root}"
-                referenced_tables = []
+def _get_metadata(table_path):
     metadata = {}
-                if METADATA_FILE in files:
-                    source_urls[
-                        "Metadata File"
-                    ] = f"{SOURCE_URL}/{root}/{METADATA_FILE}"
-                    with open(os.path.join(root, METADATA_FILE)) as stream:
+    metadata_file = table_path / METADATA_FILE
+    if metadata_file.exists():
+        with open(metadata_file) as stream:
             try:
                 metadata = yaml.safe_load(stream)
             except yaml.YAMLError as error:
                 print(error)
-                if VIEW_FILE in files:
-                    source_urls["View Definition"] = f"{SOURCE_URL}/{root}/{VIEW_FILE}"
-                    view_file = Path(os.path.join(root, VIEW_FILE))
-                    referenced_tables = []
+    return metadata
-                    for referenced_table in extract_table_references(
-                        view_file.read_text()
-                    ):
+
+
+def _get_readme_content(table_path):
+    readme_file = table_path / README_FILE
+    if readme_file.exists():
+        return readme_file.read_text()
+
+
+def _get_referenced_tables_from_view(table_path):
+    referenced_tables = []
+    view_file = table_path / VIEW_FILE
+    if view_file.exists():
+        for referenced_table in extract_table_references(view_file.read_text()):
             table_split = referenced_table.split(".")
             if len(table_split) == 2:
                 # missing project ID, retrieve from file path
@@ -84,20 +59,68 @@ def generate_derived_dataset_docs(out_dir, project_dir):
                     "table_id": table_id,
                 }
             )
+    return referenced_tables
-                file_loader = FileSystemLoader(
-                    "bigquery_etl/docs/derived_datasets/templates"
-                )
-                # Set up a new template environment
-                env = Environment(loader=file_loader)
-                # Create template with the markdown source text
-                template = env.get_template("table.md")
+
+
+def _iter_table_markdown(table_paths, template):
+    for table_path in table_paths:
+        source_urls = {"Source Directory": f"{SOURCE_URL}/{str(table_path)}"}
+        referenced_tables = _get_referenced_tables_from_view(table_path)
+        if referenced_tables:
+            source_urls[
+                "View Definition"
+            ] = f"{SOURCE_URL}/{str(table_path / VIEW_FILE)}"
+        metadata = _get_metadata(table_path)
+        if metadata:
+            source_urls[
+                "Metadata File"
+            ] = f"{SOURCE_URL}/{str(table_path / METADATA_FILE)}"
+        readme_content = _get_readme_content(table_path)
         output = template.render(
             metadata=metadata,
-                    table_name=dataset_name,
+            readme_content=readme_content,
+            table_name=table_path.name,
             source_urls=source_urls,
             referenced_tables=referenced_tables,
             project_url=f"{SOURCE_URL}/sql",
         )
-                dataset_doc.write(output)
+        yield output
+
+
+def generate_derived_dataset_docs(out_dir, project_dir):
+    """Generate documentation for derived datasets."""
+    output_path = Path(out_dir) / "mozdata"
+    project_path = Path(project_dir)
+    # get a list of all user-facing datasets
+    dataset_paths = sorted(
+        [
+            dataset_path
+            for dataset_path in project_path.iterdir()
+            if dataset_path.is_dir()
+            and all(
+                suffix not in str(dataset_path)
+                for suffix in NON_USER_FACING_DATASET_SUFFIXES
+            )
+        ]
+    )
+    for dataset_path in dataset_paths:
+        table_paths = sorted([path for path in dataset_path.iterdir() if path.is_dir()])
+        file_loader = FileSystemLoader("bigquery_etl/docs/derived_datasets/templates")
+        env = Environment(loader=file_loader)
+        template = env.get_template("table.md")
+        with open(output_path / f"{dataset_path.name}.md", "w") as dataset_doc:
+            # Manually set title to prevent Mkdocs from removing
+            # underscores and capitalizing file names
+            # https://github.com/mkdocs/mkdocs/issues/1915#issuecomment-561311801
+            dataset_doc.write(f"---\ntitle: {dataset_path.name}\n---\n\n")
+            dataset_doc.write("".join(_iter_table_markdown(table_paths, template)))

View file

@@ -21,7 +21,11 @@
 {%- endfor %}
 {% endif %}
+{% if readme_content -%}
+{{ readme_content }}
+{% endif %}
 {% if referenced_tables -%}
 <table>
 <caption>Referenced Tables</caption>
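One detail in the added block: the `-%}` whitespace-control marker trims the newline that follows the opening tag, so the README text starts flush rather than after a blank line. A quick check of that behavior in isolation:

```python
from jinja2 import Template

block = "{% if readme_content -%}\n{{ readme_content }}\n{% endif %}"

print(repr(Template(block).render(readme_content=None)))       # ''
print(repr(Template(block).render(readme_content="# Notes")))  # '# Notes\n'
```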