Derived dataset docs generation - refactor and support READMEs (#2559)
* Refactor derived dataset generation script - more pathlib functions * Support View READMEs in generated docs
This commit is contained in:
Родитель
1a241bfde8
Коммит
d2ebd03755
|
@ -1,6 +1,5 @@
|
|||
"""Generate documentation for derived datasets."""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
@ -10,6 +9,7 @@ from bigquery_etl.dependency import extract_table_references
|
|||
|
||||
VIEW_FILE = "view.sql"
|
||||
METADATA_FILE = "metadata.yaml"
|
||||
README_FILE = "README.md"
|
||||
NON_USER_FACING_DATASET_SUFFIXES = (
|
||||
"_derived",
|
||||
"_external",
|
||||
|
@ -19,54 +19,29 @@ NON_USER_FACING_DATASET_SUFFIXES = (
|
|||
SOURCE_URL = "https://github.com/mozilla/bigquery-etl/blob/generated-sql"
|
||||
|
||||
|
||||
def generate_derived_dataset_docs(out_dir, project_dir):
|
||||
"""Generate documentation for derived datasets."""
|
||||
project_doc_dir = Path(out_dir) / "mozdata"
|
||||
|
||||
# get a list of all user-facing datasets
|
||||
data_sets = [
|
||||
item
|
||||
for item in os.listdir(project_dir)
|
||||
if os.path.isdir(os.path.join(project_dir, item))
|
||||
and all(name not in item for name in NON_USER_FACING_DATASET_SUFFIXES)
|
||||
]
|
||||
|
||||
for table in data_sets:
|
||||
output = []
|
||||
source_urls = {}
|
||||
with open(project_doc_dir / f"{table}.md", "w") as dataset_doc:
|
||||
# Manually set title to prevent Mkdocs from removing
|
||||
# underscores and capitalizing file names
|
||||
# https://github.com/mkdocs/mkdocs/issues/1915#issuecomment-561311801
|
||||
dataset_doc.write(f"---\ntitle: {table}\n---\n\n")
|
||||
|
||||
for root, dirs, files in os.walk(Path(project_dir) / table):
|
||||
# show views in an alphabetical order
|
||||
dirs.sort()
|
||||
if dirs:
|
||||
continue
|
||||
dataset_name = root.split("/")[-1]
|
||||
source_urls["Source Directory"] = f"{SOURCE_URL}/{root}"
|
||||
referenced_tables = []
|
||||
|
||||
def _get_metadata(table_path):
|
||||
metadata = {}
|
||||
if METADATA_FILE in files:
|
||||
source_urls[
|
||||
"Metadata File"
|
||||
] = f"{SOURCE_URL}/{root}/{METADATA_FILE}"
|
||||
with open(os.path.join(root, METADATA_FILE)) as stream:
|
||||
metadata_file = table_path / METADATA_FILE
|
||||
if metadata_file.exists():
|
||||
with open(metadata_file) as stream:
|
||||
try:
|
||||
metadata = yaml.safe_load(stream)
|
||||
except yaml.YAMLError as error:
|
||||
print(error)
|
||||
if VIEW_FILE in files:
|
||||
source_urls["View Definition"] = f"{SOURCE_URL}/{root}/{VIEW_FILE}"
|
||||
view_file = Path(os.path.join(root, VIEW_FILE))
|
||||
referenced_tables = []
|
||||
return metadata
|
||||
|
||||
for referenced_table in extract_table_references(
|
||||
view_file.read_text()
|
||||
):
|
||||
|
||||
def _get_readme_content(table_path):
    """Return the text of the README in ``table_path``, or None if absent.

    ``table_path`` is a ``pathlib.Path`` pointing at a table/view directory;
    the README is looked up under the name given by ``README_FILE``.
    """
    readme_path = table_path / README_FILE
    if not readme_path.exists():
        # No README for this table: signal that with None so callers can
        # skip the README section.
        return None
    return readme_path.read_text()
|
||||
|
||||
|
||||
def _get_referenced_tables_from_view(table_path):
|
||||
referenced_tables = []
|
||||
view_file = table_path / VIEW_FILE
|
||||
if view_file.exists():
|
||||
for referenced_table in extract_table_references(view_file.read_text()):
|
||||
table_split = referenced_table.split(".")
|
||||
if len(table_split) == 2:
|
||||
# missing project ID, retrieve from file path
|
||||
|
@ -84,20 +59,68 @@ def generate_derived_dataset_docs(out_dir, project_dir):
|
|||
"table_id": table_id,
|
||||
}
|
||||
)
|
||||
return referenced_tables
|
||||
|
||||
file_loader = FileSystemLoader(
|
||||
"bigquery_etl/docs/derived_datasets/templates"
|
||||
)
|
||||
# Set up a new template environment
|
||||
env = Environment(loader=file_loader)
|
||||
# Create template with the markdown source text
|
||||
template = env.get_template("table.md")
|
||||
|
||||
def _iter_table_markdown(table_paths, template):
|
||||
for table_path in table_paths:
|
||||
source_urls = {"Source Directory": f"{SOURCE_URL}/{str(table_path)}"}
|
||||
|
||||
referenced_tables = _get_referenced_tables_from_view(table_path)
|
||||
if referenced_tables:
|
||||
source_urls[
|
||||
"View Definition"
|
||||
] = f"{SOURCE_URL}/{str(table_path / VIEW_FILE)}"
|
||||
|
||||
metadata = _get_metadata(table_path)
|
||||
if metadata:
|
||||
source_urls[
|
||||
"Metadata File"
|
||||
] = f"{SOURCE_URL}/{str(table_path / METADATA_FILE)}"
|
||||
|
||||
readme_content = _get_readme_content(table_path)
|
||||
|
||||
output = template.render(
|
||||
metadata=metadata,
|
||||
table_name=dataset_name,
|
||||
readme_content=readme_content,
|
||||
table_name=table_path.name,
|
||||
source_urls=source_urls,
|
||||
referenced_tables=referenced_tables,
|
||||
project_url=f"{SOURCE_URL}/sql",
|
||||
)
|
||||
dataset_doc.write(output)
|
||||
|
||||
yield output
|
||||
|
||||
|
||||
def generate_derived_dataset_docs(out_dir, project_dir):
    """Generate documentation for derived datasets.

    For every user-facing dataset directory directly under ``project_dir``,
    write ``<out_dir>/mozdata/<dataset name>.md``. Each file starts with a
    front-matter title block followed by the rendered ``table.md`` template
    for every table directory inside the dataset, in sorted order.

    A dataset is treated as non-user-facing (and skipped) when its directory
    name contains one of ``NON_USER_FACING_DATASET_SUFFIXES``.

    :param out_dir: base output directory; files go into its ``mozdata``
        subdirectory.
    :param project_dir: directory whose immediate subdirectories are datasets.
    """
    output_path = Path(out_dir) / "mozdata"
    project_path = Path(project_dir)

    # Get a sorted list of all user-facing datasets. Only the dataset
    # directory *name* is checked: matching against the full path would drop
    # every dataset whenever a parent directory happens to contain one of
    # the suffixes.
    dataset_paths = sorted(
        dataset_path
        for dataset_path in project_path.iterdir()
        if dataset_path.is_dir()
        and all(
            suffix not in dataset_path.name
            for suffix in NON_USER_FACING_DATASET_SUFFIXES
        )
    )

    # The template is identical for every dataset, so load it once instead
    # of rebuilding the loader and environment on each iteration.
    file_loader = FileSystemLoader("bigquery_etl/docs/derived_datasets/templates")
    env = Environment(loader=file_loader)
    template = env.get_template("table.md")

    for dataset_path in dataset_paths:
        table_paths = sorted(path for path in dataset_path.iterdir() if path.is_dir())

        with open(output_path / f"{dataset_path.name}.md", "w") as dataset_doc:
            # Manually set title to prevent Mkdocs from removing
            # underscores and capitalizing file names
            # https://github.com/mkdocs/mkdocs/issues/1915#issuecomment-561311801
            dataset_doc.write(f"---\ntitle: {dataset_path.name}\n---\n\n")

            dataset_doc.write("".join(_iter_table_markdown(table_paths, template)))
|
||||
|
|
|
@ -21,7 +21,11 @@
|
|||
{%- endfor %}
|
||||
{% endif %}
|
||||
|
||||
{% if readme_content -%}
|
||||
|
||||
{{ readme_content }}
|
||||
|
||||
{% endif %}
|
||||
{% if referenced_tables -%}
|
||||
<table>
|
||||
<caption>Referenced Tables</caption>
|
||||
|
|
Загрузка…
Ссылка в новой задаче