Support dataset READMEs and metadata, cosmetic changes (#2579)

This commit is contained in:
Alexander Nicholson 2021-12-16 14:28:34 -05:00 коммит произвёл GitHub
Родитель 2a20483099
Коммит 73d6256799
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 57 добавлений и 27 удалений

Просмотреть файл

@@ -1,14 +1,17 @@
"""Generate documentation for derived datasets.""" """Generate documentation for derived datasets."""
import logging
from pathlib import Path from pathlib import Path
import yaml
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from bigquery_etl.dependency import extract_table_references from bigquery_etl.dependency import extract_table_references
from bigquery_etl.metadata.parse_metadata import DatasetMetadata, Metadata
logging.basicConfig(format="%(levelname)s (%(filename)s:%(lineno)d) - %(message)s")
VIEW_FILE = "view.sql" VIEW_FILE = "view.sql"
METADATA_FILE = "metadata.yaml" METADATA_FILE = "metadata.yaml"
DATASET_METADATA_FILE = "dataset_metadata.yaml"
README_FILE = "README.md" README_FILE = "README.md"
NON_USER_FACING_DATASET_SUFFIXES = ( NON_USER_FACING_DATASET_SUFFIXES = (
"_derived", "_derived",
@@ -19,20 +22,23 @@ NON_USER_FACING_DATASET_SUFFIXES = (
SOURCE_URL = "https://github.com/mozilla/bigquery-etl/blob/generated-sql" SOURCE_URL = "https://github.com/mozilla/bigquery-etl/blob/generated-sql"
def _get_metadata(table_path): def _get_metadata(path, metadata_filename=METADATA_FILE):
metadata = {} metadata_path = path / metadata_filename
metadata_file = table_path / METADATA_FILE try:
if metadata_file.exists(): if metadata_filename == METADATA_FILE:
with open(metadata_file) as stream: metadata = Metadata.from_file(metadata_path)
try: return metadata
metadata = yaml.safe_load(stream) elif metadata_filename == DATASET_METADATA_FILE:
except yaml.YAMLError as error: metadata = DatasetMetadata.from_file(metadata_path)
print(error) return metadata
return metadata else:
raise Exception(f"Invalid metadata filename provided - {metadata_filename}")
except FileNotFoundError:
logging.warning(f"Metadata not found at {str(metadata_path)}")
def _get_readme_content(table_path): def _get_readme_content(path):
readme_file = table_path / README_FILE readme_file = path / README_FILE
if readme_file.exists(): if readme_file.exists():
return readme_file.read_text() return readme_file.read_text()
@@ -84,6 +90,7 @@ def _iter_table_markdown(table_paths, template):
metadata=metadata, metadata=metadata,
readme_content=readme_content, readme_content=readme_content,
table_name=table_path.name, table_name=table_path.name,
qualified_table_name=f"{table_path.parent.name}.{table_path.name}",
source_urls=source_urls, source_urls=source_urls,
referenced_tables=referenced_tables, referenced_tables=referenced_tables,
project_url=f"{SOURCE_URL}/sql", project_url=f"{SOURCE_URL}/sql",
@@ -115,12 +122,28 @@ def generate_derived_dataset_docs(out_dir, project_dir):
file_loader = FileSystemLoader("bigquery_etl/docs/derived_datasets/templates") file_loader = FileSystemLoader("bigquery_etl/docs/derived_datasets/templates")
env = Environment(loader=file_loader) env = Environment(loader=file_loader)
template = env.get_template("table.md") table_template = env.get_template("table.md")
dataset_header_template = env.get_template("dataset_header.md")
dataset_metadata = _get_metadata(
dataset_path, metadata_filename=DATASET_METADATA_FILE
)
dataset_readme_content = _get_readme_content(dataset_path)
with open(output_path / f"{dataset_path.name}.md", "w") as dataset_doc: with open(output_path / f"{dataset_path.name}.md", "w") as dataset_doc:
# Manually set title to prevent Mkdocs from removing # In the template, we manually set title to prevent Mkdocs from removing
# underscores and capitalizing file names # underscores and capitalizing file names
# https://github.com/mkdocs/mkdocs/issues/1915#issuecomment-561311801 # https://github.com/mkdocs/mkdocs/issues/1915#issuecomment-561311801
dataset_doc.write(f"---\ntitle: {dataset_path.name}\n---\n\n") dataset_header = dataset_header_template.render(
title=dataset_metadata.friendly_name
if dataset_metadata
else dataset_path.name,
description=dataset_metadata.description if dataset_metadata else None,
readme_content=dataset_readme_content,
source_url=f"{SOURCE_URL}/{str(dataset_path)}",
)
dataset_doc.write("".join(_iter_table_markdown(table_paths, template))) dataset_doc.write(dataset_header)
dataset_doc.write(
"".join(_iter_table_markdown(table_paths, table_template))
)

Просмотреть файл

@@ -0,0 +1,8 @@
---
title: {{ title }}
---
{{ description or ""}}
{{ readme_content or ""}}
[Source Directory]({{ source_url }})

Просмотреть файл

@@ -1,11 +1,14 @@
## [{{ table_name }}](#{{ table_name }}) ## [{{ table_name }}](#{{ table_name }})
{{ metadata.description | e }}
{% if metadata.friendly_name -%} {% if metadata.friendly_name -%}
* Friendly name: {{metadata.friendly_name}} **{{ metadata.friendly_name }}**
{% endif -%} {% endif -%}
`{{ qualified_table_name }}`
{{ metadata.description | e }}
{% if metadata.labels -%} {% if metadata.labels -%}
{% if metadata.labels.schedule -%} {% if metadata.labels.schedule -%}
* Schedule: {{metadata.labels.schedule}} * Schedule: {{metadata.labels.schedule}}
@@ -21,11 +24,8 @@
{%- endfor %} {%- endfor %}
{% endif %} {% endif %}
{% if readme_content -%} {{ readme_content or "" }}
{{ readme_content }}
{% endif %}
{% if referenced_tables -%} {% if referenced_tables -%}
<table> <table>
<caption>Referenced Tables</caption> <caption>Referenced Tables</caption>

Просмотреть файл

@@ -1,4 +1,3 @@
Legacy UDFs ## Legacy UDFs
===
This directory contains compatibility functions for query migrations from Athena/Presto, and is named `udf_legacy` to discourage their ongoing use. This directory contains compatibility functions for query migrations from Athena/Presto, and is named `udf_legacy` to discourage their ongoing use.