DENG-1193 Deprecate generated dataset docs (#4657)
Parent: 2a576e03d4
Commit: 7a80984757
@@ -10,9 +10,6 @@ from pathlib import Path
 import click
 
 from bigquery_etl.config import ConfigLoader
-from bigquery_etl.docs.derived_datasets.generate_derived_dataset_docs import (
-    generate_derived_dataset_docs,
-)
 from bigquery_etl.dryrun import DryRun
 
 EXAMPLE_DIR = "examples"
@@ -89,8 +86,6 @@ def generate(project_dirs, docs_dir, output_dir, log_level):
     # generate docs
     for project_dir in project_dirs:
         generate_udf_docs(out_dir, project_dir)
-        if "mozfun" not in project_dir:
-            generate_derived_dataset_docs(out_dir, project_dir)
 
 
 @docs_.command("validate", help="Validate the project docs.")
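Note: after this change, the docs `generate` command only produces UDF docs. For reference, a minimal sketch of the removed wiring, assuming a pre-deprecation checkout; `out_dir` and the project paths below are hypothetical placeholders:

# Sketch of the removed behavior; paths are hypothetical placeholders.
from bigquery_etl.docs.derived_datasets.generate_derived_dataset_docs import (
    generate_derived_dataset_docs,
)

out_dir = "generated_docs"  # hypothetical output directory
for project_dir in ["sql/mozfun", "sql/moz-fx-data-shared-prod"]:  # hypothetical
    # mozfun hosts UDFs only, so dataset docs were skipped for it
    if "mozfun" not in project_dir:
        generate_derived_dataset_docs(out_dir, project_dir)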
Deleted: bigquery_etl/docs/derived_datasets/generate_derived_dataset_docs.py
@@ -1,161 +0,0 @@
"""Generate documentation for derived datasets."""
import logging
from pathlib import Path

from jinja2 import Environment, FileSystemLoader

from bigquery_etl.config import ConfigLoader
from bigquery_etl.dependency import extract_table_references
from bigquery_etl.metadata.parse_metadata import DatasetMetadata, Metadata
from bigquery_etl.schema import Schema

logging.basicConfig(format="%(levelname)s (%(filename)s:%(lineno)d) - %(message)s")

VIEW_FILE = "view.sql"
METADATA_FILE = "metadata.yaml"
SCHEMA_FILE = "schema.yaml"
DATASET_METADATA_FILE = "dataset_metadata.yaml"
README_FILE = "README.md"


def _get_metadata(path, metadata_filename=METADATA_FILE):
    metadata_path = path / metadata_filename
    try:
        if metadata_filename == METADATA_FILE:
            metadata = Metadata.from_file(metadata_path)
            return metadata
        elif metadata_filename == DATASET_METADATA_FILE:
            metadata = DatasetMetadata.from_file(metadata_path)
            return metadata
        else:
            raise Exception(f"Invalid metadata filename provided - {metadata_filename}")
    except FileNotFoundError:
        logging.warning(f"Metadata not found at {str(metadata_path)}")


def _get_readme_content(path):
    readme_file = path / README_FILE
    if readme_file.exists():
        return readme_file.read_text()


def _get_referenced_tables_from_view(table_path):
    referenced_tables = []
    view_file = table_path / VIEW_FILE
    if view_file.exists():
        for referenced_table in extract_table_references(view_file.read_text()):
            table_split = referenced_table.split(".")
            if len(table_split) == 2:
                # missing project ID, retrieve from file path
                [dataset_id, table_id] = table_split
                project_id = view_file.parent.parent.parent.name
            elif len(table_split) == 3:
                [project_id, dataset_id, table_id] = table_split
            else:
                continue

            referenced_tables.append(
                {
                    "project_id": project_id,
                    "dataset_id": dataset_id,
                    "table_id": table_id,
                }
            )
    return referenced_tables


def _get_schema(table_path):
    schema_path = table_path / SCHEMA_FILE
    try:
        schema = Schema.from_schema_file(schema_path)
        return schema.schema.get("fields")
    except Exception as e:
        logging.warning(f"Unable to open schema: {e}")


def _iter_table_markdown(table_paths, template):
    source_url = ConfigLoader.get("docs", "source_url")
    for table_path in table_paths:
        source_urls = {"Source Directory": f"{source_url}/{str(table_path)}"}

        referenced_tables = _get_referenced_tables_from_view(table_path)
        if referenced_tables:
            source_urls[
                "View Definition"
            ] = f"{source_url}/{str(table_path / VIEW_FILE)}"

        metadata = _get_metadata(table_path)
        if metadata:
            source_urls[
                "Metadata File"
            ] = f"{source_url}/{str(table_path / METADATA_FILE)}"

        readme_content = _get_readme_content(table_path)
        schema = _get_schema(table_path)

        output = template.render(
            metadata=metadata,
            readme_content=readme_content,
            schema=schema,
            table_name=table_path.name,
            qualified_table_name=f"{table_path.parent.name}.{table_path.name}",
            source_urls=source_urls,
            referenced_tables=referenced_tables,
            project_url=f"{source_url}/sql",
        )

        yield output


def generate_derived_dataset_docs(out_dir, project_dir):
    """Generate documentation for derived datasets."""
    output_path = Path(out_dir) / ConfigLoader.get(
        "default", "user_facing_project", fallback="mozdata"
    )
    project_path = Path(project_dir)

    # get a list of all user-facing datasets
    dataset_paths = sorted(
        [
            dataset_path
            for dataset_path in project_path.iterdir()
            if dataset_path.is_dir()
            and all(
                suffix not in str(dataset_path)
                for suffix in ConfigLoader.get(
                    "default", "non_user_facing_dataset_suffixes", fallback=[]
                )
            )
        ]
    )

    for dataset_path in dataset_paths:
        table_paths = sorted([path for path in dataset_path.iterdir() if path.is_dir()])

        file_loader = FileSystemLoader("bigquery_etl/docs/derived_datasets/templates")
        env = Environment(loader=file_loader)
        table_template = env.get_template("table.md")
        dataset_header_template = env.get_template("dataset_header.md")

        dataset_metadata = _get_metadata(
            dataset_path, metadata_filename=DATASET_METADATA_FILE
        )
        dataset_readme_content = _get_readme_content(dataset_path)

        with open(output_path / f"{dataset_path.name}.md", "w") as dataset_doc:
            # In the template, we manually set title to prevent Mkdocs from removing
            # underscores and capitalizing file names
            # https://github.com/mkdocs/mkdocs/issues/1915#issuecomment-561311801
            dataset_header = dataset_header_template.render(
                title=dataset_metadata.friendly_name
                if dataset_metadata
                else dataset_path.name,
                description=dataset_metadata.description if dataset_metadata else None,
                readme_content=dataset_readme_content,
                source_url=f"{ConfigLoader.get('docs', 'source_url')}/{str(dataset_path)}",
            )

            dataset_doc.write(dataset_header)
            dataset_doc.write(
                "".join(_iter_table_markdown(table_paths, table_template))
            )
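Note: the subtlest logic in the deleted module is the reference normalization in `_get_referenced_tables_from_view`: a two-part reference (`dataset.table`) takes its project ID from the view's file path, a three-part reference is split directly, and anything else is skipped. A self-contained sketch of that rule, with a hypothetical view path:

# Standalone sketch of the normalization rule above; inputs are hypothetical.
from pathlib import Path

def normalize(reference, view_file):
    parts = reference.split(".")
    if len(parts) == 2:
        # missing project ID: recover it from .../<project>/<dataset>/<table>/view.sql
        dataset_id, table_id = parts
        project_id = view_file.parent.parent.parent.name
    elif len(parts) == 3:
        project_id, dataset_id, table_id = parts
    else:
        return None
    return {"project_id": project_id, "dataset_id": dataset_id, "table_id": table_id}

view = Path("sql/moz-fx-data-shared-prod/telemetry/example_view/view.sql")  # hypothetical
print(normalize("telemetry_derived.example_table_v1", view))
# {'project_id': 'moz-fx-data-shared-prod', 'dataset_id': 'telemetry_derived', 'table_id': 'example_table_v1'}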
Deleted: bigquery_etl/docs/derived_datasets/templates/dataset_header.md
@@ -1,8 +0,0 @@
---
title: {{ title }}
---
{{ description or ""}}
{{ readme_content or ""}}

[Source Directory]({{ source_url }})
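Note: a self-contained sketch of how this header was rendered, mirroring the `dataset_header_template.render(...)` call in the deleted module; the template source is copied from the hunk above, and all values are hypothetical:

# Inline render of the deleted dataset_header.md template; values are hypothetical.
from jinja2 import Template

DATASET_HEADER = """\
---
title: {{ title }}
---
{{ description or ""}}
{{ readme_content or ""}}

[Source Directory]({{ source_url }})
"""

print(Template(DATASET_HEADER).render(
    title="telemetry",  # hypothetical friendly name
    description="User-facing telemetry views.",  # hypothetical
    readme_content=None,
    source_url="https://example.test/sql/mozdata/telemetry",  # hypothetical
))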
Deleted: bigquery_etl/docs/derived_datasets/templates/table.md
@@ -1,72 +0,0 @@
## [{{ table_name }}](#{{ table_name }})

{% if metadata.friendly_name -%}
**{{ metadata.friendly_name }}**

{% endif -%}

`{{ qualified_table_name }}`

{{ metadata.description or "" | e }}

{% if metadata.labels -%}
{% if metadata.labels.schedule -%}
* Schedule: {{metadata.labels.schedule}}
{% endif -%}
{% endif -%}

{% if metadata.owners -%}
* Owners:
{% for email in metadata.owners -%}
{% filter indent(width=4) %}
- [{{email}}](mailto:{{email}})
{% endfilter %}
{%- endfor %}
{% endif %}

{{ readme_content or "" }}

{% if schema -%}

<table>
<caption>Schema</caption>
<tr>
<th>Column</th>
<th>Description</th>
<th>Type</th>
<th>Nullable</th>
</tr>
{% for field in schema -%}
<tr>
<td>{{ field.name }}</td>
<td>{{ field.description or "" }}</td>
<td>{{ field.type | capitalize }}</td>
<td>{{ 'Yes' if field.mode == 'NULLABLE' else 'No' }}</td>
</tr>
{%- endfor %}
</table>

{% endif %}

{% if referenced_tables -%}
<table>
<caption>Referenced Tables</caption>
<tr>
<th>Project</th>
<th>Dataset</th>
<th>Table</th>
</tr>
{% for table in referenced_tables -%}
<tr>
<td><a href={{ project_url + "/" + table.project_id }}>{{ table.project_id }}</a></td>
<td><a href={{ project_url + "/" + table.project_id + "/" + table.dataset_id }}>{{ table.dataset_id }}</a></td>
<td><a href={{ project_url + "/" + table.project_id + "/" + table.dataset_id + "/" + table.table_id }}>{{ table.table_id }}</a></td>
</tr>
{%- endfor %}
</table>
{% endif %}

{% for key, value in source_urls.items() -%} [{{key}}]({{ value }}) {{ " | " if not loop.last else "" }} {%- endfor %}

---
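Note: a hedged sketch of how this table template was rendered, mirroring the `template.render(...)` call in `_iter_table_markdown` from the deleted module; it assumes a pre-deprecation checkout so the template file still exists on disk, and every value below is a hypothetical placeholder:

# Render the deleted table.md template; assumes a pre-deprecation checkout.
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("bigquery_etl/docs/derived_datasets/templates"))
markdown = env.get_template("table.md").render(
    metadata={  # hypothetical stand-in for a parsed metadata.yaml
        "friendly_name": "Example Table",
        "description": "A hypothetical derived table.",
        "labels": {},
        "owners": ["example@mozilla.com"],
    },
    readme_content=None,
    schema=[  # hypothetical stand-in for a parsed schema.yaml
        {"name": "client_id", "description": "Client identifier", "type": "STRING", "mode": "NULLABLE"},
    ],
    table_name="example_table",
    qualified_table_name="telemetry.example_table",
    source_urls={"Source Directory": "https://example.test/sql/mozdata/telemetry/example_table"},  # hypothetical
    referenced_tables=[],
    project_url="https://example.test/sql",  # hypothetical
)
print(markdown)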
@@ -32,8 +32,6 @@ plugins:
   - awesome-pages
 nav:
   - index.md
-  - Datasets:
-      - ... | mozdata/**.md
   - UDFs:
       - ... | mozfun/**.md
   - ... | moz-fx-data-shared-prod/**.md
@@ -1,3 +0,0 @@
nav:
  - introduction.md
  - ...
@@ -1,2 +0,0 @@
`mozdata` is the main GCP project that includes datasets for user analysis as of February 2021. This project is filled with views that reference underlying tables in `moz-fx-data-shared-prod`. Users can use data tools like [STMO](https://sql.telemetry.mozilla.org/) to issue queries in this project.
For more context, see the [Projects, Datasets, and Tables in BigQuery](https://docs.telemetry.mozilla.org/cookbooks/bigquery/querying.html#projects-datasets-and-tables-in-bigquery) article on [docs.telemetry.mozilla.org](https://docs.telemetry.mozilla.org/).