Inject function names and descriptions into docs (#1129)
* Inject function names and descriptions into docs
This commit is contained in:
Родитель
7653acdb6f
Коммит
c806d2e730
|
@ -7,10 +7,12 @@ import re
|
|||
import shutil
|
||||
|
||||
from bigquery_etl.util import standard_args
|
||||
import yaml
|
||||
|
||||
DEFAULT_PROJECTS = ["mozfun/"]
|
||||
DOCS_FILE = "README.md"
|
||||
UDF_FILE = "udf.sql"
|
||||
METADATA_FILE = "metadata.yaml"
|
||||
DOCS_DIR = "docs/"
|
||||
INDEX_MD = "index.md"
|
||||
SQL_REF_RE = r"@sql\((.+)\)"
|
||||
|
@ -48,7 +50,7 @@ def load_with_examples(file):
|
|||
for sql_ref in re.findall(SQL_REF_RE, file_content):
|
||||
sql_example_file = path / Path(sql_ref)
|
||||
with open(sql_example_file) as example_sql:
|
||||
md_sql = f"```sql\n{example_sql.read()}\n```"
|
||||
md_sql = f"```sql\n{example_sql.read().strip()}\n```"
|
||||
file_content = file_content.replace(f"@sql({sql_ref})", md_sql)
|
||||
|
||||
return file_content
|
||||
|
@ -73,7 +75,6 @@ def main():
|
|||
if DOCS_FILE in files:
|
||||
# copy doc file to output and replace example references
|
||||
src = os.path.join(root, DOCS_FILE)
|
||||
|
||||
# remove empty strings from path parts
|
||||
path_parts = list(filter(None, root.split(os.sep)))
|
||||
name = path_parts[-1]
|
||||
|
@ -86,14 +87,30 @@ def main():
|
|||
dest = project_doc_dir / "overview.md"
|
||||
dest.write_text(load_with_examples(src))
|
||||
else:
|
||||
description = None
|
||||
if METADATA_FILE in files:
|
||||
with open(os.path.join(root, METADATA_FILE)) as stream:
|
||||
try:
|
||||
description = yaml.safe_load(stream).get(
|
||||
"description", None
|
||||
)
|
||||
except yaml.YAMLError:
|
||||
pass
|
||||
# dataset or UDF level doc file
|
||||
if UDF_FILE in files:
|
||||
# UDF-level doc; append to dataset doc
|
||||
dataset_name = os.path.basename(path)
|
||||
dataset_doc = out_dir / path.parent / f"{dataset_name}.md"
|
||||
docfile_content = load_with_examples(src)
|
||||
with open(dataset_doc, "a") as dataset_doc_file:
|
||||
dataset_doc_file.write("\n\n")
|
||||
dataset_doc_file.write(load_with_examples(src))
|
||||
# Inject a level-2 header with the UDF name
|
||||
dataset_doc_file.write(f"## {name}\n\n")
|
||||
# Inject the "description" from metadata.yaml
|
||||
if description:
|
||||
dataset_doc_file.write(f"{description}\n\n")
|
||||
# Inject the contents of the README.md
|
||||
dataset_doc_file.write(docfile_content)
|
||||
else:
|
||||
# dataset-level doc; create a new doc file
|
||||
dest = out_dir / path / f"{name}.md"
|
||||
|
|
|
@ -1 +1,17 @@
|
|||
# bits28
|
||||
# bits28
|
||||
|
||||
The `bits28` functions provide an API for working with "bit pattern" INT64
|
||||
fields, as used in the [`clients_last_seen` dataset](https://docs.telemetry.mozilla.org/datasets/bigquery/clients_last_seen/reference.html)
|
||||
for desktop Firefox and similar datasets for other applications.
|
||||
|
||||
A powerful feature of the `clients_last_seen` methodology is that it doesn't
|
||||
record specific metrics like MAU and WAU directly, but rather each row stores
|
||||
a history of the discrete days on which a client was active in the past 28 days.
|
||||
We could calculate active users in a 10 day or 25 day window just as efficiently
|
||||
as a 7 day (WAU) or 28 day (MAU) window. But we can also define completely new
|
||||
metrics based on these usage histories, such as various retention definitions.
|
||||
|
||||
The usage history is encoded as a "bit pattern" where the physical
|
||||
type of the field is a BigQuery INT64, but logically the integer
|
||||
represents an array of bits, with each 1 indicating a day where the given client
|
||||
was active and each 0 indicating a day where the client was inactive.
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
@sql(../examples/bits28_days_since_seen.sql)
|
|
@ -0,0 +1 @@
|
|||
@sql(../examples/extract_week_specific_bits.sql)
|
|
@ -0,0 +1 @@
|
|||
@sql(../examples/string_bit_patterns.sql)
|
|
@ -0,0 +1,5 @@
|
|||
SELECT
|
||||
mozfun.hist.extract(
|
||||
'{"bucket_count":3,"histogram_type":4,"sum":1,"range":[1,2],"values":{"0":1,"1":0}}'
|
||||
).sum
|
||||
-- 1
|
|
@ -0,0 +1,3 @@
|
|||
SELECT
|
||||
mozfun.hist.extract('5').sum
|
||||
-- 5
|
|
@ -0,0 +1,20 @@
|
|||
We support a variety of compact encodings as well as the classic JSON
|
||||
representation as sent in main pings.
|
||||
|
||||
The built-in BigQuery JSON parsing functions are not powerful enough to handle
|
||||
all the logic here, so we resort to some string processing. This function could
|
||||
behave unexpectedly on poorly-formatted histogram JSON, but we expect that
|
||||
payload validation in the data pipeline should ensure that histograms are well
|
||||
formed, which gives us some flexibility.
|
||||
|
||||
For more on desktop telemetry histogram structure, see:
|
||||
|
||||
- https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/collection/histograms.html
|
||||
|
||||
The compact encodings were originally proposed in:
|
||||
|
||||
- https://docs.google.com/document/d/1k_ji_1DB6htgtXnPpMpa7gX0klm-DGV5NMY7KkvVB00/edit#
|
||||
|
||||
@sql(../examples/full_hist_extract.sql)
|
||||
|
||||
@sql(../examples/use_counter_hist_extract.sql)
|
|
@ -1,18 +1,2 @@
|
|||
friendly_name: Extract Histogram
|
||||
description: >-
|
||||
Return a parsed struct from a string-encoded histogram.
|
||||
|
||||
We support a variety of compact encodings as well as the classic JSON
|
||||
representation as sent in main pings.
|
||||
|
||||
The built-in BigQuery JSON parsing functions are not powerful enough to handle
|
||||
all the logic here, so we resort to some string processing. This function could
|
||||
behave unexpectedly on poorly-formatted histogram JSON, but we expect that
|
||||
payload validation in the data pipeline should ensure that histograms are well
|
||||
formed, which gives us some flexibility.
|
||||
|
||||
For more on desktop telemetry histogram structure, see:
|
||||
https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/collection/histograms.html
|
||||
|
||||
The compact encodings were originally proposed in:
|
||||
https://docs.google.com/document/d/1k_ji_1DB6htgtXnPpMpa7gX0klm-DGV5NMY7KkvVB00/edit#
|
||||
description: Return a parsed struct from a string-encoded histogram.
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
## get_key
|
|
@ -1,3 +1 @@
|
|||
# udf1
|
||||
|
||||
@sql(../examples/example1.sql)
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
# udf2
|
|
@ -20,7 +20,7 @@ class TestGenerateDocs:
|
|||
)
|
||||
result = load_with_examples(str(input)).strip()
|
||||
|
||||
assert result == "# udf1\n\n```sql\nSELECT\n *\nFROM\n test\n\n```"
|
||||
assert result == "```sql\nSELECT\n *\nFROM\n test\n```"
|
||||
|
||||
def test_load_with_examples_dataset(self):
|
||||
input = (
|
||||
|
@ -33,7 +33,7 @@ class TestGenerateDocs:
|
|||
)
|
||||
result = load_with_examples(str(input)).strip()
|
||||
|
||||
assert result == "# test_dataset1\n\n```sql\nSELECT\n *\nFROM\n test\n\n```"
|
||||
assert result == "# test_dataset1\n\n```sql\nSELECT\n *\nFROM\n test\n```"
|
||||
|
||||
def test_load_with_missing_example(self, tmp_path):
|
||||
file_path = tmp_path / "ds" / "udf"
|
||||
|
|
Загрузка…
Ссылка в новой задаче