Inject function names and descriptions into docs (#1129)

* Inject function names and descriptions into docs
This commit is contained in:
Jeff Klukas 2020-07-07 14:47:03 -04:00 коммит произвёл GitHub
Родитель 7653acdb6f
Коммит c806d2e730
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
18 изменённых файлов: 71 добавлений и 27 удалений

Просмотреть файл

@ -7,10 +7,12 @@ import re
import shutil
from bigquery_etl.util import standard_args
import yaml
DEFAULT_PROJECTS = ["mozfun/"]
DOCS_FILE = "README.md"
UDF_FILE = "udf.sql"
METADATA_FILE = "metadata.yaml"
DOCS_DIR = "docs/"
INDEX_MD = "index.md"
SQL_REF_RE = r"@sql\((.+)\)"
@ -48,7 +50,7 @@ def load_with_examples(file):
for sql_ref in re.findall(SQL_REF_RE, file_content):
sql_example_file = path / Path(sql_ref)
with open(sql_example_file) as example_sql:
md_sql = f"```sql\n{example_sql.read()}\n```"
md_sql = f"```sql\n{example_sql.read().strip()}\n```"
file_content = file_content.replace(f"@sql({sql_ref})", md_sql)
return file_content
@ -73,7 +75,6 @@ def main():
if DOCS_FILE in files:
# copy doc file to output and replace example references
src = os.path.join(root, DOCS_FILE)
# remove empty strings from path parts
path_parts = list(filter(None, root.split(os.sep)))
name = path_parts[-1]
@ -86,14 +87,30 @@ def main():
dest = project_doc_dir / "overview.md"
dest.write_text(load_with_examples(src))
else:
description = None
if METADATA_FILE in files:
with open(os.path.join(root, METADATA_FILE)) as stream:
try:
description = yaml.safe_load(stream).get(
"description", None
)
except yaml.YAMLError:
pass
# dataset or UDF level doc file
if UDF_FILE in files:
# UDF-level doc; append to dataset doc
dataset_name = os.path.basename(path)
dataset_doc = out_dir / path.parent / f"{dataset_name}.md"
docfile_content = load_with_examples(src)
with open(dataset_doc, "a") as dataset_doc_file:
dataset_doc_file.write("\n\n")
dataset_doc_file.write(load_with_examples(src))
# Inject a level-2 header with the UDF name
dataset_doc_file.write(f"## {name}\n\n")
# Inject the "description" from metadata.yaml
if description:
dataset_doc_file.write(f"{description}\n\n")
# Inject the contents of the README.md
dataset_doc_file.write(docfile_content)
else:
# dataset-level doc; create a new doc file
dest = out_dir / path / f"{name}.md"

Просмотреть файл

@ -1 +1,17 @@
# bits28
# bits28
The `bits28` functions provide an API for working with "bit pattern" INT64
fields, as used in the [`clients_last_seen` dataset](https://docs.telemetry.mozilla.org/datasets/bigquery/clients_last_seen/reference.html)
for desktop Firefox and similar datasets for other applications.
A powerful feature of the `clients_last_seen` methodology is that it doesn't
record specific metrics like MAU and WAU directly, but rather each row stores
a history of the discrete days on which a client was active in the past 28 days.
We could calculate active users in a 10-day or 25-day window just as efficiently
as a 7-day (WAU) or 28-day (MAU) window. But we can also define completely new
metrics based on these usage histories, such as various retention definitions.
The usage history is encoded as a "bit pattern" where the physical
type of the field is a BigQuery INT64, but logically the integer
represents an array of bits, with each 1 indicating a day where the given client
was active and each 0 indicating a day where the client was inactive.

Просмотреть файл

Просмотреть файл

@ -0,0 +1 @@
@sql(../examples/bits28_days_since_seen.sql)

Просмотреть файл

Просмотреть файл

@ -0,0 +1 @@
@sql(../examples/extract_week_specific_bits.sql)

Просмотреть файл

Просмотреть файл

Просмотреть файл

@ -0,0 +1 @@
@sql(../examples/string_bit_patterns.sql)

Просмотреть файл

@ -0,0 +1,5 @@
SELECT
mozfun.hist.extract(
'{"bucket_count":3,"histogram_type":4,"sum":1,"range":[1,2],"values":{"0":1,"1":0}}'
).sum
-- 1

Просмотреть файл

@ -0,0 +1,3 @@
SELECT
mozfun.hist.extract('5').sum
-- 5

Просмотреть файл

@ -0,0 +1,20 @@
We support a variety of compact encodings as well as the classic JSON
representation as sent in main pings.
The built-in BigQuery JSON parsing functions are not powerful enough to handle
all the logic here, so we resort to some string processing. This function could
behave unexpectedly on poorly-formatted histogram JSON, but we expect that
payload validation in the data pipeline should ensure that histograms are
well-formed, which gives us some flexibility.
For more on desktop telemetry histogram structure, see:
- https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/collection/histograms.html
The compact encodings were originally proposed in:
- https://docs.google.com/document/d/1k_ji_1DB6htgtXnPpMpa7gX0klm-DGV5NMY7KkvVB00/edit#
@sql(../examples/full_hist_extract.sql)
@sql(../examples/use_counter_hist_extract.sql)

Просмотреть файл

@ -1,18 +1,2 @@
friendly_name: Extract Histogram
description: >-
Return a parsed struct from a string-encoded histogram.
We support a variety of compact encodings as well as the classic JSON
representation as sent in main pings.
The built-in BigQuery JSON parsing functions are not powerful enough to handle
all the logic here, so we resort to some string processing. This function could
behave unexpectedly on poorly-formatted histogram JSON, but we expect that
payload validation in the data pipeline should ensure that histograms are well
formed, which gives us some flexibility.
For more on desktop telemetry histogram structure, see:
https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/collection/histograms.html
The compact encodings were originally proposed in:
https://docs.google.com/document/d/1k_ji_1DB6htgtXnPpMpa7gX0klm-DGV5NMY7KkvVB00/edit#
description: Return a parsed struct from a string-encoded histogram.

Просмотреть файл

Просмотреть файл

@ -1 +0,0 @@
## get_key

Просмотреть файл

@ -1,3 +1 @@
# udf1
@sql(../examples/example1.sql)

Просмотреть файл

@ -1 +0,0 @@
# udf2

Просмотреть файл

@ -20,7 +20,7 @@ class TestGenerateDocs:
)
result = load_with_examples(str(input)).strip()
assert result == "# udf1\n\n```sql\nSELECT\n *\nFROM\n test\n\n```"
assert result == "```sql\nSELECT\n *\nFROM\n test\n```"
def test_load_with_examples_dataset(self):
input = (
@ -33,7 +33,7 @@ class TestGenerateDocs:
)
result = load_with_examples(str(input)).strip()
assert result == "# test_dataset1\n\n```sql\nSELECT\n *\nFROM\n test\n\n```"
assert result == "# test_dataset1\n\n```sql\nSELECT\n *\nFROM\n test\n```"
def test_load_with_missing_example(self, tmp_path):
file_path = tmp_path / "ds" / "udf"