Inject function names and descriptions into docs (#1129)
* Inject function names and descriptions into docs
This commit is contained in:
Родитель
7653acdb6f
Коммит
c806d2e730
|
@ -7,10 +7,12 @@ import re
|
|||
import shutil
|
||||
|
||||
from bigquery_etl.util import standard_args
|
||||
import yaml
|
||||
|
||||
DEFAULT_PROJECTS = ["mozfun/"]
|
||||
DOCS_FILE = "README.md"
|
||||
UDF_FILE = "udf.sql"
|
||||
METADATA_FILE = "metadata.yaml"
|
||||
DOCS_DIR = "docs/"
|
||||
INDEX_MD = "index.md"
|
||||
SQL_REF_RE = r"@sql\((.+)\)"
|
||||
|
@ -48,7 +50,7 @@ def load_with_examples(file):
|
|||
for sql_ref in re.findall(SQL_REF_RE, file_content):
|
||||
sql_example_file = path / Path(sql_ref)
|
||||
with open(sql_example_file) as example_sql:
|
||||
md_sql = f"```sql\n{example_sql.read()}\n```"
|
||||
md_sql = f"```sql\n{example_sql.read().strip()}\n```"
|
||||
file_content = file_content.replace(f"@sql({sql_ref})", md_sql)
|
||||
|
||||
return file_content
|
||||
|
@ -73,7 +75,6 @@ def main():
|
|||
if DOCS_FILE in files:
|
||||
# copy doc file to output and replace example references
|
||||
src = os.path.join(root, DOCS_FILE)
|
||||
|
||||
# remove empty strings from path parts
|
||||
path_parts = list(filter(None, root.split(os.sep)))
|
||||
name = path_parts[-1]
|
||||
|
@ -86,14 +87,30 @@ def main():
|
|||
dest = project_doc_dir / "overview.md"
|
||||
dest.write_text(load_with_examples(src))
|
||||
else:
|
||||
description = None
|
||||
if METADATA_FILE in files:
|
||||
with open(os.path.join(root, METADATA_FILE)) as stream:
|
||||
try:
|
||||
description = yaml.safe_load(stream).get(
|
||||
"description", None
|
||||
)
|
||||
except yaml.YAMLError:
|
||||
pass
|
||||
# dataset or UDF level doc file
|
||||
if UDF_FILE in files:
|
||||
# UDF-level doc; append to dataset doc
|
||||
dataset_name = os.path.basename(path)
|
||||
dataset_doc = out_dir / path.parent / f"{dataset_name}.md"
|
||||
docfile_content = load_with_examples(src)
|
||||
with open(dataset_doc, "a") as dataset_doc_file:
|
||||
dataset_doc_file.write("\n\n")
|
||||
dataset_doc_file.write(load_with_examples(src))
|
||||
# Inject a level-2 header with the UDF name
|
||||
dataset_doc_file.write(f"## {name}\n\n")
|
||||
# Inject the "description" from metadata.yaml
|
||||
if description:
|
||||
dataset_doc_file.write(f"{description}\n\n")
|
||||
# Inject the contents of the README.md
|
||||
dataset_doc_file.write(docfile_content)
|
||||
else:
|
||||
# dataset-level doc; create a new doc file
|
||||
dest = out_dir / path / f"{name}.md"
|
||||
|
|
|
@ -1 +1,17 @@
|
|||
# bits28
|
||||
# bits28
|
||||
|
||||
The `bits28` functions provide an API for working with "bit pattern" INT64
|
||||
fields, as used in the [`clients_last_seen` dataset](https://docs.telemetry.mozilla.org/datasets/bigquery/clients_last_seen/reference.html)
|
||||
for desktop Firefox and similar datasets for other applications.
|
||||
|
||||
A powerful feature of the `clients_last_seen` methodology is that it doesn't
|
||||
record specific metrics like MAU and WAU directly, but rather each row stores
|
||||
a history of the discrete days on which a client was active in the past 28 days.
|
||||
We could calculate active users in a 10 day or 25 day window just as efficiently
|
||||
as a 7 day (WAU) or 28 day (MAU) window. But we can also define completely new
|
||||
metrics based on these usage histories, such as various retention definitions.
|
||||
|
||||
The usage history is encoded as a "bit pattern" where the physical
|
||||
type of the field is a BigQuery INT64, but logically the integer
|
||||
represents an array of bits, with each 1 indicating a day where the given client
|
||||
was active and each 0 indicating a day where the client was inactive.
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
@sql(../examples/bits28_days_since_seen.sql)
|
|
@ -0,0 +1 @@
|
|||
@sql(../examples/extract_week_specific_bits.sql)
|
|
@ -0,0 +1 @@
|
|||
@sql(../examples/string_bit_patterns.sql)
|
|
@ -0,0 +1,5 @@
|
|||
SELECT
|
||||
mozfun.hist.extract(
|
||||
'{"bucket_count":3,"histogram_type":4,"sum":1,"range":[1,2],"values":{"0":1,"1":0}}'
|
||||
).sum
|
||||
-- 1
|
|
@ -0,0 +1,3 @@
|
|||
SELECT
|
||||
mozfun.hist.extract('5').sum
|
||||
-- 5
|
|
@ -0,0 +1,20 @@
|
|||
We support a variety of compact encodings as well as the classic JSON
|
||||
representation as sent in main pings.
|
||||
|
||||
The built-in BigQuery JSON parsing functions are not powerful enough to handle
|
||||
all the logic here, so we resort to some string processing. This function could
|
||||
behave unexpectedly on poorly-formatted histogram JSON, but we expect that
|
||||
payload validation in the data pipeline should ensure that histograms are well
|
||||
formed, which gives us some flexibility.
|
||||
|
||||
For more on desktop telemetry histogram structure, see:
|
||||
|
||||
- https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/collection/histograms.html
|
||||
|
||||
The compact encodings were originally proposed in:
|
||||
|
||||
- https://docs.google.com/document/d/1k_ji_1DB6htgtXnPpMpa7gX0klm-DGV5NMY7KkvVB00/edit#
|
||||
|
||||
@sql(../examples/full_hist_extract.sql)
|
||||
|
||||
@sql(../examples/use_counter_hist_extract.sql)
|
|
@ -1,18 +1,2 @@
|
|||
friendly_name: Extract Histogram
|
||||
description: >-
|
||||
Return a parsed struct from a string-encoded histogram.
|
||||
|
||||
We support a variety of compact encodings as well as the classic JSON
|
||||
representation as sent in main pings.
|
||||
|
||||
The built-in BigQuery JSON parsing functions are not powerful enough to handle
|
||||
all the logic here, so we resort to some string processing. This function could
|
||||
behave unexpectedly on poorly-formatted histogram JSON, but we expect that
|
||||
payload validation in the data pipeline should ensure that histograms are well
|
||||
formed, which gives us some flexibility.
|
||||
|
||||
For more on desktop telemetry histogram structure, see:
|
||||
https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/collection/histograms.html
|
||||
|
||||
The compact encodings were originally proposed in:
|
||||
https://docs.google.com/document/d/1k_ji_1DB6htgtXnPpMpa7gX0klm-DGV5NMY7KkvVB00/edit#
|
||||
description: Return a parsed struct from a string-encoded histogram.
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
## get_key
|
|
@ -1,3 +1 @@
|
|||
# udf1
|
||||
|
||||
@sql(../examples/example1.sql)
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
# udf2
|
|
@ -20,7 +20,7 @@ class TestGenerateDocs:
|
|||
)
|
||||
result = load_with_examples(str(input)).strip()
|
||||
|
||||
assert result == "# udf1\n\n```sql\nSELECT\n *\nFROM\n test\n\n```"
|
||||
assert result == "```sql\nSELECT\n *\nFROM\n test\n```"
|
||||
|
||||
def test_load_with_examples_dataset(self):
|
||||
input = (
|
||||
|
@ -33,7 +33,7 @@ class TestGenerateDocs:
|
|||
)
|
||||
result = load_with_examples(str(input)).strip()
|
||||
|
||||
assert result == "# test_dataset1\n\n```sql\nSELECT\n *\nFROM\n test\n\n```"
|
||||
assert result == "# test_dataset1\n\n```sql\nSELECT\n *\nFROM\n test\n```"
|
||||
|
||||
def test_load_with_missing_example(self, tmp_path):
|
||||
file_path = tmp_path / "ds" / "udf"
|
||||
|
|
Загрузка…
Ссылка в новой задаче