Bug 1608838: Include data collection categories as metadata (#199)
* Bug 1608838: Include data collection categories as metadata * Add license header * Add HISTORY entry
This commit is contained in:
Родитель
a89ad1a03c
Коммит
7b8bcc7414
|
@ -6,6 +6,7 @@ Unreleased
|
|||
----------
|
||||
|
||||
* Add support for JWE metric types.
|
||||
* Add a `data_category` field to all metrics for specifying the type of data collected in the field.
|
||||
|
||||
1.25.0 (2020-07-17)
|
||||
-------------------
|
||||
|
|
|
@ -132,6 +132,13 @@ def ping_include_client_id(
|
|||
return False
|
||||
|
||||
|
||||
def data_category_numbers(data_category: Optional[List[metrics.DataCategory]]) -> str:
    """
    Render a metric's data collection categories as a comma-separated
    string of their numeric values, or "unknown" when none were declared.
    """
    if data_category is None:
        return "unknown"
    return ", ".join(str(category.value) for category in data_category)
|
||||
|
||||
|
||||
def output_markdown(
|
||||
objs: metrics.ObjectTree, output_dir: Path, options: Dict[str, Any] = {}
|
||||
) -> None:
|
||||
|
@ -209,6 +216,7 @@ def output_markdown(
|
|||
"ping_include_client_id",
|
||||
lambda x: ping_include_client_id(x, custom_pings_cache),
|
||||
),
|
||||
("data_category_numbers", data_category_numbers),
|
||||
),
|
||||
)
|
||||
|
||||
|
|
|
@ -25,6 +25,13 @@ class Lifetime(enum.Enum):
|
|||
user = 2
|
||||
|
||||
|
||||
class DataCategory(enum.Enum):
    """
    The four Mozilla data collection categories, as defined at
    https://wiki.mozilla.org/Firefox/Data_Collection. The numeric values
    match the category numbers used in data reviews.
    """

    technical = 1  # Information about the machine or the product itself
    interaction = 2  # The user's direct engagement with the product
    web_activity = 3  # Web browsing information that could be sensitive
    highly_sensitive = 4  # Data that directly or indirectly identifies a person
|
||||
|
||||
|
||||
class Metric:
|
||||
typename: str = "ERROR"
|
||||
glean_internal_metric_cat: str = "glean.internal.metrics"
|
||||
|
@ -48,6 +55,7 @@ class Metric:
|
|||
unit: str = "",
|
||||
gecko_datapoint: str = "",
|
||||
no_lint: Optional[List[str]] = None,
|
||||
data_categories: Optional[List[str]] = None,
|
||||
_config: Optional[Dict[str, Any]] = None,
|
||||
_validated: bool = False,
|
||||
):
|
||||
|
@ -75,6 +83,8 @@ class Metric:
|
|||
if no_lint is None:
|
||||
no_lint = []
|
||||
self.no_lint = no_lint
|
||||
if data_categories is not None:
|
||||
self.data_categories = [getattr(DataCategory, x) for x in data_categories]
|
||||
|
||||
# _validated indicates whether this metric has already been jsonschema
|
||||
# validated (but not any of the Python-level validation).
|
||||
|
@ -141,6 +151,8 @@ class Metric:
|
|||
d[key] = d[key].name
|
||||
if isinstance(val, set):
|
||||
d[key] = sorted(list(val))
|
||||
if isinstance(val, list) and len(val) and isinstance(val[0], enum.Enum):
|
||||
d[key] = [x.name for x in val]
|
||||
del d["name"]
|
||||
del d["category"]
|
||||
return d
|
||||
|
|
|
@ -425,6 +425,52 @@ definitions:
|
|||
type: string
|
||||
pattern: "^[a-z_][a-z0-9_]{0,29}(\\.[a-z_][a-z0-9_]{0,29})*$"
|
||||
|
||||
data_categories:
|
||||
title: Data collection categories
|
||||
description: |
|
||||
There are four data collection categories [defined
|
||||
here](https://wiki.mozilla.org/Firefox/Data_Collection):
|
||||
|
||||
- **Category 1: Technical Data:** (`technical`) Information about the
|
||||
machine or Firefox itself. Examples include OS, available memory,
|
||||
crashes and errors, outcome of automated processes like updates,
|
||||
safebrowsing, activation, version \#s, and buildid. This also
|
||||
includes compatibility information about features and APIs used by
|
||||
websites, addons, and other 3rd-party software that interact with
|
||||
Firefox during usage.
|
||||
|
||||
- **Category 2: Interaction Data:** (`interaction`) Information about
|
||||
the user’s direct engagement with Firefox. Examples include how many
|
||||
tabs, addons, or windows a user has open; uses of specific Firefox
|
||||
features; session length, scrolls and clicks; and the status of
|
||||
discrete user preferences.
|
||||
|
||||
- **Category 3: Web activity data:** (`web_activity`) Information
|
||||
about user web browsing that could be considered sensitive. Examples
|
||||
include users’ specific web browsing history; general information
|
||||
about their web browsing history (such as TLDs or categories of
|
||||
webpages visited over time); and potentially certain types of
|
||||
interaction data about specific webpages visited.
|
||||
|
||||
- **Category 4: Highly sensitive data:** (`highly_sensitive`)
|
||||
Information that directly identifies a person, or if combined with
|
||||
other data could identify a person. Examples include e-mail,
|
||||
usernames, identifiers such as google ad id, apple id, fxaccount,
|
||||
city or country (unless small ones are explicitly filtered out), or
|
||||
certain cookies. It may be embedded within specific website content,
|
||||
such as memory contents, dumps, captures of screen data, or DOM
|
||||
data.
|
||||
type: array
|
||||
items:
|
||||
enum:
|
||||
- technical
|
||||
- interaction
|
||||
- web_activity
|
||||
- highly_sensitive
|
||||
type: string
|
||||
minLength: 1
|
||||
uniqueItems: true
|
||||
|
||||
required:
|
||||
- type
|
||||
- bugs
|
||||
|
|
|
@ -61,7 +61,7 @@ This ping includes the [client id](https://mozilla.github.io/glean/book/user/pin
|
|||
{% if metrics_by_pings[ping_name] %}
|
||||
The following metrics are added to the ping:
|
||||
|
||||
| Name | Type | Description | Data reviews | Extras | Expiration |
|
||||
| Name | Type | Description | Data reviews | Extras | Expiration | [Data Category](https://wiki.mozilla.org/Firefox/Data_Collection) |
|
||||
| --- | --- | --- | --- | --- | --- | --- |
|
||||
{% for metric in metrics_by_pings[ping_name] %}
|
||||
| {{ metric.identifier() }} |
|
||||
|
@ -78,6 +78,7 @@ The following metrics are added to the ping:
|
|||
</ul>
|
||||
{%- endif -%} |
|
||||
{{- metric.expires }} |
|
||||
{{- metric.data_categories|data_category_numbers }} |
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
This ping contains no metrics.
|
||||
|
@ -85,6 +86,8 @@ This ping contains no metrics.
|
|||
|
||||
{% endfor %}
|
||||
|
||||
Data categories are [defined here](https://wiki.mozilla.org/Firefox/Data_Collection).
|
||||
|
||||
<!-- AUTOGENERATED BY glean_parser. DO NOT EDIT. -->
|
||||
{# The rendered markdown is autogenerated, but this
|
||||
Jinja2 template is not. Please file bugs! #}
|
||||
|
|
|
@ -34,7 +34,8 @@ def test_parser(tmpdir):
|
|||
assert "is assembled out of the box by the Glean SDK." in content
|
||||
# Make sure the table structure is in place
|
||||
assert (
|
||||
"| Name | Type | Description | Data reviews | Extras | Expiration |"
|
||||
"| Name | Type | Description | Data reviews | Extras | "
|
||||
+ "Expiration | [Data Category]"
|
||||
in content
|
||||
)
|
||||
# Make sure non ASCII characters are there
|
||||
|
@ -202,3 +203,21 @@ def test_send_if_empty_metrics(tmpdir):
|
|||
with (tmpdir / "metrics.md").open("r", encoding="utf-8") as fd:
|
||||
content = fd.read()
|
||||
assert "Lorem ipsum dolor sit amet, consectetur adipiscing elit." in content
|
||||
|
||||
|
||||
def test_data_categories():
|
||||
event = metrics.Event(
|
||||
type="event",
|
||||
category="category",
|
||||
name="metric",
|
||||
bugs=[42],
|
||||
notification_emails=["nobody@example.com"],
|
||||
description="description...",
|
||||
expires="never",
|
||||
extra_keys={"my_extra": {"description": "an extra"}},
|
||||
data_categories=["technical", "interaction"],
|
||||
)
|
||||
|
||||
assert markdown.data_category_numbers(event.data_categories) == "1, 2"
|
||||
|
||||
assert markdown.data_category_numbers(None) == "unknown"
|
||||
|
|
|
@ -0,0 +1,173 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""
|
||||
Usage:
|
||||
python extract_data_categories.py metrics.yaml
|
||||
|
||||
Automatically extract the data collection categories for all the metrics in a
|
||||
metrics.yaml file by consulting the linked data reviews.
|
||||
|
||||
This script reads a metrics.yaml file, visits all of the associated data
|
||||
reviews, trying to determine the associated data categories, and inserts them
|
||||
(in place) to the original metrics.yaml file.
|
||||
|
||||
A very simple heuristic is used: to look for the question about data categories
|
||||
used in all data reviews, and then find any numbers between it and the next
|
||||
question. When this simple heuristic fails, comments with "!!!" are inserted in
|
||||
the output as a recommendation to manually investigate and enter the data
|
||||
categories.
|
||||
|
||||
Requirements from PyPI: BeautifulSoup4, PyYAML
|
||||
"""
|
||||
|
||||
import dbm
|
||||
import functools
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from typing import List, Set
|
||||
from urllib.request import urlopen
|
||||
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import yaml
|
||||
|
||||
|
||||
# Persistent on-disk cache of fetched data-review pages, keyed by URL, so
# re-running the script does not re-fetch every page. Note: dbm stores and
# returns values as bytes.
cache = dbm.open("bugzilla-cache.db", "c")


# The standard question asked on every data review; used to locate the
# answer that lists the data collection category numbers.
QUESTION = "what collection type of data do the requested measurements fall under?"


# Maps the numeric categories used in data reviews to the string names
# used in metrics.yaml files (see the DataCategory enum in glean_parser).
CATEGORY_MAP = {
    1: "technical",
    2: "interaction",
    3: "web_activity",
    4: "highly_sensitive",
}
|
||||
|
||||
|
||||
def fetch_url(url: str) -> bytes:
    """
    Fetch a web page containing a data review, caching it to avoid
    over-fetching.

    Returns the raw page content as bytes: both ``dbm`` cache values and
    ``urlopen().read()`` are bytes, so the return annotation is ``bytes``
    (the original ``-> str`` annotation was incorrect).
    """
    content = cache.get(url)
    if content is not None:
        return content

    print(f"Fetching {url}")
    content = urlopen(url).read()
    cache[url] = content
    # Be polite to the server between uncached fetches.
    time.sleep(0.5)
    return content
|
||||
|
||||
|
||||
@functools.lru_cache(1000)
def parse_data_review(html: str) -> Set[int]:
    """
    Parse a single data review page and return the set of data collection
    category numbers found in the answer to the standard category question.
    """
    document = BeautifulSoup(html, features="html.parser")
    line_iter = iter(document.get_text().splitlines())

    # Skip ahead to the line containing the data-category question.
    for current in line_iter:
        if QUESTION in current.strip():
            break

    # Collect every number that appears before the next question mark.
    found: Set[int] = set()
    for current in line_iter:
        if "?" in current:
            break
        found.update(int(match) for match in re.findall("[0-9]+", current))

    return found
|
||||
|
||||
|
||||
def categories_as_strings(categories: Set[int]) -> List[str]:
    """
    From a set of numeric categories, return the strings used in a metrics.yaml
    file. This may contain strings representing errors.

    Unknown category numbers and an empty set both produce "!!!"-marked
    strings so a human reviewer is forced to investigate them in the
    generated YAML.
    """
    # Guard clause for the empty set; `not categories` is the idiomatic
    # emptiness test (instead of `len(categories)`).
    if not categories:
        return ["!!! NO DATA CATEGORIES FOUND"]
    # sorted() accepts any iterable; the intermediate list() was redundant.
    return [
        CATEGORY_MAP.get(x, f"!!!UNKNOWN CATEGORY {x}")
        for x in sorted(categories)
    ]
|
||||
|
||||
|
||||
def update_lines(
    lines: List[str], category_name: str, metric_name: str, data_categories: List[str]
) -> List[str]:
    """
    Update the lines of a YAML file in place to include the data_categories for
    the given metric, returning the lines of the result.

    The insertion point is found purely textually: scan for the category
    header, then the metric entry under it, then its ``data_reviews`` key,
    and insert a ``data_categories`` list right after the last review URL
    bullet. If any of those markers is never found, the remaining lines are
    simply copied through unchanged.
    """
    output = []
    lines_iter = iter(lines)

    # Scan to the category header (column 0).
    for line in lines_iter:
        output.append(line)
        if line.startswith(f"{category_name}:"):
            break

    # Scan to the metric entry (indented one level).
    for line in lines_iter:
        output.append(line)
        if line.startswith(f"  {metric_name}:"):
            break

    # Scan to the metric's data_reviews key (indented two levels).
    # (Plain string here: the original used an f-string with no placeholder.)
    for line in lines_iter:
        output.append(line)
        if line.startswith("    data_reviews:"):
            break

    # Copy the review URL bullets; insert data_categories right after the
    # last bullet, then fall through to plain copying.
    for line in lines_iter:
        if not line.strip().startswith("- "):
            output.append("    data_categories:\n")
            for data_category in data_categories:
                output.append(f"      - {data_category}\n")
            output.append(line)
            break
        else:
            output.append(line)

    # Copy the remainder of the file unchanged.
    output.extend(lines_iter)

    return output
|
||||
|
||||
|
||||
def parse_yaml(yamlpath: str):
    """
    Insert data_categories entries for every metric in the given metrics.yaml.

    The file is parsed once with PyYAML to enumerate the metrics and their
    data review URLs, and read a second time as raw lines so the update can
    be applied textually (preserving formatting and comments). The updated
    lines are then written back over the original file.
    """
    with open(yamlpath) as handle:
        tree = yaml.safe_load(handle)

    with open(yamlpath) as handle:
        lines = list(handle.readlines())

    for category_name, category in tree.items():
        # Skip schema directives ($schema etc.) and the global no_lint list.
        if category_name.startswith("$") or category_name == "no_lint":
            continue
        for metric_name, metric in category.items():
            review_categories = set()
            for review_url in metric["data_reviews"]:
                page = fetch_url(review_url)
                review_categories.update(parse_data_review(page))
            lines = update_lines(
                lines,
                category_name,
                metric_name,
                categories_as_strings(review_categories),
            )

    with open(yamlpath, "w") as handle:
        for line in lines:
            handle.write(line)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The metrics.yaml path is taken from the last command-line argument.
    parse_yaml(sys.argv[-1])
|
Загрузка…
Ссылка в новой задаче