Bug 1608838: Include data collection categories as metadata (#199)
* Bug 1608838: Include data collection categories as metadata * Add license header * Add HISTORY entry
This commit is contained in:
Родитель
a89ad1a03c
Коммит
7b8bcc7414
|
@ -6,6 +6,7 @@ Unreleased
|
|||
----------
|
||||
|
||||
* Add support for JWE metric types.
|
||||
* Add a `data_category` field to all metrics for specifying the type of data collected in the field.
|
||||
|
||||
1.25.0 (2020-07-17)
|
||||
-------------------
|
||||
|
|
|
@ -132,6 +132,13 @@ def ping_include_client_id(
|
|||
return False
|
||||
|
||||
|
||||
def data_category_numbers(data_category: Optional[List[metrics.DataCategory]]) -> str:
    """
    Render a metric's data collection categories as a comma-separated
    string of their numeric values, or "unknown" when none were declared.
    """
    if data_category is None:
        return "unknown"
    return ", ".join(str(category.value) for category in data_category)
|
||||
|
||||
|
||||
def output_markdown(
|
||||
objs: metrics.ObjectTree, output_dir: Path, options: Dict[str, Any] = {}
|
||||
) -> None:
|
||||
|
@ -209,6 +216,7 @@ def output_markdown(
|
|||
"ping_include_client_id",
|
||||
lambda x: ping_include_client_id(x, custom_pings_cache),
|
||||
),
|
||||
("data_category_numbers", data_category_numbers),
|
||||
),
|
||||
)
|
||||
|
||||
|
|
|
@ -25,6 +25,13 @@ class Lifetime(enum.Enum):
|
|||
user = 2
|
||||
|
||||
|
||||
class DataCategory(enum.Enum):
    """
    The four Mozilla data collection categories, as defined at
    https://wiki.mozilla.org/Firefox/Data_Collection. The numeric values
    match the category numbers used in data reviews.
    """

    technical = 1  # Information about the machine or the product itself
    interaction = 2  # The user's direct engagement with the product
    web_activity = 3  # Web browsing information that could be sensitive
    highly_sensitive = 4  # Data that directly or indirectly identifies a person
|
||||
|
||||
|
||||
class Metric:
|
||||
typename: str = "ERROR"
|
||||
glean_internal_metric_cat: str = "glean.internal.metrics"
|
||||
|
@ -48,6 +55,7 @@ class Metric:
|
|||
unit: str = "",
|
||||
gecko_datapoint: str = "",
|
||||
no_lint: Optional[List[str]] = None,
|
||||
data_categories: Optional[List[str]] = None,
|
||||
_config: Optional[Dict[str, Any]] = None,
|
||||
_validated: bool = False,
|
||||
):
|
||||
|
@ -75,6 +83,8 @@ class Metric:
|
|||
if no_lint is None:
|
||||
no_lint = []
|
||||
self.no_lint = no_lint
|
||||
if data_categories is not None:
|
||||
self.data_categories = [getattr(DataCategory, x) for x in data_categories]
|
||||
|
||||
# _validated indicates whether this metric has already been jsonschema
|
||||
# validated (but not any of the Python-level validation).
|
||||
|
@ -141,6 +151,8 @@ class Metric:
|
|||
d[key] = d[key].name
|
||||
if isinstance(val, set):
|
||||
d[key] = sorted(list(val))
|
||||
if isinstance(val, list) and len(val) and isinstance(val[0], enum.Enum):
|
||||
d[key] = [x.name for x in val]
|
||||
del d["name"]
|
||||
del d["category"]
|
||||
return d
|
||||
|
|
|
@ -425,6 +425,52 @@ definitions:
|
|||
type: string
|
||||
pattern: "^[a-z_][a-z0-9_]{0,29}(\\.[a-z_][a-z0-9_]{0,29})*$"
|
||||
|
||||
data_categories:
|
||||
title: Data collection categories
|
||||
description: |
|
||||
There are four data collection categories [defined
|
||||
here](https://wiki.mozilla.org/Firefox/Data_Collection):
|
||||
|
||||
- **Category 1: Technical Data:** (`technical`) Information about the
|
||||
machine or Firefox itself. Examples include OS, available memory,
|
||||
crashes and errors, outcome of automated processes like updates,
|
||||
safebrowsing, activation, version \#s, and buildid. This also
|
||||
includes compatibility information about features and APIs used by
|
||||
websites, addons, and other 3rd-party software that interact with
|
||||
Firefox during usage.
|
||||
|
||||
- **Category 2: Interaction Data:** (`interaction`) Information about
|
||||
the user’s direct engagement with Firefox. Examples include how many
|
||||
tabs, addons, or windows a user has open; uses of specific Firefox
|
||||
features; session length, scrolls and clicks; and the status of
|
||||
discrete user preferences.
|
||||
|
||||
- **Category 3: Web activity data:** (`web_activity`) Information
|
||||
about user web browsing that could be considered sensitive. Examples
|
||||
include users’ specific web browsing history; general information
|
||||
about their web browsing history (such as TLDs or categories of
|
||||
webpages visited over time); and potentially certain types of
|
||||
interaction data about specific webpages visited.
|
||||
|
||||
- **Category 4: Highly sensitive data:** (`highly_sensitive`)
|
||||
Information that directly identifies a person, or if combined with
|
||||
other data could identify a person. Examples include e-mail,
|
||||
usernames, identifiers such as google ad id, apple id, fxaccount,
|
||||
city or country (unless small ones are explicitly filtered out), or
|
||||
certain cookies. It may be embedded within specific website content,
|
||||
such as memory contents, dumps, captures of screen data, or DOM
|
||||
data.
|
||||
type: array
|
||||
items:
|
||||
enum:
|
||||
- technical
|
||||
- interaction
|
||||
- web_activity
|
||||
- highly_sensitive
|
||||
type: string
|
||||
minLength: 1
|
||||
uniqueItems: true
|
||||
|
||||
required:
|
||||
- type
|
||||
- bugs
|
||||
|
|
|
@ -61,7 +61,7 @@ This ping includes the [client id](https://mozilla.github.io/glean/book/user/pin
|
|||
{% if metrics_by_pings[ping_name] %}
|
||||
The following metrics are added to the ping:
|
||||
|
||||
| Name | Type | Description | Data reviews | Extras | Expiration |
|
||||
| Name | Type | Description | Data reviews | Extras | Expiration | [Data Category](https://wiki.mozilla.org/Firefox/Data_Collection) |
|
||||
| --- | --- | --- | --- | --- | --- | --- |
|
||||
{% for metric in metrics_by_pings[ping_name] %}
|
||||
| {{ metric.identifier() }} |
|
||||
|
@ -78,6 +78,7 @@ The following metrics are added to the ping:
|
|||
</ul>
|
||||
{%- endif -%} |
|
||||
{{- metric.expires }} |
|
||||
{{- metric.data_categories|data_category_numbers }} |
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
This ping contains no metrics.
|
||||
|
@ -85,6 +86,8 @@ This ping contains no metrics.
|
|||
|
||||
{% endfor %}
|
||||
|
||||
Data categories are [defined here](https://wiki.mozilla.org/Firefox/Data_Collection).
|
||||
|
||||
<!-- AUTOGENERATED BY glean_parser. DO NOT EDIT. -->
|
||||
{# The rendered markdown is autogenerated, but this
|
||||
Jinja2 template is not. Please file bugs! #}
|
||||
|
|
|
@ -34,7 +34,8 @@ def test_parser(tmpdir):
|
|||
assert "is assembled out of the box by the Glean SDK." in content
|
||||
# Make sure the table structure is in place
|
||||
assert (
|
||||
"| Name | Type | Description | Data reviews | Extras | Expiration |"
|
||||
"| Name | Type | Description | Data reviews | Extras | "
|
||||
+ "Expiration | [Data Category]"
|
||||
in content
|
||||
)
|
||||
# Make sure non ASCII characters are there
|
||||
|
@ -202,3 +203,21 @@ def test_send_if_empty_metrics(tmpdir):
|
|||
with (tmpdir / "metrics.md").open("r", encoding="utf-8") as fd:
|
||||
content = fd.read()
|
||||
assert "Lorem ipsum dolor sit amet, consectetur adipiscing elit." in content
|
||||
|
||||
|
||||
def test_data_categories():
|
||||
event = metrics.Event(
|
||||
type="event",
|
||||
category="category",
|
||||
name="metric",
|
||||
bugs=[42],
|
||||
notification_emails=["nobody@example.com"],
|
||||
description="description...",
|
||||
expires="never",
|
||||
extra_keys={"my_extra": {"description": "an extra"}},
|
||||
data_categories=["technical", "interaction"],
|
||||
)
|
||||
|
||||
assert markdown.data_category_numbers(event.data_categories) == "1, 2"
|
||||
|
||||
assert markdown.data_category_numbers(None) == "unknown"
|
||||
|
|
|
@ -0,0 +1,173 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""
|
||||
Usage:
|
||||
python extract_data_categories.py metrics.yaml
|
||||
|
||||
Automatically extract the data collection categories for all the metrics in a
|
||||
metrics.yaml file by consulting the linked data reviews.
|
||||
|
||||
This script reads a metrics.yaml file, visits all of the associated data
|
||||
reviews, trying to determine the associated data categories, and inserts them
|
||||
(in place) to the original metrics.yaml file.
|
||||
|
||||
A very simple heuristic is used: to look for the question about data categories
|
||||
used in all data reviews, and then find any numbers between it and the next
|
||||
question. When this simple heuristic fails, comments with "!!!" are inserted in
|
||||
the output as a recommendation to manually investigate and enter the data
|
||||
categories.
|
||||
|
||||
Requirements from PyPI: BeautifulSoup4, PyYAML
|
||||
"""
|
||||
|
||||
import dbm
|
||||
import functools
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from typing import List, Set
|
||||
from urllib.request import urlopen
|
||||
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import yaml
|
||||
|
||||
|
||||
# Persistent on-disk cache of fetched data-review pages, keyed by URL, so
# re-running the script does not re-fetch every page. Note: dbm stores and
# returns values as bytes.
cache = dbm.open("bugzilla-cache.db", "c")


# The standard question asked on every data review; used to locate the
# answer that lists the data collection category numbers.
QUESTION = "what collection type of data do the requested measurements fall under?"


# Maps the numeric categories used in data reviews to the string names
# used in metrics.yaml files (see the DataCategory enum in glean_parser).
CATEGORY_MAP = {
    1: "technical",
    2: "interaction",
    3: "web_activity",
    4: "highly_sensitive",
}
|
||||
|
||||
|
||||
def fetch_url(url: str) -> bytes:
    """
    Fetch a web page containing a data review, caching it to avoid
    over-fetching.

    Returns the raw page content as bytes: both ``dbm`` cache values and
    ``urlopen().read()`` are bytes, so the return annotation is ``bytes``
    (the original ``-> str`` annotation was incorrect).
    """
    content = cache.get(url)
    if content is not None:
        return content

    print(f"Fetching {url}")
    content = urlopen(url).read()
    cache[url] = content
    # Be polite to the server between uncached fetches.
    time.sleep(0.5)
    return content
|
||||
|
||||
|
||||
@functools.lru_cache(1000)
def parse_data_review(html: str) -> Set[int]:
    """
    Parse a single data review page and return the set of data collection
    category numbers found in the answer to the standard category question.
    """
    document = BeautifulSoup(html, features="html.parser")
    line_iter = iter(document.get_text().splitlines())

    # Skip ahead to the line containing the data-category question.
    for current in line_iter:
        if QUESTION in current.strip():
            break

    # Collect every number that appears before the next question mark.
    found: Set[int] = set()
    for current in line_iter:
        if "?" in current:
            break
        found.update(int(match) for match in re.findall("[0-9]+", current))

    return found
|
||||
|
||||
|
||||
def categories_as_strings(categories: Set[int]) -> List[str]:
    """
    From a set of numeric categories, return the strings used in a metrics.yaml
    file. This may contain strings representing errors.

    Unknown category numbers and an empty set both produce "!!!"-marked
    strings so a human reviewer is forced to investigate them in the
    generated YAML.
    """
    # Guard clause for the empty set; `not categories` is the idiomatic
    # emptiness test (instead of `len(categories)`).
    if not categories:
        return ["!!! NO DATA CATEGORIES FOUND"]
    # sorted() accepts any iterable; the intermediate list() was redundant.
    return [
        CATEGORY_MAP.get(x, f"!!!UNKNOWN CATEGORY {x}")
        for x in sorted(categories)
    ]
|
||||
|
||||
|
||||
def update_lines(
    lines: List[str], category_name: str, metric_name: str, data_categories: List[str]
) -> List[str]:
    """
    Update the lines of a YAML file in place to include the data_categories for
    the given metric, returning the lines of the result.

    The insertion point is found purely textually: scan for the category
    header, then the metric entry under it, then its ``data_reviews`` key,
    and insert a ``data_categories`` list right after the last review URL
    bullet. If any of those markers is never found, the remaining lines are
    simply copied through unchanged.
    """
    output = []
    lines_iter = iter(lines)

    # Scan to the category header (column 0).
    for line in lines_iter:
        output.append(line)
        if line.startswith(f"{category_name}:"):
            break

    # Scan to the metric entry (indented one level).
    for line in lines_iter:
        output.append(line)
        if line.startswith(f"  {metric_name}:"):
            break

    # Scan to the metric's data_reviews key (indented two levels).
    # (Plain string here: the original used an f-string with no placeholder.)
    for line in lines_iter:
        output.append(line)
        if line.startswith("    data_reviews:"):
            break

    # Copy the review URL bullets; insert data_categories right after the
    # last bullet, then fall through to plain copying.
    for line in lines_iter:
        if not line.strip().startswith("- "):
            output.append("    data_categories:\n")
            for data_category in data_categories:
                output.append(f"      - {data_category}\n")
            output.append(line)
            break
        else:
            output.append(line)

    # Copy the remainder of the file unchanged.
    output.extend(lines_iter)

    return output
|
||||
|
||||
|
||||
def parse_yaml(yamlpath: str):
    """
    Insert data_categories entries for every metric in the given metrics.yaml.

    The file is parsed once with PyYAML to enumerate the metrics and their
    data review URLs, and read a second time as raw lines so the update can
    be applied textually (preserving formatting and comments). The updated
    lines are then written back over the original file.
    """
    with open(yamlpath) as handle:
        tree = yaml.safe_load(handle)

    with open(yamlpath) as handle:
        lines = list(handle.readlines())

    for category_name, category in tree.items():
        # Skip schema directives ($schema etc.) and the global no_lint list.
        if category_name.startswith("$") or category_name == "no_lint":
            continue
        for metric_name, metric in category.items():
            review_categories = set()
            for review_url in metric["data_reviews"]:
                page = fetch_url(review_url)
                review_categories.update(parse_data_review(page))
            lines = update_lines(
                lines,
                category_name,
                metric_name,
                categories_as_strings(review_categories),
            )

    with open(yamlpath, "w") as handle:
        for line in lines:
            handle.write(line)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The metrics.yaml path is taken from the last command-line argument.
    parse_yaml(sys.argv[-1])
|
Загрузка…
Ссылка в новой задаче