Bug 1608838: Include data collection categories as metadata (#199)

* Bug 1608838: Include data collection categories as metadata

* Add license header

* Add HISTORY entry
Michael Droettboom 2020-07-21 18:43:01 -04:00 (committed by GitHub)
Parent: a89ad1a03c
Commit: 7b8bcc7414
No known key found for this signature. GPG key ID: 4AEE18F83AFDEB23
7 changed files with 264 additions and 2 deletions


@@ -6,6 +6,7 @@ Unreleased
----------
* Add support for JWE metric types.
* Add a `data_categories` field to all metrics for specifying the categories of data collected by the metric.
1.25.0 (2020-07-17)
-------------------


@@ -132,6 +132,13 @@ def ping_include_client_id(
return False
def data_category_numbers(data_category: Optional[List[metrics.DataCategory]]) -> str:
if data_category is None:
return "unknown"
else:
return ", ".join(str(x.value) for x in data_category)
def output_markdown(
objs: metrics.ObjectTree, output_dir: Path, options: Dict[str, Any] = {}
) -> None:
@@ -209,6 +216,7 @@ def output_markdown(
"ping_include_client_id",
lambda x: ping_include_client_id(x, custom_pings_cache),
),
("data_category_numbers", data_category_numbers),
),
)
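
The (name, callable) pairs above are registered as custom Jinja2 filters for the markdown template. A minimal sketch of how the new filter behaves once registered, using plain jinja2 (the two-member enum is a stand-in for the full metrics.DataCategory defined below):

import enum

from jinja2 import Environment


class DataCategory(enum.Enum):  # stand-in for glean_parser.metrics.DataCategory
    technical = 1
    interaction = 2


def data_category_numbers(data_category):
    if data_category is None:
        return "unknown"
    return ", ".join(str(x.value) for x in data_category)


env = Environment()
env.filters["data_category_numbers"] = data_category_numbers
template = env.from_string("{{ cats | data_category_numbers }}")
print(template.render(cats=[DataCategory.technical, DataCategory.interaction]))  # 1, 2
print(template.render(cats=None))  # unknown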


@@ -25,6 +25,13 @@ class Lifetime(enum.Enum):
user = 2
class DataCategory(enum.Enum):
technical = 1
interaction = 2
web_activity = 3
highly_sensitive = 4
class Metric:
typename: str = "ERROR"
glean_internal_metric_cat: str = "glean.internal.metrics"
@@ -48,6 +55,7 @@ class Metric:
unit: str = "",
gecko_datapoint: str = "",
no_lint: Optional[List[str]] = None,
data_categories: Optional[List[str]] = None,
_config: Optional[Dict[str, Any]] = None,
_validated: bool = False,
):
@@ -75,6 +83,8 @@ class Metric:
if no_lint is None:
no_lint = []
self.no_lint = no_lint
        if data_categories is not None:
            self.data_categories = [getattr(DataCategory, x) for x in data_categories]
        else:
            # Always set the attribute: otherwise the markdown template sees
            # Jinja2's Undefined (which renders as an empty string) instead of
            # None, and data_category_numbers would never report "unknown".
            self.data_categories = None
# _validated indicates whether this metric has already been jsonschema
# validated (but not any of the Python-level validation).
@@ -141,6 +151,8 @@ class Metric:
d[key] = d[key].name
if isinstance(val, set):
d[key] = sorted(list(val))
if isinstance(val, list) and len(val) and isinstance(val[0], enum.Enum):
d[key] = [x.name for x in val]
del d["name"]
del d["category"]
return d
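
The two new pieces round-trip: the constructor maps the strings from metrics.yaml to DataCategory members via getattr, and the new serialize() branch maps lists of enum members back to their names. A standalone sketch of that round trip (the enum is re-declared so the snippet runs on its own):

import enum


class DataCategory(enum.Enum):
    technical = 1
    interaction = 2
    web_activity = 3
    highly_sensitive = 4


raw = ["technical", "web_activity"]  # as parsed from metrics.yaml
val = [getattr(DataCategory, x) for x in raw]  # the constructor path

# the serialize() path: a non-empty list whose first element is an Enum
if isinstance(val, list) and len(val) and isinstance(val[0], enum.Enum):
    val = [x.name for x in val]

assert val == raw  # back to the original strings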


@@ -425,6 +425,52 @@ definitions:
type: string
pattern: "^[a-z_][a-z0-9_]{0,29}(\\.[a-z_][a-z0-9_]{0,29})*$"
data_categories:
title: Data collection categories
description: |
There are four data collection categories [defined
here](https://wiki.mozilla.org/Firefox/Data_Collection):
- **Category 1: Technical Data:** (`technical`) Information about the
machine or Firefox itself. Examples include OS, available memory,
crashes and errors, outcome of automated processes like updates,
safebrowsing, activation, version \#s, and buildid. This also
includes compatibility information about features and APIs used by
websites, addons, and other 3rd-party software that interact with
Firefox during usage.
- **Category 2: Interaction Data:** (`interaction`) Information about
the user's direct engagement with Firefox. Examples include how many
tabs, addons, or windows a user has open; uses of specific Firefox
features; session length, scrolls and clicks; and the status of
discrete user preferences.
- **Category 3: Web activity data:** (`web_activity`) Information
about user web browsing that could be considered sensitive. Examples
include users' specific web browsing history; general information
about their web browsing history (such as TLDs or categories of
webpages visited over time); and potentially certain types of
interaction data about specific webpages visited.
- **Category 4: Highly sensitive data:** (`highly_sensitive`)
Information that directly identifies a person, or if combined with
other data could identify a person. Examples include e-mail,
usernames, identifiers such as google ad id, apple id, fxaccount,
city or country (unless small ones are explicitly filtered out), or
certain cookies. It may be embedded within specific website content,
such as memory contents, dumps, captures of screen data, or DOM
data.
type: array
items:
enum:
- technical
- interaction
- web_activity
- highly_sensitive
type: string
minLength: 1
uniqueItems: true
required:
- type
- bugs
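
A quick sketch of what this fragment accepts and rejects, with the schema re-declared inline and checked using the jsonschema package (the validator glean_parser builds its schema checks on):

import jsonschema

data_categories = {
    "type": "array",
    "items": {
        "type": "string",
        "minLength": 1,
        "enum": ["technical", "interaction", "web_activity", "highly_sensitive"],
    },
    "uniqueItems": True,
}

jsonschema.validate(["technical", "interaction"], data_categories)  # passes
for bad in (["technical", "technical"], ["secret"]):
    try:
        jsonschema.validate(bad, data_categories)
    except jsonschema.ValidationError as err:
        print(err.message)  # duplicates and unknown names are both rejected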


@@ -61,7 +61,7 @@ This ping includes the [client id](https://mozilla.github.io/glean/book/user/pin
{% if metrics_by_pings[ping_name] %}
The following metrics are added to the ping:
| Name | Type | Description | Data reviews | Extras | Expiration |
| Name | Type | Description | Data reviews | Extras | Expiration | [Data Category](https://wiki.mozilla.org/Firefox/Data_Collection) |
| --- | --- | --- | --- | --- | --- | --- |
{% for metric in metrics_by_pings[ping_name] %}
| {{ metric.identifier() }} |
@@ -78,6 +78,7 @@ The following metrics are added to the ping:
</ul>
{%- endif -%} |
{{- metric.expires }} |
{{- metric.data_categories|data_category_numbers }} |
{% endfor %}
{% else %}
This ping contains no metrics.
@@ -85,6 +86,8 @@ This ping contains no metrics.
{% endfor %}
Data categories are [defined here](https://wiki.mozilla.org/Firefox/Data_Collection).
<!-- AUTOGENERATED BY glean_parser. DO NOT EDIT. -->
{# The rendered markdown is autogenerated, but this
Jinja2 template is not. Please file bugs! #}


@@ -34,7 +34,8 @@ def test_parser(tmpdir):
assert "is assembled out of the box by the Glean SDK." in content
# Make sure the table structure is in place
assert (
"| Name | Type | Description | Data reviews | Extras | Expiration |"
"| Name | Type | Description | Data reviews | Extras | "
+ "Expiration | [Data Category]"
in content
)
    # Make sure non-ASCII characters are there
@@ -202,3 +203,21 @@ def test_send_if_empty_metrics(tmpdir):
with (tmpdir / "metrics.md").open("r", encoding="utf-8") as fd:
content = fd.read()
assert "Lorem ipsum dolor sit amet, consectetur adipiscing elit." in content
def test_data_categories():
event = metrics.Event(
type="event",
category="category",
name="metric",
bugs=[42],
notification_emails=["nobody@example.com"],
description="description...",
expires="never",
extra_keys={"my_extra": {"description": "an extra"}},
data_categories=["technical", "interaction"],
)
assert markdown.data_category_numbers(event.data_categories) == "1, 2"
assert markdown.data_category_numbers(None) == "unknown"

tools/extract_data_categories.py (new executable file, 173 lines)

@@ -0,0 +1,173 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""
Usage:
python extract_data_categories.py metrics.yaml
Automatically extract the data collection categories for all the metrics in a
metrics.yaml file by consulting the linked data reviews.
This script reads a metrics.yaml file, visits each of the linked data reviews,
attempts to determine the data categories that were approved, and inserts them
(in place) into the original metrics.yaml file.

A very simple heuristic is used: look for the standard question about data
categories that appears in every data review, then collect any numbers between
it and the next question (see the worked example after parse_data_review
below). When this simple heuristic fails, comments containing "!!!" are
inserted into the output as a recommendation to investigate manually and fill
in the data categories by hand.
Requirements from PyPI: BeautifulSoup4, PyYAML
"""
import dbm
import functools
import re
import sys
import time
from typing import List, Set
from urllib.request import urlopen
from bs4 import BeautifulSoup
import yaml
cache = dbm.open("bugzilla-cache.db", "c")  # "c": create the cache file if it doesn't exist
QUESTION = "what collection type of data do the requested measurements fall under?"
CATEGORY_MAP = {
1: "technical",
2: "interaction",
3: "web_activity",
4: "highly_sensitive",
}
def fetch_url(url: str) -> bytes:
"""
Fetch a web page containing a data review, caching it to avoid
over-fetching.
"""
content = cache.get(url)
if content is not None:
return content
print(f"Fetching {url}")
content = urlopen(url).read()
cache[url] = content
time.sleep(0.5)
return content
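
# Note: dbm stores keys and values as bytes. str keys such as the URL are
# encoded automatically, and cached pages come back as bytes, which
# BeautifulSoup accepts directly. The half-second sleep rate-limits requests
# to the review server.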
@functools.lru_cache(1000)
def parse_data_review(html: bytes) -> Set[int]:
"""
Parse a single data review.
"""
soup = BeautifulSoup(html, features="html.parser")
text = soup.get_text()
lines = iter(text.splitlines())
for line in lines:
if QUESTION in line.strip():
break
categories: Set[int] = set()
for line in lines:
if "?" in line:
break
categories.update(int(x) for x in re.findall("[0-9]+", line))
return categories
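
# A worked (hypothetical) example of the heuristic: for review text such as
#
#     ... what collection type of data do the requested measurements fall under?
#     Categories 1 and 2, technical and interaction data.
#     Is the data collection default-on or default-off? ...
#
# the first loop stops at the standard question, the second loop collects
# {1, 2} from the lines that follow, and the next question mark ends the scan.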
def categories_as_strings(categories: Set[int]) -> List[str]:
"""
From a set of numeric categories, return the strings used in a metrics.yaml
file. This may contain strings representing errors.
"""
if len(categories):
return [
CATEGORY_MAP.get(x, f"!!!UNKNOWN CATEGORY {x}")
for x in sorted(list(categories))
]
else:
return ["!!! NO DATA CATEGORIES FOUND"]
def update_lines(
lines: List[str], category_name: str, metric_name: str, data_categories: List[str]
) -> List[str]:
"""
Update the lines of a YAML file in place to include the data_categories for
the given metric, returning the lines of the result.
"""
output = []
lines_iter = iter(lines)
for line in lines_iter:
output.append(line)
if line.startswith(f"{category_name}:"):
break
for line in lines_iter:
output.append(line)
if line.startswith(f" {metric_name}:"):
break
for line in lines_iter:
output.append(line)
if line.startswith(f" data_reviews:"):
break
for line in lines_iter:
if not line.strip().startswith("- "):
output.append(" data_categories:\n")
for data_category in data_categories:
output.append(f" - {data_category}\n")
output.append(line)
break
else:
output.append(line)
for line in lines_iter:
output.append(line)
return output
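
# A worked (hypothetical) example: given the lines of a metrics.yaml such as
#
#     browser.usage:
#       open_count:
#         type: counter
#         data_reviews:
#           - https://example.com/review
#         notification_emails:
#           - nobody@example.com
#
# update_lines(lines, "browser.usage", "open_count", ["technical"]) returns
# the same lines with a data_categories block inserted directly after the
# data_reviews list:
#
#         data_reviews:
#           - https://example.com/review
#         data_categories:
#           - technical
#         notification_emails:
#           - nobody@example.com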
def parse_yaml(yamlpath: str):
    # Read the file twice: once parsed, to iterate over the metrics, and once
    # as raw lines, so the in-place edit preserves formatting and comments.
    with open(yamlpath) as fd:
content = yaml.safe_load(fd)
with open(yamlpath) as fd:
lines = list(fd.readlines())
for category_name, category in content.items():
if category_name.startswith("$") or category_name == "no_lint":
continue
for metric_name, metric in category.items():
categories = set()
for data_review_url in metric["data_reviews"]:
html = fetch_url(data_review_url)
categories.update(parse_data_review(html))
lines = update_lines(
lines, category_name, metric_name, categories_as_strings(categories)
)
with open(yamlpath, "w") as fd:
for line in lines:
fd.write(line)
if __name__ == "__main__":
parse_yaml(sys.argv[-1])